## 2.Unigram Inverted Index 

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RaKaN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RaKaN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os
import string
import copy
import pickle

In [3]:
title = "20_newsgroups"
os.chdir("D:/20_newsgroups")
paths= []
for (dirpath, dirnames, filenames) in os.walk(str(os.getcwd())+'/'+title+'/'): 
    for i in filenames: 
        paths.append(str(dirpath)+str("\\")+i)
print(dirpath)
paths[0]

D:\20_newsgroups/20_newsgroups/alt.atheism


'D:\\20_newsgroups/20_newsgroups/alt.atheism\\49960'

# 2.1 Preprocessing 

## Removing stop words

In [4]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return np.char.strip(new_text)

##  Removing punctuation 

In [5]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], '')
        data = np.char.replace(data, " ", " ")
    data = np.char.replace(data, ',', '')
    return data

## Convert to lowercase 

In [6]:
def convert_lower_case(data):
    return np.char.lower(data) 

## Stemming

In [7]:
def stemming(data):
    stemmer= PorterStemmer()
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return np.char.strip(new_text)

## Converting numbers to its equivalent words

In [8]:
def convert_numbers(data):
    data = np.char.replace(data, "0", " zero ")
    data = np.char.replace(data, "1", " one ")
    data = np.char.replace(data, "2", " two ")
    data = np.char.replace(data, "3", " three ")
    data = np.char.replace(data, "4", " four ")
    data = np.char.replace(data, "5", " five ")
    data = np.char.replace(data, "6", " six ")
    data = np.char.replace(data, "7", " seven ")
    data = np.char.replace(data, "8", " eight ")
    data = np.char.replace(data, "9", " nine ")
    return data

## Removing header 

In [9]:
def remove_header(data):
    try:
        ind = data.index('\n\n')
        data = data[ind:]
    except:
        print("No Header")
    return data

## Removing apostrophe

In [10]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

## Removing single characters

In [11]:
def remove_single_characters(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return np.char.strip(new_text)

# Exercise 1

In [12]:
def preprocess(data, query):
    if not query:
        data = remove_header(data) 
        data = convert_lower_case(data)
        data = convert_numbers(data)
        data = remove_punctuation(data)
        data = remove_stop_words(data)
        data = remove_apostrophe(data)
        data = remove_single_characters(data)
        data = stemming(data) 
    return data

## Generating Postings for Unigram inverted index 

In [13]:
doc = 0
postings = pd.DataFrame()

for path in paths:
    file = open(path, 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    preprocessed_text = preprocess(text, False)
    
    if doc%100 == 0:
        print(doc)

    tokens = word_tokenize(str(preprocessed_text))
    for token in tokens:
        if token in postings:
            p = postings[token][0]
            p.add(doc)
            postings[token][0] = p
        else:
            postings.insert(value=[{doc}], loc=0, column=token)
    doc += 1 

0


In [15]:
postings.to_pickle(title + "_unigram_postings") 

In [16]:
postings

Unnamed: 0,imight,exam,ofit,acompil,side,uneven,qualityi,inpubsocreligionchristianotherscontradict,ftprutgersedu,mcdowelldjp,...,atheist,zero,version,two,nine,decemb,one,resourceslastmodifi,atheismresourcesaltatheismarchivenam,archivenam
0,{21},{21},{21},{21},{21},{21},{21},{21},{21},{21},...,"{0, 1, 5}","{0, 1, 2, 4, 5, 6, 10, 15, 17, 18, 19, 20, 21}","{0, 1, 2}","{0, 1, 2, 4, 5, 7, 17, 18, 19, 20}","{0, 1, 2, 4, 5, 17, 18, 19}",{0},"{0, 1, 2, 3, 4, 5, 7, 13, 14, 16, 17, 18, 19, ...",{0},{0},"{0, 1}"


## Search Query for Unigram Index 

#### Read first the stored posting list

In [31]:
postings = pd.read_pickle(title + "_unigram_postings") 


# Exercise 2

In [52]:
print("Word is ","one")
print("Posting list :",postings["one"][0])
print("Frequancy :",len(postings["one"][0]))

Word is  one
Posting list : {0, 1, 2, 3, 4, 5, 7, 13, 14, 16, 17, 18, 19, 20, 21}
Frequancy : 15


In [51]:
print("Word is ","nine")
print("Posting list :",postings["nine"][0])
print("Frequancy :",len(postings["nine"][0]))

Word is  nine
Posting list : {0, 1, 2, 4, 5, 17, 18, 19}
Frequancy : 8


In [149]:
print("Word is ","exam")
print("Posting list :",postings["exam"][0])
print("Frequancy :",len(postings["exam"][0]))

Word is  exam
Posting list : {21}
Frequancy : 1


In [175]:
def get_word_postings(word):
    preprocesses_word = str(preprocess(word, True))
    print(preprocesses_word)
    print("Frequency:",len(postings[preprocesses_word][0]))
    print("Postings List:",(postings[preprocesses_word][0]))
get_word_postings("nine")

nine
Frequency: 8
Postings List: {0, 1, 2, 4, 5, 17, 18, 19}


In [176]:
def get_not(word):
    a = postings[word][0]
    b = set(range(len(paths)))
    return b.difference(a)
get_not("nine")

{3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}

In [205]:
def generate_command_tokens(query):
    query = query.lower()
    tokens = word_tokenize(query)
    
    commands = []
    query_words = []
    
    for t in tokens:
        if t not in ['and','or','not']:
            processed_word = preprocess([t], True)
            print(str(processed_word))
            query_words.append(str(processed_word))
        else:
            commands.append(t)
            
    return commands,query_words

In [206]:
def gen_not_tuple(query_words, commands):
    tup=[]
    while 'not' in commands:
        i = commands.index('not')
        word = query_words[i]
        word_postings - get_not(word)
        tup.append(word_postings)
        commands.pop(i)
        query_words[i] = i
        print("\nAfter Not Processing : ",commands, query_words)
    return tup
def binary_operations(query_words , commands , tup):
    a = postings[query_words[0]][0]
    query_words.pop(0)
    
    for i in range(len(commands)):
        if type (query_words[i])==int:
            b = tup.pop(0)
        else:
            b = postings[query_words[i]][0]
            
            if commands[i] == 'and':
                a = a.intersection(b)
            elif commands[i] == 'or':
                 a = a.union(b)
            else:
                print("Invalid Command")
    return a



In [207]:
def execute_query(query):
    
    commands, query_words = generate_command_tokens(query)
    tup = gen_not_tuple(query_words, commands)
    
    print("\nCommands :",commands)
    print("\nQuery Words :",query_words)
    print("\nTup :",len(tup))
    
    final_set = binary_operations(query_words , commands , tup)
    
    print("\nFinal Set :",final_set)
    return final_set

def print_file(file):
    out_file = open(paths[file], 'r', encoding = 'cp1250')
    out_text = out_file.read()
    print(out_text)

In [208]:
query = "exam and nine"

In [209]:
lists = exeucute_query(query)

['exam']
['nine']

Commands : ['and']

Query words : ["['exam']", "['nine']"]

tup : 0


KeyError: "['exam']"