# Importing Libraries
## PorterStemmer
Stemming algorithm.

In [1]:
import os
import re
import json
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt

# Parser
returns dictionary named "documents". indexes are the document ids. 
documents[index] = {'title', 'abstract', 'date', 'authors'}
We use dictionary, because we can search and get document details with O(1) complexity.

In [23]:
def parser(myPath = "./cacm/cacm.all"):
    documents = {}
    my_file = open(myPath,'r',encoding='windows-1252')
    while True: 
        line = my_file.readline() 
        if not line: 
            break
        elif line.startswith('.I'):
            mode = 'i'
            index = line.split(' ')[-1:][0]
            index = index.replace('\n', '')
        elif line.startswith('.T'):
            mode = 't'
        elif line.startswith('.W'):
            mode = 'w'
        elif line.startswith('.B'):
            mode = 'b'
        elif line.startswith('.A'):
            mode = 'a'
        elif line.startswith('.'):
            mode = 'z'
        else:
            if mode=='t':
                documents[index] = {'title': line.replace('\n', ''), 'abstract': '', 'date': '', 'authors': ''}
            elif mode=='w':
                documents[index]['abstract'] += line.replace('\n', '')
            elif mode=='b':
                documents[index]['date'] = line.replace('\n', '')
            elif mode=='a':
                documents[index]['authors'] += line.replace('\n', '')
            elif mode=='z':
                continue
    my_file.close() 
    return documents

### Merge title and abstract parts into new "terms" value

In [25]:
for key in documents:
    documents[key]['terms'] = documents[key]['title'] + documents[key]['abstract']

# Preprocessing
## Tokenizing
## Removing stopwords
## Stemming

In [37]:
def preprocessing(documents):
    # Tokenizing using list comprehension and regular expressions 
    tokenizing = [re.findall('\w+',documents[documentId]["terms"]) for documentId in documents]
    token_length = 0
    for doc in tokenizing:
        token_length+=len(doc)
    print(f'Number of tokens before preprocessing: {token_length}')
    # Open stopwords file
    stop_words = open("./stopwords.txt",'r',encoding='windows-1252')
    stop_words = stop_words.read()
    stop_words = stop_words.split()
    stopWords_removed = []
    stems = []
    finished_dic = {} 
    # Remove stopwords using a list comprehension
    for doc in tokenizing:
        doc = [d for d in doc if d not in stop_words]
        stopWords_removed.append(doc)
    
    stopWord_length = 0
    for doc in stopWords_removed:
        stopWord_length += len(doc)
    print(f'Number of tokens after removing stop words: {stopWord_length}')
    # Using Porter Stemmer algorithm
    porter = PorterStemmer()
    for doc in stopWords_removed:
        doc = [porter.stem(s) for s in doc]
        stems.append(doc)
    
    stem_length = 0
    for doc in stems:
        stem_length+=len(doc)
    print(f'Number of tokens after stemming: {stem_length}')

    for documentId in documents:
        finished_dic[documentId] = {'terms': ''}
        finished_dic[documentId]['terms'] = ' '.join(stems[int(documentId)-1])

    return finished_dic

# Create inverted index
returns dictionary inverted_index.
#### Inverted_index:
word : {'doc_id' : documentId_list, 'token_id' : token_id, 'tf' : term_frequency}

In [87]:
def create_index(preprocessed):
    inverted_index={}
    token_id = 1
    for documentId, text in preprocessed.items():
        for word in text['terms'].lower().split():
            # If the term is in dictionary
            if inverted_index.get(word,False):
                # Add term frequency
                inverted_index[word]['tf']+=1
                if documentId not in inverted_index[word]['doc_id']:
                    inverted_index[word]['doc_id'].append(documentId)
            else:
                # If the term is not in inverted_index dictionary 
                inverted_index[word]={'doc_id':[documentId], 'token_id':token_id, 'tf':1}
                token_id+=1
                
    return inverted_index

# Parsing dataset

In [88]:
documents = parser(myPath)

In [89]:
for docId in documents:
    documents[docId]['terms'] = documents[docId]['title'] + documents[docId]['abstract'] 

# Information about tokens

In [90]:
preprocessed = preprocessing(documents)

Number of tokens before preprocessing: 161485
Number of tokens after removing stop words: 113749
Number of tokens after stemming: 113749


# Printing number of documents

In [91]:
doc_count = len(preprocessed)
print(f'Number of preprocessed documents: {doc_count}')

Number of preprocessed documents: 3204


In [92]:
inverted_index = create_index(preprocessed)
print(f'Number of words in inverted-index: {len(inverted_index)}')

Number of words in inverted-index: 13351


In [95]:
import time
durations = []
while True:
    term = input('please Enter the term: ')
    start_time = time.time()
    if term == 'ZZEND': 
        break
    print(f'Document frequency: {len(inverted_index[term]["doc_id"])}')
    for t in inverted_index[term]['doc_id']:
        print(f'Document id: {t}')
        print(f'Title: {documents[t]["title"]}')
        print(f'Term frequency: {len(re.findall(term,preprocessed[t]["terms"]))}')
        occurrences = [i.start() for i in re.finditer(term, preprocessed[t]['terms'])]
        print('occurrences: ', occurrences)
        doc = preprocessed[t]['terms']
        doc = doc.replace(term, term.upper())
        doc = doc.split(' ')
        start = doc.index(term.upper()) 
        summary = ''
        try:
            for i in range(start, start+8):
                summary += doc[i] + ' '
        except:
            pass
        print(f'Document summary: {summary}')
        durations.append(time.time()-start_time)
        print('Query execution time: ',time.time()-start_time)
sum = 0
for duration in durations:
    sum += duration
average_time = sum / len(durations)
print('Average query execution time: ', average_time)

please Enter the term: preliminari
Document frequency: 17
Document id: 1
Title: Preliminary Report-International Algebraic Language
Term frequency: 1
occurrences:  [0]
Document summary: PRELIMINARI report intern algebra languag 
Query execution time:  0.00016927719116210938
Document id: 254
Title: SMALGOL-61
Term frequency: 1
occurrences:  [178]
Document summary: PRELIMINARI report result At acm nation confer four 
Query execution time:  0.00028824806213378906
Document id: 825
Title: for the Analysis of Spark-Chamber Data*
Term frequency: 1
occurrences:  [228]
Document summary: PRELIMINARI interpret these photograph In continu oper processingr 
Query execution time:  0.0015037059783935547
Document id: 894
Title: An Iterative Factorization Technique for Polynomials
Term frequency: 1
occurrences:  [264]
Document summary: PRELIMINARI result indic well adaptedto use digit comput 
Query execution time:  0.001638650894165039
Document id: 1205
Title: An Undergraduate Program in Computer Scien