In [49]:
import nltk
import numpy as np
import tokenize
from sklearn import preprocessing as prep
from nltk.tokenize import *
from nltk.corpus import stopwords as st
from string import punctuation
import cleantext
import math
from nltk.stem import WordNetLemmatizer
# nltk.download('omw-1.4')

In [50]:
docs = []
posting_list = []     # [word, number of repitition, [docIDs]]
sw = 0                # a flag to know if the program should remove stopwords or not
lem = 0               # a flag to know if the program should lemmatize tokens or not

In [51]:
# define each node of linked list

class Node:
    
    def __init__(self, data, next=None): 
        self.data = data
        self.next = next

In [52]:
class LinkedList:
    
    def __init__(self):
        self.head = None


    # insertion method for the linked list
    def insert(self, data):
        newNode = Node(data)
        if(self.head):
            current = self.head
            while(current.next):
                current = current.next
            current.next = newNode
        else:
            self.head = newNode
    

    # search if list contains a specific value
    def search(self, value):
        current = self.head
        while(current):
            if current.data == value:
                return True
            current = current.next


    # print method for the linked list
    def print(self):
        current = self.head
        while(current):
            if current.next:
                print(str(current.data) + "->", end =" ")
                current = current.next
            else:
                print(current.data)
                current = current.next

In [53]:
# define a new class for documents

class Document:

    def __init__(self, ID) -> None:
        self.id = int(ID)                  # .I
        self.title = ""                    # .T
        self.abstract = ""                 # .W
        self.pub_date = 0                  # .B
        self.author = list()               # .A
        self.info = ""                     # .N
        self.ref = list()                  # .X

    def set_title(self, title):
        self.title = title
    
    def set_abstract(self, abstract):
        self.abstract += abstract

    def set_pubdate(self, pubdate):
        self.pubdate = int(pubdate)

    def set_author(self, author):
        self.author.append(author)

    def set_info(self, info):
        self.info += info

    def set_ref(self, ref):
        self.ref.append(ref)

    def get_id(self):
        return self.id

    def get_title(self):
        return self.title

    def get_abstract(self):
        return self.abstract

    def get_pubdate(self):
        return self.pub_date

    def get_author(self):
        return self.author

    def get_info(self):
        return self.info

    def get_ref(self):
        return self.ref
    

In [54]:
#read CISI.ALL file and fill lists

def read_from_file():

    # some flags for reading file properly
    t = 0
    a = 0
    w = 0
    x = 0
    n = 0
    b = 0
    
    with open("CISI.ALL", encoding = "latin-1") as all_files:
        current_doc = None
        for line in all_files.readlines():
            if line.startswith(".I"):
                t = 0
                a = 0
                w = 0
                x = 0
                n = 0
                b = 0
                id = int(line[3])
                current_doc = Document(id)
                docs.append(current_doc)
            elif line.startswith(".T"):
                t = 1
                a = 0
                w = 0
                x = 0
                n = 0
                b = 0
            elif line.startswith(".W"):
                t = 0
                a = 0
                w = 1
                x = 0
                n = 0
                b = 0
            elif line.startswith(".B"):
                t = 0
                a = 0
                w = 0
                x = 0
                n = 0
                b = 1
            elif line.startswith(".N"):
                t = 0
                a = 0
                w = 0
                x = 0
                n = 1
                b = 0
            elif line.startswith(".X"):
                t = 0
                a = 0
                w = 0
                x = 1
                n = 0
                b = 0
            elif line.startswith(".A"):
                t = 0
                a = 1
                w = 0
                x = 0
                n = 0
                b = 0
            elif t == 1:
                # title
                current_doc.set_title(cleantext.clean(line.lower(),normalize_whitespace=True))
                t = 0
            elif a == 1:
                # author
                current_doc.set_author(cleantext.clean(line.lower(),normalize_whitespace=True))
            elif b == 1:
                # publication date
                current_doc.set_pubdate(line)
            elif w == 1:
                # abstract
                current_doc.set_abstract(cleantext.clean(line.lower(),normalize_whitespace=True))
            elif x == 1:
                # references
                current_doc.set_ref(line)
            elif n == 1:
                # info
                current_doc.set_info(cleantext.clean(line.lower(),normalize_whitespace=True))
            else:
                continue

            
    all_files.close()


In [55]:
# building posting list

def preprocessing(): 
    tokens = []
    normalized = []
    stopwords = st.words("english")
    lemmatizer = WordNetLemmatizer()
    
    # tokenization
    for doc in docs:
        abstract = doc.get_abstract()
        tokenizer = TweetTokenizer()
        tokens = tokenizer.tokenize(abstract)
        
        # remove stopwords
        if sw == 1:
            for token in tokens:
                if (any(token in row for row in posting_list)):
                    for row in posting_list:
                        if token in row:
                            row_idx = posting_list.index(row)
                            linked_list = posting_list[row_idx][2]
                            if (not linked_list.search(int(doc.get_id()))):
                                linked_list.insert(int(doc.get_id()))
                            posting_list[row_idx][1] += 1
                            posting_list[row_idx][2] = linked_list
                            break

                # first appearance of a word
                elif ((token not in stopwords) & (token not in punctuation)):
                    # lemmatizing
                    if lem == 1:
                        token = lemmatizer.lemmatize(token)
                        
                    linked_list = LinkedList()
                    linked_list.insert(int(doc.get_id()))
                    posting_list.append([token, 1, linked_list])
                    
        elif sw == 0:
            for token in tokens:
                if (any(token in row for row in posting_list)):
                    for row in posting_list:
                        if token in row:
                            row_idx = posting_list.index(row)
                            linked_list = posting_list[row_idx][2]
                            if (not linked_list.search(int(doc.get_id()))):
                                linked_list.insert(int(doc.get_id()))
                            posting_list[row_idx][1] += 1
                            posting_list[row_idx][2] = linked_list
                            break

                # first appearance of a word
                else:
                    # lemmatizing
                    if lem == 1:
                        token = lemmatizer.lemmatize(token)
                        
                    linked_list = LinkedList()
                    linked_list.insert(int(doc.get_id()))
                    posting_list.append([token, 1, linked_list])
                    


In [56]:
# a method for printing posting list

def print_posting_list():
    for row in posting_list:
        linked_list = row[2]
        print("token: " + str(row[0]) + " " + ", repetition: " + str(row[1]))
        linked_list.print()
        

In [57]:
# TF

def tf(posting_list):
    row = 0
    tf = []
    
    for element in posting_list:
        if row <= 10:
            tf.append(element[1]/len(posting_list))
            row += 1
        else:
            break
        
    return tf


In [58]:
# TF*IDF

def tf_idf(posting_list):
    row = 0
    tf_idf = []
    
    for element in posting_list:
        if row <= 10:
            tf_idf.append((element[1]/len(posting_list) * math.log10(len(posting_list)/element[1])))
            row += 1
        else:
            break
        
    return tf_idf


In [59]:
# test
read_from_file()
sw = 0
lem = 1
preprocessing()
tf = tf(posting_list)
tf_idf = tf_idf(posting_list)
print(tf)
print(tf_idf)
# print_posting_list()

[0.2, 0.002527646129541864, 0.005845181674565561, 0.04439178515007899, 0.06682464454976303, 0.0007898894154818325, 0.16856240126382308, 0.000315955766192733, 0.0001579778830963665, 0.0736176935229068, 0.002843601895734597]
[0.13979400086720378, 0.006565014160787185, 0.013053471323880463, 0.0600486519149367, 0.07852319019552895, 0.0024505795463517664, 0.1303390715738718, 0.001105963258879423, 0.0006005377109032156, 0.08341015666517336, 0.007240183521082604]
