In [237]:
import nltk
import numpy as np
import tokenize
from sklearn import preprocessing as prep
from nltk.tokenize import *
from nltk.corpus import stopwords as st
from string import punctuation
import cleantext
import math
from nltk.stem import WordNetLemmatizer
# nltk.download('omw-1.4')

In [286]:
all_docs = []         # .all file docs
posting_list = []     # [word, number of repitition, [docIDs]]
query_docs = []       # .qry file docs
query_list = []       # [word, number of repitition, [docIDs]]
sw = 1                # a flag to know if the program should remove stopwords or not
lem = 1               # a flag to know if the program should lemmatize tokens or not
norm = 1              # a flag to know if the program should normalize td-idf or not

In [239]:
# define each node of linked list

class Node:
    
    def __init__(self, data, next=None): 
        self.data = data
        self.next = next

In [240]:
class LinkedList:
    
    def __init__(self):
        self.head = None


    # insertion method for the linked list
    def insert(self, data):
        newNode = Node(data)
        if(self.head):
            current = self.head
            while(current.next):
                current = current.next
            current.next = newNode
        else:
            self.head = newNode
    

    # search if list contains a specific value
    def search(self, value):
        current = self.head
        while(current):
            if current.data == value:
                return True
            current = current.next


    # count the occurance of an element in list
    def count(self, value):
        current = self.head
        count = 0
        while (current):
            if current.data == value:
                count += 1
            current = current.next
 
        return count

    # print method for the linked list
    def print(self):
        current = self.head
        while(current):
            if current.next:
                print(str(current.data) + "->", end =" ")
                current = current.next
            else:
                print(current.data)
                current = current.next

In [241]:
# define a new class for documents

class Document:

    def __init__(self, ID) -> None:
        self.id = int(ID)                  # .I
        self.title = ""                    # .T
        self.abstract = ""                 # .W
        self.pub_date = 0                  # .B
        self.author = list()               # .A
        self.info = ""                     # .N
        self.ref = list()                  # .X
        self.termCount = 0                 # total number of terms in doc
        
    def set_term_count(self, ct):
        self.termCount = ct

    def set_title(self, title):
        self.title = title
    
    def set_abstract(self, abstract):
        self.abstract += abstract

    def set_pubdate(self, pubdate):
        self.pubdate = int(pubdate)

    def set_author(self, author):
        self.author.append(author)

    def set_info(self, info):
        self.info += info

    def set_ref(self, ref):
        self.ref.append(ref)

    def get_id(self):
        return self.id

    def get_title(self):
        return self.title

    def get_abstract(self):
        return self.abstract

    def get_pubdate(self):
        return self.pub_date

    def get_author(self):
        return self.author

    def get_info(self):
        return self.info

    def get_ref(self):
        return self.ref
    
    def set_term_count(self):
        return self.termCount

    

In [242]:
#read CISI.ALL file and fill lists

def read_from_file(name):
    docs = []
    # some flags for reading file properly
    t = 0
    a = 0
    w = 0
    x = 0
    n = 0
    b = 0
    
    with open(name, encoding = "latin-1") as all_files:
        current_doc = None
        for line in all_files.readlines():
            if line.startswith(".I") & (len(line) < 7):
                t = 0
                a = 0
                w = 0
                x = 0
                n = 0
                b = 0
                id = int(line[3])
                current_doc = Document(id)
                docs.append(current_doc)
            elif line.startswith(".T"):
                t = 1
                a = 0
                w = 0
                x = 0
                n = 0
                b = 0
            elif line.startswith(".W"):
                t = 0
                a = 0
                w = 1
                x = 0
                n = 0
                b = 0
            elif line.startswith(".B") & (len(line) < 3):
                t = 0
                a = 0
                w = 0
                x = 0
                n = 0
                b = 1
            elif line.startswith(".N"):
                t = 0
                a = 0
                w = 0
                x = 0
                n = 1
                b = 0
            elif line.startswith(".X"):
                t = 0
                a = 0
                w = 0
                x = 1
                n = 0
                b = 0
            elif line.startswith(".A"):
                t = 0
                a = 1
                w = 0
                x = 0
                n = 0
                b = 0
            elif t == 1:
                # title
                current_doc.set_title(cleantext.clean(line.lower(),normalize_whitespace=True))
                t = 0
            elif a == 1:
                # author
                current_doc.set_author(cleantext.clean(line.lower(),normalize_whitespace=True))
            elif b == 1:
                # publication date
                current_doc.set_pubdate(line)
            elif w == 1:
                # abstract
                current_doc.set_abstract(cleantext.clean(line.lower(),normalize_whitespace=True))
            elif x == 1:
                # references
                current_doc.set_ref(line)
            elif n == 1:
                # info
                current_doc.set_info(cleantext.clean(line.lower(),normalize_whitespace=True))
            else:
                continue

            
    all_files.close()
    
    return docs


In [271]:
# building posting list

def preprocessing(docs): 
    tokens = []
    normalized = []
    stopwords = st.words("english")
    lemmatizer = WordNetLemmatizer()
    
    # tokenization
    for doc in docs:
        abstract = doc.get_abstract()
        tokenizer = TweetTokenizer()
        tokens = tokenizer.tokenize(abstract)
        
        # remove stopwords
        if sw == 1:
            for token in tokens:
                if (any(token in row for row in posting_list)):
                    for row in posting_list:
                        if token in row:
                            row_idx = posting_list.index(row)
                            linked_list = posting_list[row_idx][2]
                            # if (not linked_list.search(int(doc.get_id()))):
                            linked_list.insert(int(doc.get_id()))
                            posting_list[row_idx][1] += 1
                            posting_list[row_idx][2] = linked_list
                            break

                # first appearance of a word
                elif ((token not in stopwords) & (token not in punctuation)):
                    # lemmatizing
                    if lem == 1:
                        token = lemmatizer.lemmatize(token)
                        
                    linked_list = LinkedList()
                    linked_list.insert(int(doc.get_id()))
                    posting_list.append([token, 1, linked_list])
                    
        elif sw == 0:
            for token in tokens:
                if (any(token in row for row in posting_list)):
                    for row in posting_list:
                        if token in row:
                            row_idx = posting_list.index(row)
                            linked_list = posting_list[row_idx][2]
                            # if (not linked_list.search(int(doc.get_id()))):
                            linked_list.insert(int(doc.get_id()))
                            posting_list[row_idx][1] += 1
                            posting_list[row_idx][2] = linked_list
                            break

                # first appearance of a word
                else:
                    # lemmatizing
                    if lem == 1:
                        token = lemmatizer.lemmatize(token)
                        
                    linked_list = LinkedList()
                    linked_list.insert(int(doc.get_id()))
                    posting_list.append([token, 1, linked_list])
                    
                    
    return posting_list
                    


In [244]:
# a method for printing posting list

def print_posting_list(posting_list):
    for row in posting_list:
        linked_list = row[2]
        print("token: " + str(row[0]) + " " + ", repetition: " + str(row[1]))
        linked_list.print()
        

In [245]:
# TF

def tf(docs, posting_list):
    tf = []         #[token, docID, tf]
    
    for element in posting_list:
        for id in len(docs):
            tf.append([element, id, (posting_list[2].count(id)/)])
            row += 1
        
    return tf


In [246]:
# TF*IDF

def tf_idf(posting_list):
    row = 0
    tf_idf = []
    
    for element in posting_list:
        if row <= 10:
            tf_idf.append((element[1]/len(posting_list) * math.log10(len(posting_list)/element[1])))
            row += 1
        else:
            break
    if norm == 1:
        tf_idf = np.array(tf_idf)
        tf_idf = prep.normalize([tf_idf])
        
    return tf_idf


In [287]:
# .all
all_docs = read_from_file("CISI.ALL")
posting_list = preprocessing(all_docs)
# print_posting_list(posting_list)
all_tf = tf(posting_list)
print("all docs tf")
print(all_tf)
all_tf_idf = tf_idf(posting_list)
print("all docs tf-idf")
print(all_tf_idf)


# .qry
query_docs = read_from_file("CISI.QRY")
query_list = preprocessing(query_docs)
# # print_posting_list(query_list)
# query_tf = tf(query_list)
# print("query docs tf")
# print(query_tf)
# query_tf_idf = tf_idf(query_list)
# print("query docs tf-idf")
# print(query_tf_idf)

all docs tf
[0.005068834168141328, 0.008458054647431091, 0.0014396688761584835, 0.000419903422212891, 0.00014996550793317537, 0.005278785879247773, 0.0009297861491856873, 0.00014996550793317537, 5.998620317327015e-05, 0.00017995860951981044, 2.9993101586635075e-05]
all docs tf-idf
[[0.4690657  0.7068685  0.1649575  0.05717245 0.02312255 0.48474289
  0.11365352 0.02312255 0.01021151 0.02717252 0.0054698 ]]
