In [1]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import string
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def preprocess_document(doc_content, num):
    training_processed_tokens = []
    testing_processed_tokens = []
    iterations = int(num * 0.8)
    count = 0
    training_doc = ""
    testing_doc = ""

    for line in doc_content:
        if count < iterations:
            count += 1
            training_doc += " " + str(line)
            # Normalization
            normalized_text = line.lower()

            # Case Folding
            case_folded_text = normalized_text.casefold()

            # Regular expression to remove non-alphanumeric characters from words
            case_folded_text = re.sub(r'[^a-zA-Z0-9\s]', '', case_folded_text)

            # Tokenization
            tokens = word_tokenize(case_folded_text)

            # Remove punctuation
            tokens = [token for token in tokens if token not in string.punctuation]

            # Remove stop words
            stop_words = set(stopwords.words('english'))
            tokens = [token for token in tokens if token not in stop_words]

            # Add the processed tokens for this line to the result
            training_processed_tokens.extend(tokens)

        else:
            count += 1
            testing_doc += " " + str(line)
            # Normalization
            normalized_text = line.lower()

            # Case Folding
            case_folded_text = normalized_text.casefold()

            # Regular expression to remove non-alphanumeric characters from words
            case_folded_text = re.sub(r'[^a-zA-Z0-9\s]', '', case_folded_text)

            # Tokenization
            tokens = word_tokenize(case_folded_text)

            # Remove punctuation
            tokens = [token for token in tokens if token not in string.punctuation]

            # Remove stop words
            stop_words = set(stopwords.words('english'))
            tokens = [token for token in tokens if token not in stop_words]

            # Add the processed tokens for this line to the result
            testing_processed_tokens.extend(tokens)

    return training_processed_tokens, testing_processed_tokens, training_doc, testing_doc

# Function to perform lemmatization on a list of tokens
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

# Function to perform stemming on a list of tokens
def stem_tokens(tokens):
    porter_stemmer = PorterStemmer()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return stemmed_tokens

In [3]:
# Set the path to the parent directory containing the 20 folders
parent_dir = 'D:\\Study\\2nd Semester\\Information Retrieval\\Assignments\\Assignment 1\\Dataset'
# Initialize an empty list to store the DataFrames
dfs_training = []
dfs_testing = []
training_tokenized_words = []
testing_tokenized_words = []
folders = os.listdir(parent_dir)
# Iterate over the 20 folders
for folder in folders:
    dfs_training_local = []
    dfs_testing_local = []
    folder_path = os.path.join(parent_dir, folder)
    subfolder = os.listdir(folder_path)

    # Iterate over the 1,000 files in the current folder
    for filename in subfolder:
        file_path = os.path.join(folder_path, filename)

        # Read the file line by line and count the lines
        with open(file_path, 'r', encoding='latin1') as file:
            lines = [line.strip() for line in file.readlines()]  # Read and strip each line, store in a list
            num_lines = len(lines)  # Get the number of lines in the file
            
            # Preprocess the document
            training_preprocessed_tokens, testing_preprocessed_tokens, training_doc, testing_doc = preprocess_document(lines, num_lines)
    
            # Lemmatization
            training_lemmatized_tokens = lemmatize_tokens(training_preprocessed_tokens)
            testing_lemmatized_tokens = lemmatize_tokens(testing_preprocessed_tokens)
    
            # Stemming
            training_stemmed_tokens = stem_tokens(training_lemmatized_tokens)
            testing_stemmed_tokens = stem_tokens(testing_lemmatized_tokens)
    
            training_tokenized_words += training_stemmed_tokens
            testing_tokenized_words += testing_stemmed_tokens

            dfs_training_local.append(training_doc)
            dfs_testing_local.append(testing_doc)
            
    dfs_training.append(dfs_training_local)
    dfs_testing.append(dfs_testing_local)

In [4]:
training_tokenized_words = set(training_tokenized_words)
training_tokenized_words = sorted(training_tokenized_words)
len_training_tokenized_words = len(training_tokenized_words)
print(len_training_tokenized_words)

73463


In [5]:
testing_tokenized_words = set(testing_tokenized_words)
testing_tokenized_words = sorted(testing_tokenized_words)
len_testing_tokenized_words = len(testing_tokenized_words)
print(len_testing_tokenized_words)

24378


In [6]:
training_tokenized_words

['0',
 '00',
 '000',
 '0000',
 '00000',
 '000001',
 '000003',
 '000005102000',
 '00000f',
 '000029',
 '000055',
 '000101',
 '000104',
 '000106',
 '000108',
 '000109',
 '000122',
 '000129',
 '000138',
 '000148',
 '0001someth',
 '0001xxxx',
 '0002',
 '000211',
 '00021147',
 '000250',
 '0003',
 '000304',
 '000343',
 '000348',
 '000508',
 '000531',
 '000546',
 '000551',
 '000552',
 '0006',
 '000600',
 '000635',
 '000652',
 '000657',
 '000750',
 '000758',
 '000801',
 '000836',
 '000845',
 '000853',
 '000856',
 '000901',
 '000922',
 '000958',
 '001',
 '001041',
 '001044',
 '001047',
 '0010580b0b6r49diablouucp',
 '0010580bvm9ve7diablouucp',
 '0010580bvma7o9diablouucp',
 '0010580bvmcbrtdiablouucp',
 '0010580bvmede1diablouucp',
 '001124',
 '001127',
 '001143',
 '001149',
 '001201',
 '001212',
 '001224',
 '001351',
 '001401',
 '001442',
 '001451',
 '001456',
 '001631',
 '001639',
 '001837',
 '001852',
 '001857',
 '001908',
 '001926',
 '001934',
 '001945',
 '00196',
 '002',
 '002005',
 '002023',


In [7]:
testing_tokenized_words

['0',
 '00',
 '0000',
 '000100255pixel',
 '001',
 '001200201pixel',
 '002',
 '002202346',
 '003923283896',
 '004',
 '006',
 '009',
 '00d6p0d0d0el',
 '00yfbu',
 '01',
 '01003',
 '01075',
 '012593',
 '013',
 '0139',
 '014',
 '014510667',
 '0150',
 '015605',
 '015mb',
 '016092280',
 '016285716',
 '016285799',
 '01701',
 '017010405',
 '01742',
 '018035297',
 '01821',
 '0184',
 '01845',
 '01854',
 '018673238',
 '01890',
 '01obs0n',
 '01ud79a6h',
 '02',
 '020',
 '0200',
 '0201502550',
 '0201503000',
 '0201570254',
 '021387510',
 '02139',
 '02142',
 '02154',
 '022293',
 '0224',
 '0235',
 '02357',
 '023816',
 '025',
 '02543',
 '0293',
 '02l0z81m4klqrfde9',
 '03',
 '030',
 '03016mhz',
 '030200',
 '03025mhz',
 '0303',
 '030542934',
 '030600',
 '031',
 '0316679764',
 '033474244',
 '0353',
 '0382',
 '04',
 '040',
 '04025mhz',
 '040593',
 '040mb',
 '041',
 '0410',
 '041293',
 '0423231111',
 '0423277742',
 '042553411',
 '0442230000',
 '0454616616',
 '045u7',
 '0464',
 '047021340',
 '047048978',
 '04

In [8]:
Incident_Matrix = []
for word in training_tokenized_words:
    Incident_Matrix_Local = []
    for docs in dfs_training:
        for words in docs:
            if word in words:
                Incident_Matrix_Local.append(1)
            else:
                Incident_Matrix_Local.append(0)
    Incident_Matrix.append(Incident_Matrix_Local)

In [9]:
pd.DataFrame(Incident_Matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,0,0,0,0,0,0,1,...,0,1,1,1,0,1,0,0,1,1
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
Inverted_Index_Matrix = []
lnt = len(Incident_Matrix[0])
for i in range(len_training_tokenized_words):
    lcl=[]
    word_dict={}
    for j in range(lnt):
        if Incident_Matrix[i][j]==1:
            lcl.append(j)
    word_dict[training_tokenized_words[i]]=lcl
    Inverted_Index_Matrix.append(word_dict)

In [11]:
Inverted_Index_Matrix

[{'0': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   75,
   76,
   77,
   78,
   79,
   80,
   81,
   82,
   83,
   84,
   85,
   86,
   87,
   88,
   89,
   90,
   91,
   92,
   93,
   94,
   95,
   96,
   97,
   98,
   99,
   100,
   101,
   102,
   103,
   104,
   105,
   106,
   107,
   108,
   109,
   110,
   111,
   112,
   113,
   114,
   115,
   116,
   117,
   118,
   119,
   120,
   121,
   122,
   123,
   124,
   125,
   126,
   127,
   128,
   129,
   130,
   131,
   132,
   133,
   134,
   135,
   136,
   137,
   138,


In [12]:
query_split = len_testing_tokenized_words//10
for i in range(10):
    query = testing_tokenized_words[query_split*i:query_split*(i+1):13]
    for word in query:
        for item in Inverted_Index_Matrix:
            if word in item.keys():
                print(item)
                break

{'0': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 22