In [1]:
import os
import nltk
import numpy as np
import pandas as pd
import matplotlib as plt
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#Download nltk stuff
nltk.download('punkt')
nltk.download('stopwords')

#Set up stop words
stop_words = set(stopwords.words('english'))

preprocessed_directory = 'preprocessed_data'
all_words = set()
# list of all file names
filenames = []

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rida\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rida\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Loop through each file in the directory

for filename in os.listdir('data'):
    #Constructs file path for a specific file in data folder
    file_path = os.path.join('data',filename)
    filenames.append(filename)
    print(file_path)
    
    # Had errors reading certain files, so try different encodings
    try:
        #Use utf-8 encoding
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except UnicodeDecodeError:
        #Try with a different encoding
        with open(file_path, 'r', encoding='latin-1') as file:
            content = file.read()
        
    #Convert to lowercase
    content_lower = content.lower()
    #Create tokens
    tokens = word_tokenize(content_lower)
    #Remove stop words 
    filtered_tokens = [word for word in tokens if word not in stop_words]
    #Remove punctuation 
    processed_tokens = [re.sub(r'[\W_]+', '', word) for word in tokens if word]  # Remove punctuation
    #Remove singly occurring characters like 'm' or 'a'
    processed_tokens = [word for word in processed_tokens if len(word) > 1]

    #Add processed tokens to the set
    all_words.update(processed_tokens)
    processed_text = ' '.join(processed_tokens)

    #Gets the file path to write the processed data
    preprocessed_file_path = os.path.join(preprocessed_directory, filename)

    #Write processed text to preprocessed_data
    with open(preprocessed_file_path, 'w', encoding='utf-8') as file:
        file.write(processed_text)


data\100west.txt
data\13chil.txt
data\3gables.txt
data\3lpigs.txt
data\3student.txt
data\3wishes.txt
data\4moons.txt
data\5orange.txt
data\6ablemen.txt
data\6napolen.txt
data\7oldsamr.txt
data\7voysinb.txt
data\ab40thv.txt
data\abbey.txt
data\abyss.txt
data\adler.txt
data\advsayed.txt
data\advtthum.txt
data\adv_alad.txt
data\aesop11.txt
data\aesopa10.txt
data\aircon.txt
data\aislesix.txt
data\alad10.txt
data\alissadl.txt
data\aminegg.txt
data\angry_ca.txt
data\antcrick.txt
data\aquith.txt
data\arctic.txt
data\assorted.txt
data\bagelman.txt
data\batlslau.txt
data\beautbst.txt
data\beggars.txt
data\berternie.txt
data\bgb.txt
data\bgcspoof.txt
data\bishop00.txt
data\blabnove.txt
data\blackp.txt
data\blh.txt
data\blind.txt
data\bluebrd.txt
data\bruce-p.txt
data\buggy.txt
data\buldetal.txt
data\buldream.txt
data\bulfelis.txt
data\bulhuntr.txt
data\bulironb.txt
data\bullove.txt
data\bulmrx.txt
data\bulnland.txt
data\bulnoopt.txt
data\bulolli1.txt
data\bulolli2.txt
data\bulphrek.txt
data\bulp

In [5]:
print(f"Total unique words: {len(all_words)}")
print("Question 1 Completed")


Total unique words: 44108
Question 1 Completed


In [2]:

#initialize the positional index as a dictionary
positional_index = {}
term_count = {}
term_total_max = {}
"""{doc : [# of terms, term with max occ.]}
    count all times each word appears within doc
    temp = {word : count}
    term_count = {doc : {word : count}}
"""

#Load all words from preprocessed files and build the positional index:
for filename in os.listdir(preprocessed_directory):
    file_path = os.path.join(preprocessed_directory,filename)
    temp = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        words = content.split()
        
        #iterate through each word and its index:
        for index, word in enumerate(words):
            if word not in positional_index:
                positional_index[word] = {}
            if filename not in positional_index[word]:
                positional_index[word][filename] = []
                temp[word] = 0
            positional_index[word][filename].append(index)
            temp[word] += 1
            
        term_count[filename] = temp.copy()
        m = max(temp, key=temp.get)
        term_total_max[filename] = [len(words), temp[m], m]
#position that the word occurs in is relative to the list of words and not the number of characters. So if document contains "This is", "This" is at position 0 and "is" at position 1
            
            
# Write positional index to a file:
with open('positional_index.txt', 'w') as file:
    for word, documents in positional_index.items():
        file.write(f"{word}: ")
        entries = []
        for doc, positions in documents.items():
            positions_str = ', '.join(map(str, positions))  # Convert list of positions to string
            entries.append(f"{doc} [{positions_str}]")
        document_positions = '; '.join(entries)  # Join all document entries with semicolon
        file.write(f"{document_positions}\n")
        file.write("\n")

print("Positional index complete")

Positional index complete


In [6]:
while True:
    phrase = input("Please enter a phrase:   ")
    phrase = phrase.lower()

    words = phrase.split()
    if len(words) > 5:
        print("Query length must be less than 5.")
    else:
        break



#Dictionary to store the combined document positions for the phrase.
phrase_positions = {}


#collect positions for each word in the phrase
for word in words:
    if word in positional_index:
        for document, positions in positional_index[word].items():
            if document not in phrase_positions:
                phrase_positions[document] = []
            phrase_positions[document].extend(positions)
            
#sorting the positions in each document
for doc in phrase_positions:
    phrase_positions[doc].sort()

#print:
#for doc, positions in phrase_positions.items():
    #print(f"{doc}: {positions}")
    
results = {}

# Check for sequences of consecutive numbers matching the phrase length
for doc, positions in phrase_positions.items():
    if len(positions) < len(words):
        continue  # Skip if there aren't enough positions

    # Search for consecutive positions
    for i in range(len(positions) - len(words) + 1):
        # Check if the next positions are consecutive
        if all(positions[i + j] == positions[i] + j for j in range(len(words))):
            if doc not in results:
                results[doc] = []
            results[doc].extend(positions[i:i+len(words)])  # Extend flat list

# Output the results:
for doc, pos_list in results.items():
    print(f"{doc}: {sorted(set(pos_list))}")  # Remove duplicates and sort
    print("\n")

In [50]:
##################################################################
# TF-IDF MATRIX
"""
Term Frequency
Inverted Document Frequency

Matrix = columns[[row],[row],[row]] <-- way to reduce storage size?


same dataset as Q1
tf_dict = {'this' : {'100west.txt' : count, ...}}
for each word in dictionary:
    for each document in dictionary:
        dic = {}
        
        
        
"""
import math
DOC_TOTAL = 249

tc = {}
#doc_count = {}
idf = {}

for word in positional_index:
    d = {}
    for doc in positional_index[word]:
        d.update({doc : len(positional_index[word][doc])})
        
    tc.update({word : d.copy()})
    #doc_count.update({word : len(positional_index[word])})
    idf[word] = math.log(DOC_TOTAL / (len(positional_index[word])+1))
    

In [54]:
#idf = {}
#idf['is'] = math.log(DOC_TOTAL/(doc_count['is'] + 1))

In [47]:
"""
final solution:
TF-IDF dataframe for each TF scheme (5 dataframes)

need:
total number of terms in each document: dict {doc : [# of terms, term with max occ]}
term with most amount of occurence in each document: -------------->^^^^^

create query vector => populate vector = vocab length with random values

"""

matrix_bin = pd.DataFrame(float(0), index=list(idf.keys()),columns=list(term_count.keys()))
matrix_rc = pd.DataFrame(float(0), index=list(idf.keys()),columns=list(term_count.keys()))
matrix_tf = pd.DataFrame(float(0), index=list(idf.keys()),columns=list(term_count.keys()))
matrix_ln = pd.DataFrame(float(0), index=list(idf.keys()),columns=list(term_count.keys()))
matrix_dn = pd.DataFrame(float(0), index=list(idf.keys()),columns=list(term_count.keys()))

In [None]:
term_count ## {doc : {term : count}} <-- may change to {term : {doc : count}}
term_total_max ## {doc : [length, max term count, max term]}
idf ## {term : score}


In [41]:
for doc, values in term_count.items():
    for term, count in values.items():
        matrix_bin.loc[term,doc] = idf[term]
        matrix_rc.loc[term,doc] = count*idf[term]
        matrix_tf.loc[term,doc] = (count/(term_total_max[doc][0]))*idf[term]
        matrix_ln.loc[term,doc] = math.log(1+count)*idf[term]
        matrix_dn.loc[term,doc] = 0.5 + (0.5* (count/term_total_max[doc][1]))*idf[term]

In [51]:
for term, values in tc.items():
    for doc, count in values.items():
        matrix_bin.loc[term,doc] = idf[term]
        matrix_rc.loc[term,doc] = count*idf[term]
        matrix_tf.loc[term,doc] = (count/(term_total_max[doc][0]))*idf[term]
        matrix_ln.loc[term,doc] = math.log(1+count)*idf[term]
        matrix_dn.loc[term,doc] = 0.5 + (0.5* (count/term_total_max[doc][1]))*idf[term]

In [46]:
matrix_dn

Unnamed: 0,100west.txt,13chil.txt,3gables.txt,3lpigs.txt,3student.txt,3wishes.txt,4moons.txt,5orange.txt,6ablemen.txt,6napolen.txt,...,vgilante.txt,weaver.txt,weeprncs.txt,wisteria.txt,wlgirl.txt,wolf7kid.txt,wolfcran.txt,wolflamb.txt,yukon.txt,zombies.txt
this,0.501137,0.502927,0.507884,0.501672,0.506201,0.505996,0.503206,0.503708,0.50111,0.503758,...,0.503421,0.503854,0.503801,0.505261,0.500680,0.501445,0.0,0.502434,0.501171,0.502554
is,0.511432,0.504646,0.520298,0.500884,0.515584,0.501359,0.505088,0.510841,0.00000,0.512235,...,0.505493,0.513764,0.000000,0.515205,0.507017,0.000000,0.0,0.503864,0.507898,0.501753
shareware,0.508465,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
trial,0.505167,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.500155,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
project,0.505500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.505433,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
priveledge,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.503600
freshest,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.503600
wellmaintained,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.503600
sentenial,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.503600
