In [1]:
import math
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# data and the stop-word corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop words, kept as a set for membership checks.
stop_words = set(stopwords.words('english'))

# Folder the documents are read from when the index is built.
preprocessed_directory = 'preprocessed_data'
# Empty containers: a vocabulary set and a list of all file names.
all_words, filenames = set(), []

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rida\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rida\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# Positional index: {word: {filename: [positions]}}.
# A position is the word's offset in the document's whitespace-split token
# list, not a character offset: for a document containing "This is",
# "This" is at position 0 and "is" at position 1.
positional_index = {}
# Per-document term counts: {filename: {word: count}}.
term_count = {}
# Per-document summary:
# {filename: [number of tokens, count of the most frequent term, most frequent term]}.
# An empty document is stored as [0, 0, None].
term_total_max = {}

# Load all words from the preprocessed files and build the positional index.
for filename in os.listdir(preprocessed_directory):
    file_path = os.path.join(preprocessed_directory, filename)
    # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    doc_counts = {}
    for index, word in enumerate(words):
        positional_index.setdefault(word, {}).setdefault(filename, []).append(index)
        doc_counts[word] = doc_counts.get(word, 0) + 1

    term_count[filename] = doc_counts
    if doc_counts:
        top_term = max(doc_counts, key=doc_counts.get)
        term_total_max[filename] = [len(words), doc_counts[top_term], top_term]
    else:
        # max() raises ValueError on an empty dict, so handle empty files explicitly.
        term_total_max[filename] = [0, 0, None]

# Write the positional index to a file, one "word: doc [p1, p2]; doc [...]"
# line per term followed by a blank line. The encoding is explicit because the
# inputs are read as UTF-8 and the platform default may not be able to encode
# every term.
with open('positional_index.txt', 'w', encoding='utf-8') as file:
    for word, documents in positional_index.items():
        entries = [
            f"{doc} [{', '.join(map(str, positions))}]"
            for doc, positions in documents.items()
        ]
        file.write(f"{word}: {'; '.join(entries)}\n\n")

print("Positional index complete")

Positional index complete


In [16]:
# Longest phrase (in words) accepted from the user.
MAX_QUERY_WORDS = 5

# Prompt until the phrase is short enough.
# NOTE(review): the phrase is only lower-cased and split on whitespace here;
# confirm whether it should go through the same preprocessing as the indexed
# documents, since positions refer to the preprocessed token stream.
while True:
    phrase = input("Please enter a phrase:   ")
    phrase = phrase.lower()

    words = phrase.split()
    if len(words) > MAX_QUERY_WORDS:
        print(f"Query length must be at most {MAX_QUERY_WORDS} words.")
    else:
        break

# {document: [positions covered by every occurrence of the phrase]}
results = {}

# A phrase can only occur if every one of its words is in the index.
if words and all(word in positional_index for word in words):
    postings = [positional_index[word] for word in words]
    first_postings, later_postings = postings[0], postings[1:]

    for doc, start_positions in first_postings.items():
        # The document must contain every word of the phrase.
        if not all(doc in posting for posting in later_postings):
            continue

        # Position sets of the 2nd, 3rd, ... query word in this document.
        later_positions = [set(posting[doc]) for posting in later_postings]

        # The phrase occurs at `start` when the k-th query word sits exactly
        # at start + k, i.e. the words appear adjacent and in query order.
        for start in start_positions:
            if all(start + offset in positions
                   for offset, positions in enumerate(later_positions, start=1)):
                results.setdefault(doc, []).extend(range(start, start + len(words)))

# Output the results:
for doc, pos_list in results.items():
    print(f"{doc}: {sorted(set(pos_list))}")  # Remove duplicates and sort
    print("\n")

dakota.txt: [15921, 15922]


hitch2.txt: [23937, 23938]


sucker.txt: [111, 112]




In [17]:
##################################################################
# TF-IDF MATRIX
#
# Inverse document frequency for every term of the positional index:
#     idf(t) = ln(N / (df(t) + 1))
# where N is the number of indexed documents and df(t) is the number of
# documents containing t (the length of the term's postings dictionary).
# Because of the +1 in the denominator, a term that occurs in every document
# gets a slightly negative idf.

# Corpus size, taken from the documents actually indexed instead of a
# hard-coded number so it cannot drift from the data.
DOC_TOTAL = len(term_count)

idf = {
    word: math.log(DOC_TOTAL / (len(postings) + 1))
    for word, postings in positional_index.items()
}
    

In [18]:
# Plan for the final solution: one TF-IDF dataframe per TF scheme (5 schemes),
# using the per-document totals / maximum term counts gathered while indexing,
# and a query vector as long as the vocabulary.
#
# Term-by-document weight matrix, initialised to 0.0: one row per term that
# has an idf value, one column per document in term_count.
matrix = pd.DataFrame(0.0, index=[*idf], columns=[*term_count])

In [42]:
# Term-frequency schemes. Each entry maps the variant name to the factor that
# gets multiplied by the term's idf, given the term's count in the document,
# the document's token total and the document's highest term count.
TF_SCHEMES = {
    'bin': lambda count, total, peak: 1,
    'rc': lambda count, total, peak: count,
    'tf': lambda count, total, peak: count / total,
    'ln': lambda count, total, peak: math.log(1 + count),
    'dn': lambda count, total, peak: 0.5 + 0.5 * (count / peak),
}

# Keep asking until a known variant name is entered.
tf_variant = input("Select term freq variant: bin | rc | tf | ln | dn")
while tf_variant not in TF_SCHEMES:
    print("Invalid variant.")
    tf_variant = input("Select term freq variant: bin | rc | tf | ln | dn")

# Write tf * idf for every (term, document) pair that actually occurs.
weigh = TF_SCHEMES[tf_variant]
for doc, counts in term_count.items():
    total, peak = term_total_max[doc][0], term_total_max[doc][1]
    for term, count in counts.items():
        matrix.loc[term, doc] = weigh(count, total, peak) * idf[term]

In [47]:
# Read the free-text query and normalise it: lower-case, tokenize, drop stop
# words, strip non-word characters and underscores, and discard anything
# shorter than two characters.
query_text = input("enter query: ").lower()
kept_tokens = (tok for tok in word_tokenize(query_text) if tok not in stop_words and tok)
stripped_tokens = (re.sub(r'[\W_]+', '', tok) for tok in kept_tokens)
q = [tok for tok in stripped_tokens if len(tok) > 1]

# Number of times each remaining token occurs in the query.
query_count = {}
for token in q:
    query_count[token] = query_count.get(token, 0) + 1

# Query vector over the matrix vocabulary, all zeros to begin with.
query_vector = pd.Series(0.0, index=matrix.index)

In [48]:
# Weight the query by raw term count. Only terms already in the vocabulary are
# written: assigning an unknown label would append a new entry to the Series,
# leaving it longer than the matrix has rows and breaking the dot products
# computed against the document columns.
unknown_terms = []
for word, count in query_count.items():
    if word in query_vector.index:
        query_vector[word] = count
    else:
        unknown_terms.append(word)

if unknown_terms:
    print(f"Ignoring query terms not in the vocabulary: {unknown_terms}")

In [49]:
# Score every document against the query in one pass.
#   rank     - dot product of the query vector with each document column
#   rank_cos - cosine similarity (dot product divided by both vector norms)

# Align the query with the matrix rows so the two always have the same length
# and term order; terms missing from the query vector count as 0.
query_values = query_vector.reindex(matrix.index, fill_value=0.0).to_numpy(dtype=float)
doc_values = matrix.to_numpy(dtype=float)

# (n_terms,) @ (n_terms, n_docs) -> one dot product per document.
dot_scores = query_values @ doc_values
rank = pd.Series(dot_scores, index=matrix.columns, dtype=float)

# Cosine similarity is undefined when either vector has zero length (empty
# document, or no query term in the vocabulary); report 0.0 there instead of
# dividing by zero and producing NaN.
norm_products = np.linalg.norm(query_values) * np.linalg.norm(doc_values, axis=0)
cos_scores = np.divide(
    dot_scores,
    norm_products,
    out=np.zeros_like(dot_scores),
    where=norm_products > 0,
)
rank_cos = pd.Series(cos_scores, index=matrix.columns, dtype=float)




In [50]:
# Show the five best documents by dot-product score, a blank line, then the
# five best by cosine similarity.
print(
    rank.sort_values(ascending=False).head(5),
    rank_cos.sort_values(ascending=False).head(5),
    sep="\n\n",
)

sucker.txt      0.077948
vampword.txt    0.019607
empsjowk.txt    0.004563
lmtchgrl.txt    0.004432
narciss.txt     0.003949
dtype: float64

sucker.txt      0.421447
vampword.txt    0.046678
roger1.txt      0.044543
running.txt     0.035433
empsjowk.txt    0.034567
dtype: float64


In [36]:
# Notes template for comparing the five term-frequency weighting schemes.
# TODO: the pros/cons entries below are still empty. The string is the cell's
# last expression, so it is echoed as the cell output.
"""
BINARY
pros:
cons:

RAW COUNT
pros:
cons:

TERM FREQUENCY
pros:
cons:

LOG NORMALIZATION
pros:
cons:

DOUBLE NORMALIZATION
pros:
cons:
"""

'\nBINARY\npros:\ncons:\n\nRAW COUNT\npros:\ncons:\n\nTERM FREQUENCY\n\n\n'