Trupti Shriyan                                                                                                                                1002223250
Programming Assignment                                                                                                                       Data Mining

In [28]:
#Importing Necessary functions
import os
import math
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [29]:
# Setting up processing functions
# These three module-level objects are shared by the document loader and the
# query vectoriser so that both sides are preprocessed identically.
# English stop-word list from NLTK, held in a set for fast membership tests.
stop_words_in_text = set(stopwords.words('english'))
# Porter stemmer used to reduce each token to its stem.
stemmer_text = PorterStemmer()
# Tokenizer that keeps only runs of ASCII letters; digits and punctuation
# act as separators and are dropped.
tokenizer_text = RegexpTokenizer(r'[a-zA-Z]+')

In [30]:
#Loading and preprocessing the documents
def load_documents(directory_path):
    """Read and preprocess every ``.txt`` file in ``directory_path``.

    Each file is decoded as windows-1252, lower-cased, tokenized with
    ``tokenizer_text``, stripped of the words in ``stop_words_in_text`` and
    stemmed with ``stemmer_text`` (all three are module-level objects).

    Args:
        directory_path: Directory that contains the ``.txt`` documents.

    Returns:
        A ``(docs, files)`` pair of parallel lists: ``docs[i]`` is the list of
        stemmed tokens for the file named ``files[i]``.

    Raises:
        OSError: If ``directory_path`` cannot be listed or a file cannot be
            opened.
    """
    docs = []
    files = []  # To track filenames for reference
    # Use the argument rather than a notebook-level global, and sort the
    # listing because os.listdir() returns entries in arbitrary order.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='windows-1252') as file:
            doc = file.read().lower()
        tokens = [
            stemmer_text.stem(token)
            for token in tokenizer_text.tokenize(doc)
            if token not in stop_words_in_text
        ]
        docs.append(tokens)
        files.append(file_name)
    return docs, files

In [31]:
# Specifying the path where my actual data is present
# (relative to the notebook's working directory)
root_path = 'US_Inaugural_Addresses'

In [32]:
# docs holds the token list of each document; files holds the matching
# file names at the same indices.
docs, files = load_documents(root_path)
print("Documents loaded and preprocessed.")

Documents loaded and preprocessed.


In [33]:
# Building the Term Frequency - Inverse Document Frequency (TF-IDF) model
def build_tfidf(docs):
    """Build length-normalised TF-IDF vectors for a tokenized corpus.

    The weight of a term in a document is
    ``(1 + log10(tf)) * log10(N / df)``, where ``tf`` is the term's count in
    the document, ``df`` the number of documents containing it and ``N`` the
    number of documents. Each document vector is then divided by its
    Euclidean length.

    Args:
        docs: List of documents, each a list of (already preprocessed) tokens.

    Returns:
        A ``(tfidf_vectors, document_frequency)`` pair. ``tfidf_vectors[i]``
        is a dict mapping each term of ``docs[i]`` to its normalised weight;
        ``document_frequency`` is a ``Counter`` mapping each term to the
        number of documents that contain it.
    """
    corpus_size = len(docs)

    # Count each term once per document, regardless of how often it occurs.
    document_frequency = Counter()
    for tokens in docs:
        document_frequency.update(set(tokens))

    tfidf_vectors = []
    for tokens in docs:
        tfidf = {}
        squared_length = 0
        for token, count in Counter(tokens).items():
            tf_value = 1 + math.log10(count)
            idf_value = math.log10(corpus_size / document_frequency[token])
            tfidf[token] = tf_value * idf_value
            squared_length += tfidf[token] ** 2

        document_length = math.sqrt(squared_length)
        # A vector whose terms all occur in every document has idf 0 for each
        # of them and therefore zero length (e.g. a one-document corpus).
        # Leave such a vector as all zeros instead of dividing by zero.
        if document_length > 0:
            for token in tfidf:
                tfidf[token] /= document_length
        tfidf_vectors.append(tfidf)
    return tfidf_vectors, document_frequency


In [34]:
# tfidf_vectors[i] is the weight dict for docs[i]; document_frequency is
# reused later when weighting query terms.
tfidf_vectors, document_frequency = build_tfidf(docs)
print("TF-IDF model built.")

TF-IDF model built.


In [35]:
# Query Vector Calculation 
def calculate_query_vector(query):
    """Turn a free-text query into a dict of TF-IDF weights.

    The query is lower-cased, tokenized, stop-word filtered and stemmed with
    the same module-level objects used for the documents. Terms that do not
    appear in ``document_frequency`` are left out of the result. The returned
    weights are not length-normalised.

    Args:
        query: The query string.

    Returns:
        A dict mapping each known query term to
        ``(1 + log10(tf)) * log10(len(docs) / df)``.
    """
    term_counts = Counter(
        stemmer_text.stem(word)
        for word in tokenizer_text.tokenize(query.lower())
        if word not in stop_words_in_text
    )
    return {
        term: (1 + math.log10(occurrences))
        * math.log10(len(docs) / document_frequency[term])
        for term, occurrences in term_counts.items()
        if term in document_frequency
    }

In [36]:
# Defining Cosine Similarity function
def cosine_similarity(vector_1, vector_2):
    """Return the cosine of the angle between two sparse vectors.

    Args:
        vector_1: Dict mapping terms to numeric weights.
        vector_2: Dict mapping terms to numeric weights.

    Returns:
        The dot product over the shared terms divided by the product of the
        two Euclidean norms, as a float; ``0.0`` when that product is zero.
    """
    shared_terms = set(vector_1) & set(vector_2)
    dot_product = sum(vector_1[term] * vector_2[term] for term in shared_terms)
    squared_norm_1 = sum(weight ** 2 for weight in vector_1.values())
    squared_norm_2 = sum(weight ** 2 for weight in vector_2.values())
    norm_product = math.sqrt(squared_norm_1) * math.sqrt(squared_norm_2)
    # An empty or all-zero vector has no direction; report no similarity.
    if norm_product:
        return float(dot_product) / norm_product
    return 0.0

In [37]:
def getidf(token):
    """Return the inverse document frequency of ``token``.

    The token is stemmed before lookup. A stem that is absent from
    ``document_frequency`` is treated as if it occurred in every document,
    which gives an idf of 0.

    Args:
        token: The word to look up.

    Returns:
        ``log10(len(docs) / df)`` for the stemmed token.
    """
    stem = stemmer_text.stem(token)
    corpus_size = len(docs)
    containing_docs = document_frequency.get(stem, corpus_size)
    return math.log10(corpus_size / containing_docs)


In [38]:
def getweight(file_name, token):
    """Return the normalised TF-IDF weight of ``token`` in one document.

    Args:
        file_name: Name of the document as stored in ``files``.
        token: The word to look up; it is stemmed before lookup.

    Returns:
        The stored weight, or ``0.0`` if the stem is not in that document.

    Raises:
        ValueError: If ``file_name`` is not in ``files``.
    """
    stem = stemmer_text.stem(token)
    # files and tfidf_vectors are parallel lists, so the position of the
    # file name selects the matching vector.
    document_vector = tfidf_vectors[files.index(file_name)]
    return document_vector.get(stem, 0.0)

In [39]:
def query(query_string):
    """Find the document most similar to ``query_string``.

    Args:
        query_string: Free-text query.

    Returns:
        A ``(file_name, similarity)`` tuple for the document with the highest
        cosine similarity to the query; on ties the earliest document in
        ``files`` wins. Returns ``("None", 0)`` when there are no documents.
    """
    query_vector = calculate_query_vector(query_string)
    scored = []
    for position, document_vector in enumerate(tfidf_vectors):
        score = cosine_similarity(query_vector, document_vector)
        scored.append((files[position], score))
    if not scored:
        return ("None", 0)
    # max() returns the first of several equal maxima, which matches taking
    # the head of a stable descending sort.
    return max(scored, key=lambda pair: pair[1])

In [40]:
# Query the system
# query() returns a (file_name, cosine_similarity) tuple for the best match.
results = query("We stand to-day with the Constitution")
print(f"The most relevant document: {results}")

The most relevant document: ('22_grant_1873.txt', 0.04422464468088289)


In [41]:
# Test the functions
# Each section prints one value per line, in the order listed.

# Inverse document frequency of individual terms.
for idf_term in ('british', 'union', 'dollar', 'constitution', 'power'):
    print("%.12f" % getidf(idf_term))
print("--------------")

# Normalised TF-IDF weight of a term within a specific document.
for weight_file, weight_term in (
    ('19_lincoln_1861.txt', 'states'),
    ('07_madison_1813.txt', 'war'),
    ('05_jefferson_1805.txt', 'false'),
    ('22_grant_1873.txt', 'proposition'),
    ('16_taylor_1849.txt', 'duties'),
):
    print("%.12f" % getweight(weight_file, weight_term))
print("--------------")

# Best-matching document and its cosine similarity for each query.
for test_query in (
    "executive power",
    "foreign government",
    "public rights",
    "people government",
    "states laws",
):
    print("(%s, %.12f)" % query(test_query))

0.875061263392
0.115393418702
1.176091259056
0.045757490561
0.029963223377
--------------
0.005140064841
0.019316338877
0.095677997843
0.130886844655
0.005091152688
--------------
(02_washington_1793.txt, 0.028040582186)
(16_taylor_1849.txt, 0.016937704621)
(11_jackson_1829.txt, 0.005111777487)
(02_washington_1793.txt, 0.007771588100)
(21_grant_1869.txt, 0.013452928185)
