In [7]:
import math
import os
from collections import Counter, defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# Read every .txt file under corpus_root into memory, lower-cased and keyed by
# its filename.
corpus_root = './US_Inaugural_Addresses'
docs = {}
# os.listdir() returns entries in arbitrary order. Sorting makes the insertion
# order of `docs` reproducible; that order is inherited by the dictionaries
# derived from it and decides which document is reported when two documents
# tie on query score.
for filename in sorted(os.listdir(corpus_root)):
    if not filename.endswith('.txt'):
        continue
    # The context manager closes each handle as soon as it has been read,
    # instead of leaving it open until garbage collection.
    with open(os.path.join(corpus_root, filename), "r", encoding='windows-1252') as file:
        docs[filename] = file.read().lower()
        
# Split each document's text into runs of ASCII letters (one token per run)
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
tokenized_docs = {
    name: tokenizer.tokenize(text)
    for name, text in docs.items()
}

# Drop English stopwords from every document's token list.
sw = stopwords.words('english')
# `sw` is kept as returned for any code that already relies on it; membership
# tests go through a set so each lookup is constant-time instead of a scan of
# the whole stopword list for every token.
sw_set = frozenset(sw)
docs_without_sw = {}
for filename, tokens in tokenized_docs.items():
    docs_without_sw[filename] = [token for token in tokens if token not in sw_set]

# Reduce every remaining token to its Porter stem, document by document
stemmer = PorterStemmer()
stemmed_docs = {
    name: [stemmer.stem(word) for word in words]
    for name, words in docs_without_sw.items()
}

# Function to calculate IDF
def getidf(token):
    """Return the inverse document frequency of ``token`` over the corpus.

    The token is Porter-stemmed, then looked up in every stemmed document.

    Returns:
        ``log10(N / df)`` where N is the number of documents and df is the
        number of documents containing the stem, or -1 when no document
        contains it.
    """
    stem = PorterStemmer().stem(token)
    total_docs = len(docs_without_sw)

    containing = 0
    for doc_tokens in stemmed_docs.values():
        if stem in doc_tokens:
            containing += 1

    # Guard clause: a stem that never occurs has no defined IDF.
    if containing <= 0:
        return -1
    return math.log10(total_docs / containing)
    
# IDF for a token that has already been stemmed by the caller (getweight stems
# its token before calling this), so no stemming is applied here
def getidf_nostemming(token):
    """Return the inverse document frequency of an already-stemmed ``token``.

    Returns:
        ``log10(N / df)`` where N is the number of stemmed documents and df is
        the number of them containing ``token``, or -1 when none does.
    """
    total_docs = len(stemmed_docs)
    hits = [1 for doc_tokens in stemmed_docs.values() if token in doc_tokens]
    doc_freq = len(hits)

    # doc_freq is never negative, so "not positive" means "absent everywhere".
    if doc_freq == 0:
        return -1
    return math.log10(total_docs / doc_freq)

# Function to calculate TF-IDF weight
def getweight(filename, token):
    """Return the cosine-normalized TF-IDF weight of ``token`` in one document.

    The token is Porter-stemmed first. Each term of the document is weighted
    ``(1 + log10(tf)) * idf`` and the requested term's weight is divided by the
    Euclidean length of the document's full weight vector.

    Args:
        filename: key of the document in ``stemmed_docs``.
        token: unstemmed term to look up.

    Returns:
        The normalized weight, or 0 when the stemmed token does not occur in
        the document or the document vector has zero length.

    Raises:
        KeyError: if ``filename`` is not a key of ``stemmed_docs``.
    """
    stemmed_token = PorterStemmer().stem(token)

    # A single pass yields every term frequency; calling list.count() once per
    # distinct term rescans the whole document each time.
    term_counts = Counter(stemmed_docs[filename])
    tf = term_counts[stemmed_token]
    if tf == 0:
        return 0

    # Counter iterates in first-occurrence order, so the floating-point sum is
    # accumulated in the same order on every run (a set of strings is not
    # guaranteed to iterate in a stable order between interpreter runs).
    squared_norm = 0
    for term, term_tf in term_counts.items():
        term_weight = (1 + math.log10(term_tf)) * getidf_nostemming(term)
        squared_norm += term_weight ** 2
    magnitude_doc = math.sqrt(squared_norm)

    # Avoid division by zero: every weight is 0 when each term of the document
    # occurs in all documents (idf = log10(1) = 0).
    if magnitude_doc == 0:
        return 0

    tf_idf = (1 + math.log10(tf)) * getidf_nostemming(stemmed_token)
    return tf_idf / magnitude_doc

# Function to process a query and find the most relevant document
def query(qstring):
    """Return the document most similar to ``qstring`` and its cosine score.

    The query is lower-cased, split on whitespace and Porter-stemmed; its terms
    are weighted ``1 + log10(tf)`` (no idf). Document terms are weighted
    ``(1 + log10(tf)) * log10(N / df)``. The score is the cosine of the angle
    between the two weight vectors (the "ltc.lnc" scheme named in the original
    comments).

    Args:
        qstring: free-text query.

    Returns:
        ``(filename, score)`` for the highest-scoring document; on a tie the
        document that comes first in ``stemmed_docs`` wins. Returns
        ``("No documents found", -1)`` when nothing can be scored: the query
        has no terms, the corpus is empty, or every document vector has zero
        length.
    """
    stemmer = PorterStemmer()
    query_counts = Counter(stemmer.stem(word) for word in qstring.lower().split())
    query_tfidf = {token: 1 + math.log10(tf) for token, tf in query_counts.items()}

    max_score = -1
    max_score_document = "No documents found"

    # The query norm does not depend on the document, so compute it once. A
    # zero norm means the query has no terms; there is nothing to compare.
    query_mag = math.sqrt(sum(weight ** 2 for weight in query_tfidf.values()))
    if query_mag == 0:
        return (max_score_document, max_score)

    # Term frequencies per document and document frequencies per term are
    # gathered in one pass over the corpus, rather than rescanning every
    # document for every term of every document.
    num_docs = len(stemmed_docs)
    doc_counts = {filename: Counter(tokens) for filename, tokens in stemmed_docs.items()}
    doc_freq = Counter(token for counts in doc_counts.values() for token in counts)

    for filename, counts in doc_counts.items():
        # Every term here occurs in this document, so doc_freq[token] >= 1.
        doc_tfidf = {
            token: (1 + math.log10(tf)) * math.log10(num_docs / doc_freq[token])
            for token, tf in counts.items()
        }
        doc_mag = math.sqrt(sum(weight ** 2 for weight in doc_tfidf.values()))

        # A zero-length document vector has no defined cosine. Skip it rather
        # than fall through with an unassigned score or one left over from the
        # previous document.
        if doc_mag == 0:
            continue

        dot_product = sum(
            weight * doc_tfidf[token]
            for token, weight in query_tfidf.items()
            if token in doc_tfidf
        )
        similarity_score = dot_product / (query_mag * doc_mag)

        if similarity_score > max_score:
            max_score = similarity_score
            max_score_document = filename

    # Return the document with its score
    return (max_score_document, max_score)

# Sample IDF values
for _word in ('children', 'foreign', 'people', 'honor', 'great'):
    print("%.12f" % getidf(_word))
print("--------------")
# Sample normalized TF-IDF weights for (document, term) pairs
for _doc_name, _word in (
    ('19_lincoln_1861.txt', 'constitution'),
    ('23_hayes_1877.txt', 'public'),
    ('25_cleveland_1885.txt', 'citizen'),
    ('09_monroe_1821.txt', 'revenue'),
    ('05_jefferson_1805.txt', 'press'),
):
    print("%.12f" % getweight(_doc_name, _word))
print("--------------")
# Sample queries: best-matching document and its score
for _qstring in (
    "pleasing people",
    "war offenses",
    "british war",
    "texas government",
    "cuba government",
):
    print("(%s, %.12f)" % query(_qstring))

0.574031267728
0.134698573897
0.029963223377
0.079181246048
0.045757490561
--------------
0.005351714939
0.003659885335
0.001990612219
0.023996540734
0.039311641490
--------------
(03_adams_john_1797.txt, 0.044190057362)
(20_lincoln_1865.txt, 0.136596561747)
(07_madison_1813.txt, 0.082936482104)
(15_polk_1845.txt, 0.070347633806)
(29_mckinley_1901.txt, 0.096775365055)
