In [None]:
import os
import math
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

# Load the corpus: every entry in `corpusroot` whose name begins with '0' or
# '1' is read as UTF-8 text and stored in `docs` under its file name.
corpusroot = './Datathon-2023/'
docs = {}
for filename in os.listdir(corpusroot):
    if not filename.startswith(('0', '1')):
        continue
    with open(os.path.join(corpusroot, filename), "r", encoding='utf-8') as handle:
        docs[filename] = handle.read()

# N is the number of documents loaded; it is the numerator of the idf ratio.
N = len(docs)

# Text-processing tools shared by the indexing code below and by query().
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
stemmer = PorterStemmer()

# doc_tokens maps each file name to its list of stemmed tokens. A token is
# lower-cased first, dropped if it is a stop word, and stemmed otherwise.
doc_tokens = {}
for filename, doc in docs.items():
    tokens = []
    for raw_token in tokenizer.tokenize(doc):
        lowered = raw_token.lower()
        if lowered not in stop_words:
            tokens.append(stemmer.stem(lowered))
    doc_tokens[filename] = tokens

# Document frequency: the number of documents in which each token occurs.
# Each document contributes its *unique* tokens exactly once, so the table is
# built in a single pass instead of scanning every token list once per
# vocabulary term.
doc_freq = {}
for tokens in doc_tokens.values():
    for token in set(tokens):
        doc_freq[token] = doc_freq.get(token, 0) + 1

# Inverse document frequency: idf[t] = log10(N / df[t]). Every token in
# doc_freq was seen in at least one document, so df is never zero.
idf = {}
for token, df in doc_freq.items():
    idf[token] = math.log10(N / df)

# Per-document term weights (not length-normalised here).
def calculate_weights(flag):
    """Return ``{filename: {token: weight}}`` for every document in ``doc_tokens``.

    The base weight of a token is its log-scaled term frequency,
    ``1 + log10(count of the token in the document)``.

    flag: when it compares equal to ``True`` the base weight is multiplied by
        ``idf[token]``; for any other value the base weight is stored as is.
    """
    # Equality (not truthiness) is kept on purpose so that exactly the same
    # argument values enable the idf factor as before.
    apply_idf = flag == True  # noqa: E712

    doc_weights = {}
    for filename, tokens in doc_tokens.items():
        # Raw number of occurrences of each token in this document.
        raw_counts = {}
        for token in tokens:
            raw_counts[token] = raw_counts.get(token, 0) + 1

        weights = {}
        for token, count in raw_counts.items():
            log_tf = 1 + math.log10(count)
            weights[token] = log_tf * idf[token] if apply_idf else log_tf
        doc_weights[filename] = weights
    return doc_weights


def query(qstring):
    """Return ``(filename, score)`` for the document most similar to ``qstring``.

    Scoring is cosine similarity with lnc.ltc weighting:

    * query terms: ``(1 + log10(tf)) * idf``, divided by the query vector length;
    * document terms: ``1 + log10(tf)`` (no idf), divided by the document
      vector length.

    The query is tokenised, stop-word filtered and stemmed with the same
    module-level tools used for the corpus. Terms that do not occur in the
    corpus get idf 0 and therefore contribute nothing.

    If no query term carries weight (empty query, only stop words, only
    unknown terms, or only terms with idf 0) every document scores 0 and the
    first document of the corpus is returned with score 0.0.

    Raises IndexError if the corpus contains no documents.
    """
    # Normalise the query text exactly like the documents.
    tokens = [
        stemmer.stem(token.lower())
        for token in tokenizer.tokenize(qstring)
        if token.lower() not in stop_words
    ]

    # Raw term counts, kept in first-occurrence order so that the score is
    # accumulated in a stable order.
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    # Query weights before length normalisation: log tf times idf.
    query_tf_idf = {}
    for token, count in counts.items():
        token_idf = getidf(token)
        if token_idf == -1:  # getidf's "not in corpus" sentinel
            token_idf = 0
        query_tf_idf[token] = (1 + math.log10(count)) * token_idf

    # Cosine-normalise the query vector. A zero-length vector is left alone:
    # dividing by it used to raise ZeroDivisionError, and every score is 0
    # in that case anyway.
    query_mag = math.sqrt(sum(weight * weight for weight in query_tf_idf.values()))
    if query_mag:
        for token in query_tf_idf:
            query_tf_idf[token] = query_tf_idf[token] / query_mag

    # Score every document. Only the query terms need normalising on the
    # document side, but the vector length still covers all document terms.
    values = {filename: 0.0 for filename in docs}
    doc_weights = calculate_weights(False)
    for filename, weights in doc_weights.items():
        doc_mag = math.sqrt(sum(weight * weight for weight in weights.values()))
        for token, query_weight in query_tf_idf.items():
            if token in weights:
                values[filename] += (weights[token] / doc_mag) * query_weight

    # Highest score first; the sort is stable, so ties keep corpus order.
    sorted_values = sorted(values.items(), key=lambda item: item[1], reverse=True)

    return sorted_values[0][0], sorted_values[0][1]

def getidf(token):
    """Return the inverse document frequency stored for ``token``.

    The token is looked up in the module-level ``idf`` table exactly as
    given (it is neither lower-cased nor stemmed here). Returns -1 when the
    token is not in the table.
    """
    return idf.get(token, -1)

def getweight(filename, token):
    """Return the tf-idf weight of ``token`` in document ``filename``.

    The token is lower-cased and stemmed before the lookup. The weight is
    taken from ``calculate_weights(True)``, i.e. ``(1 + log10(tf)) * idf``
    without length normalisation. Returns -1 when the stemmed token does not
    occur in the document.

    Raises KeyError if ``filename`` is not one of the indexed documents.
    """
    stem = stemmer.stem(token.lower())
    weights_for_doc = calculate_weights(True)[filename]
    return weights_for_doc.get(stem, -1)
    
# Demo output, in three sections separated by a dashed line:
# idf values, per-document tf-idf weights, and best match per query.
for term in ('british', 'union', 'war', 'power', 'great'):
    print("%.12f" % getidf(term))
print("--------------")
for doc_name, term in (
    ('02_washington_1793.txt', 'arrive'),
    ('07_madison_1813.txt', 'war'),
    ('12_jackson_1833.txt', 'union'),
    ('09_monroe_1821.txt', 'great'),
    ('05_jefferson_1805.txt', 'public'),
):
    print("%.12f" % getweight(doc_name, term))
print("--------------")
for text in (
    "pleasing people",
    "british war",
    "false public",
    "people institutions",
    "violated willingly",
):
    print("(%s, %.12f)" % query(text))