In [2]:
%%time
import nltk
import os
import math
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Directory that holds the corpus text files (machine-specific absolute path).
corpusroot = "C:/Users/rahul/Downloads/P1/P1/US_Inaugural_Addresses"
# A token is a maximal run of ASCII letters; digits and punctuation are dropped.
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
# Stored as a set: the stop-word list is consulted once per token, and a set
# makes each `in` test O(1) instead of a linear scan of the list.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

count = 0                  # number of corpus documents read
All_tokens = dict()        # filename -> list of stemmed, stop-word-free tokens
Document_weights = dict()  # filename -> {term: normalised tf-idf weight}
idf_values = dict()        # term -> document frequency, later replaced by idf

# document preprocessing
# Read each corpus file, reduce it to lower-cased, stop-word-free, stemmed
# tokens, and count in how many documents every stemmed term occurs.
for filename in os.listdir(corpusroot):
    # Only files whose names start with 0, 1, 2 or 3 are treated as documents.
    if not filename.startswith(('0', '1', '2', '3')):
        continue
    count = count + 1
    # `with` closes the handle even if reading or decoding raises.
    with open(os.path.join(corpusroot, filename), "r", encoding='windows-1252') as file:
        doc = file.read()
    doc = doc.lower()
    tokens = tokenizer.tokenize(doc)
    # Stop words are removed before stemming, so the check is on the raw token.
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    All_tokens[filename] = filtered_tokens
    # Each distinct term counts once per document (document frequency).
    token_set = set(filtered_tokens)
    for token in token_set:
        idf_values[token] = idf_values.get(token, 0) + 1

            

# does stemming for the given token:
def preprocessing(token):
    """Lower-case *token* and return its Porter stem.

    Returns ``None`` when the lower-cased token is a stop word. ``None`` is
    never a key of the term dictionaries nor an element of a token list, so
    callers that test membership treat a stop word as "term not present".
    """
    token = token.lower()
    if token in stop_words:
        # Without this early return the function would reach its return
        # statement with no stem assigned and raise UnboundLocalError.
        return None
    return PorterStemmer().stem(token)

# Replace every stored document frequency with idf = log10(N / df), in place.
# Only values are reassigned, so iterating the dict's keys meanwhile is safe.
for word in idf_values:
    idf_values[word] = math.log10(count / idf_values[word])

# idf calculation:
def getidf(token):
    """Return the idf stored for *token* after preprocessing it.

    The token is passed through ``preprocessing`` first; if the result is not
    a key of ``idf_values`` the function returns -1.
    """
    stem = preprocessing(token)
    return idf_values.get(stem, -1)


# tf-idf calculation:
def getweight(filename, token):
    """Return the normalised tf-idf weight of *token* in document *filename*.

    Returns 0 when *filename* is not a key of ``All_tokens`` or when the
    preprocessed token does not occur in that document's token list.
    """
    # Unknown document: answer before the token is even preprocessed.
    if filename not in All_tokens:
        return 0
    document = All_tokens[filename]
    stem = preprocessing(token)
    if stem in document:
        # The document's weight vector is recomputed on every call.
        return document_weights(document, False)[stem]
    return 0
    
    
# calculate normalised tf-idf weight for a document
def document_weights(document, query):
    """Return length-normalised term weights for a list of stemmed tokens.

    Each distinct term gets the raw weight ``(1 + log10(tf)) * idf`` where
    ``tf`` is its count in *document*. When *query* is ``False`` the idf is
    read from ``idf_values`` (a ``KeyError`` is raised for a term missing
    there); otherwise idf is taken as 1. Raw weights are then divided by the
    Euclidean length of the weight vector.

    If that length is 0 (no terms, or every raw weight is 0 because every
    idf is 0) the un-normalised weights are returned as they are instead of
    dividing by zero.

    Returns a dict mapping term -> weight.
    """
    doc_weights = dict()
    for word, occurrences in Counter(document).items():
        ltf = 1 + math.log10(occurrences)
        # `== False` rather than `not query` keeps the behaviour for
        # non-bool arguments exactly as callers have always seen it.
        if query == False:
            idf = idf_values[word]
        else:
            idf = 1
        doc_weights[word] = idf * ltf
    magnitude = math.sqrt(sum(x ** 2 for x in doc_weights.values()))
    if magnitude == 0:
        # Nothing to normalise; dividing would raise ZeroDivisionError.
        return doc_weights
    return {word: weight / magnitude for word, weight in doc_weights.items()}

# Compute the normalised tf-idf vector of every loaded document once and
# store it under the document's filename.
for filename, stemmed_doc in All_tokens.items():
    Document_weights[filename] = document_weights(stemmed_doc, False)

# query-doc similarity:
def query(qstring):
    """Return ``(filename, score)`` of the document scoring highest for *qstring*.

    The query is lower-cased, tokenised, stripped of stop words and stemmed,
    then weighted with ``document_weights`` in query mode. A document's score
    is the sum, over the terms it shares with the query, of the products of
    the two weights. If several documents share the highest score, the one
    visited last in ``Document_weights`` is returned.
    """
    terms = [
        stemmer.stem(raw)
        for raw in tokenizer.tokenize(qstring.lower())
        if raw not in stop_words
    ]
    q_weights = document_weights(terms, True)

    score = {}
    for name, weights in Document_weights.items():
        shared_terms = q_weights.keys() & weights.keys()
        score[name] = sum([q_weights[term] * weights[term] for term in shared_terms])

    max_score = max(score.values())
    # Keep overwriting so that the last document with the top score wins.
    for name, value in score.items():
        if value == max_score:
            filename = name
    return filename, max_score
        
    
    
    
# Sample run: idf of a few terms, tf-idf weight of a term in a given
# document, and the best-matching document for a few queries.
for term in ('children', 'foreign', 'people', 'honor', 'great'):
    print("%.12f" % getidf(term))
print("--------------")
for doc_name, term in (
    ('19_lincoln_1861.txt', 'constitution'),
    ('23_hayes_1877.txt', 'public'),
    ('25_cleveland_1885.txt', 'citizen'),
    ('09_monroe_1821.txt', 'revenue'),
    ('05_jefferson_1805.txt', 'press'),
):
    print("%.12f" % getweight(doc_name, term))
print("--------------")
for phrase in (
    "pleasing people",
    "war offenses",
    "british war",
    "texas government",
    "cuba government",
):
    print("(%s, %.12f)" % query(phrase))

    
    
    
    

0.574031267728
0.134698573897
0.029963223377
0.079181246048
0.045757490561
--------------
0.005351714939
0.003659885335
0.001990612219
0.023996540734
0.039311641490
--------------
(03_adams_john_1797.txt, 0.044190057362)
(20_lincoln_1865.txt, 0.136596561747)
(07_madison_1813.txt, 0.082936482104)
(15_polk_1845.txt, 0.070347633806)
(29_mckinley_1901.txt, 0.096775365055)
CPU times: total: 1.2 s
Wall time: 2.07 s
