In [15]:
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem.snowball import SnowballStemmer
from collections import defaultdict
from nltk.corpus import stopwords
import math

In [29]:
class CorpusReader_TFIDF:
    """
    Initializes the CorpusReader_TFIDF with specified parameters.

    :param corpus: An NLTK corpus object.
    :param tf: Term frequency calculation method ('raw' or 'log').
    :param idf: Inverse document frequency calculation method ('base' or 'smooth').
    :param stopWord: Stopword removal strategy ('none', 'standard', or filepath to custom stopwords).
    :param toStem: Boolean indicating whether to apply stemming.
    :param stemFirst: Boolean indicating whether to stem before removing stopwords.
    :param ignoreCase: Boolean indicating whether to ignore case.
    """
    def __init__(self, corpus, tf="raw", idf="base", stopWord="none", toStem=False, stemFirst=False, ignoreCase=True):
        self.corpus = corpus
        self.tf_method = tf
        self.idf_method = idf
        self.stopWord =  stopWord 
        self.toStem = toStem
        self.stemFirst = stemFirst
        self.ignoreCase = ignoreCase
        self.stemmer = SnowballStemmer("english") if toStem else None
        self.idf_values = {}
        self.tfidf_vectors = {}
        nltk.download('stopwords')
        

    ####### Shared methods for both TF and IDF calculations #######
    def fileids(self):
        return self.corpus.fileids()

    def raw(self, fileids=None):
        return self.corpus.raw(fileids=fileids)

    def words(self, fileids=None):
        original_words = self.corpus.words(fileids=fileids)
        processed_words = self._preprocess_words(original_words)
        return processed_words

    ####### Methods for TF-IDF calculations #######
    def tfidf(self, fileid, returnZero=False):
        """Calculate TF-IDF for a specific document, optionally including terms with a TF-IDF value of 0."""
        # Ensure IDF values are calculated
        if not self.idf_values:
            self.idf1() # Ensure IDF values are calculated

        # Retrieve the processed words for the specified document
        words = self.words(fileids=fileid)
        # Calculate term frequency for these words
        tf = self._calculate_tf(words)

        # Calculate TF-IDF vector
        tfidf_vector = {}
        for word, tf_val in tf.items():
            idf_val = self.idf_values.get(word, 0)
            tfidf_score = tf_val * idf_val
            if tfidf_score > 0 or returnZero:
                tfidf_vector[word] = tfidf_score
            elif returnZero and idf_val == 0:  # Explicitly handle the case where a term is not in the corpus
                tfidf_vector[word] = 0

        return tfidf_vector


    def tfidfAll(self, returnZero=False):
        """Calculate and return TF-IDF vectors for all documents in the corpus."""
        self.idf1() # Ensure IDF values are calculated
        for fileid in self.fileids():
            self.tfidf_vectors[fileid] = self.tfidf(fileid)
        return self.tfidf_vectors

    def tfidfNew(self, words):
        """Calculate TF-IDF for a new document represented by a list of words."""
        if not self.idf_values:  # Ensure IDF values are calculated
            self._calculate_idf()
        processed_words = self._preprocess_words(words)
        tf = self._calculate_tf(processed_words)
        tfidf = {word: tf_val * self.idf_values.get(word, 0) for word, tf_val in tf.items()}
        return tfidf

    def idf1(self):
        """Return the IDF values for all terms in the corpus."""
        if not self.idf_values:  # Ensure IDF values are calculated
            self._calculate_idf()
        return self.idf_values
    
    # Cosine similarity requires comparing the TF-IDF vectors of two documents.
    def cosine_sim(self, fileid1, fileid2):
        """Calculate the cosine similarity between two documents."""
        tfidf1 = self.tfidf(fileid1)
        tfidf2 = self.tfidf(fileid2)
        
        # Calculate dot product
        dot_product = sum(tfidf1.get(word, 0) * tfidf2.get(word, 0) for word in tfidf1)
        
        # Calculate norms
        norm1 = math.sqrt(sum(val ** 2 for val in tfidf1.values()))
        norm2 = math.sqrt(sum(val ** 2 for val in tfidf2.values()))
        
        # Compute cosine similarity
        if norm1 * norm2 == 0:
            return 0  # Avoid division by zero
        else:
            return dot_product / (norm1 * norm2)
   

    def cosine_sim_new(self, words, fileid):
        """Calculate the cosine similarity between a new document (represented by a list of words)
        and the document specified by fileid."""
        # Calculate TF-IDF for the new document using the provided list of words
        tfidf_new_doc = self.tfidfNew(words)

        # Calculate TF-IDF for the existing document specified by fileid
        tfidf_existing_doc = self.tfidf(fileid)

        # Calculate dot product
        dot_product = sum(tfidf_new_doc.get(word, 0) * tfidf_existing_doc.get(word, 0) for word in tfidf_new_doc)

        # Calculate norms
        norm_new_doc = math.sqrt(sum(val ** 2 for val in tfidf_new_doc.values()))
        norm_existing_doc = math.sqrt(sum(val ** 2 for val in tfidf_existing_doc.values()))

        # Compute cosine similarity
        if norm_new_doc * norm_existing_doc == 0:
            return 0  # Avoid division by zero
        else:
            return dot_product / (norm_new_doc * norm_existing_doc)

    def query(self, words):
        """Return a list of (document, cosine_sim) tuples that calculate the cosine similarity between
        the 'new' document (specified by the list of words as the document) and each document in the corpus.
        The list is ordered in decreasing order of cosine similarity.
        """
        # Calculate TF-IDF for the new document represented by the list of words
        tfidf_new_doc = self.tfidfNew(words)

        # Calculate cosine similarity between the new document and each document in the corpus
        similarity_scores = []
        for fileid in self.fileids():
            cosine_similarity = self.cosine_sim(fileid, words)
            similarity_scores.append((fileid, cosine_similarity))

        # Sort the list by cosine similarity in decreasing order
        similarity_scores.sort(key=lambda x: x[1], reverse=True)

        return similarity_scores
    
    ####### Helper methods #######
    # Use dynamic preproccessing instead of preproccessing the whole corpus we leave the original corpus untouched
    # and preproccess the words on the on each method call
    def _preprocess_words(self, words):
        # Load stop words
        stop_words = set()
        if self.stopWord == "standard":
            stop_words = set(stopwords.words("english"))
        elif self.stopWord != "none":
            stop_words = self._load_stopwords_from_file(self.stopWord)

        # Case normalization
        if self.ignoreCase:
            words = [word.lower() for word in words]

        # Stem stop words if stemFirst is True
        if self.stemFirst and self.toStem:
            stop_words = {self.stemmer.stem(word) for word in stop_words}

        # Apply stemming and stop words removal
        if self.toStem:
            if self.stemFirst:  # Stem first, then remove stop words
                stemmed_words = [self.stemmer.stem(word) for word in words]
                return [word for word in stemmed_words if word not in stop_words]
            else:  # Remove stop words first, then stem
                filtered_words = [word for word in words if word not in stop_words]
                return [self.stemmer.stem(word) for word in filtered_words]
        else:
            return [word for word in words if word not in stop_words]
        
    def _calculate_tf(self, words):
        """Calculate term frequency for a list of words."""
        tf = {}
        for word in words:
            if word in tf:
                tf[word] += 1
            else:
                tf[word] = 1
        
        if self.tf_method == "log":
            for word, count in tf.items():
                tf[word] = 1 + math.log(count, 2)
        
        return tf
    
    def _calculate_idf(self):
        """Calculate inverse document frequency for all words in the corpus."""
        df = {}
        total_docs = len(self.fileids())
        for fileid in self.fileids():
            words = set(self.words(fileids=fileid))  # Ensure uniqueness
            for word in words:
                if word in df:
                    df[word] += 1
                else:
                    df[word] = 1
        
        for word, count in df.items():
            if self.idf_method == "smooth":
                self.idf_values[word] = math.log(1 + total_docs / (1 + count), 2)
            else:  # "base"
                self.idf_values[word] = math.log(total_docs / count, 2)
    
    def _load_stopwords_from_file(self, filename):
        """Load stopwords from a given file."""
        try:
            with open(filename, 'r') as file:
                stopwords_list = file.read().splitlines()
            return set(stopwords_list)
        except FileNotFoundError:
            print(f"Stopwords file {filename} not found.")
            return set()

    



In [30]:
from nltk.corpus import inaugural, PlaintextCorpusReader
import nltk

nltk.download('inaugural')
nltk.download('punkt')
myCorpus = CorpusReader_TFIDF(inaugural, tf="log", idf="smooth", stopWord="standard", toStem=True)


[nltk_data] Downloading package inaugural to
[nltk_data]     /Users/tango.tew/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tango.tew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tango.tew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:

print(len(inaugural.words()))
print(inaugural.sents())
print(len(inaugural.sents()))
print(inaugural.fileids())
print(inaugural.sents(['1789-washington.txt']))

152901
[['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':'], ['Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ...]
5220
['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt',

In [27]:
# raw = myCorpus.raw(fileids="1789-Washington.txt")
raw = myCorpus.raw(fileids="1789-Washington.txt")
print("Raw text:", raw[:100])

Raw text: Fellow-Citizens of the Senate and of the House of Representatives:

Among the vicissitudes incident 


In [10]:
print(myCorpus.words('1789-Washington.txt')[:10])

['fellow', '-', 'citizen', 'senat', 'hous', 'repres', ':', 'among', 'vicissitud', 'incid']


In [11]:
print(myCorpus.tfidf('1789-Washington.txt'))

{'fellow': 2.8665537662257115, '-': 1.0515303006400822, 'citizen': 3.4486102509384784, 'senat': 2.228818690495881, 'hous': 4.320929344386492, 'repres': 3.760836768494655, ':': 1.3519853287435422, 'among': 1.2270689085459212, 'vicissitud': 3.237039197300849, 'incid': 3.0660891904577725, 'life': 1.1243281350022014, 'event': 3.963705306579482, 'could': 3.68436213484302, 'fill': 2.228818690495881, 'greater': 1.5688428353578792, 'anxieti': 2.3025627700204314, 'notif': 4.930737337562887, 'transmit': 3.0660891904577725, 'order': 2.799861213777271, ',': 7.043212378658054, 'receiv': 1.9296106721086022, '14th': 4.930737337562887, 'day': 2.3813671232182196, 'present': 4.273010011243041, 'month': 2.786596361890807, '.': 5.393520816249033, 'one': 3.238303402483215, 'hand': 4.12652740527662, 'summon': 5.573192723781614, 'countri': 3.4052785294760732, 'whose': 3.1376856707157583, 'voic': 3.342754505077259, 'never': 2.313008971359982, 'hear': 2.6698513983076695, 'vener': 3.0660891904577725, 'love': 2.

In [31]:
q = myCorpus.tfidfAll()
for x in q:
   print(x, q[x])

1789-Washington.txt {'fellow': 2.8665537662257115, '-': 1.0515303006400822, 'citizen': 3.4486102509384784, 'senat': 2.228818690495881, 'hous': 4.320929344386492, 'repres': 3.760836768494655, ':': 1.3519853287435422, 'among': 1.2270689085459212, 'vicissitud': 3.237039197300849, 'incid': 3.0660891904577725, 'life': 1.1243281350022014, 'event': 3.963705306579482, 'could': 3.68436213484302, 'fill': 2.228818690495881, 'greater': 1.5688428353578792, 'anxieti': 2.3025627700204314, 'notif': 4.930737337562887, 'transmit': 3.0660891904577725, 'order': 2.799861213777271, ',': 7.043212378658054, 'receiv': 1.9296106721086022, '14th': 4.930737337562887, 'day': 2.3813671232182196, 'present': 4.273010011243041, 'month': 2.786596361890807, '.': 5.393520816249033, 'one': 3.238303402483215, 'hand': 4.12652740527662, 'summon': 5.573192723781614, 'countri': 3.4052785294760732, 'whose': 3.1376856707157583, 'voic': 3.342754505077259, 'never': 2.313008971359982, 'hear': 2.6698513983076695, 'vener': 3.06608919

In [35]:
d1 = "a1 a2 a2 a3"
d2 = "a3 a4 a3 a2 a2"    
d3 = "a1 a5 a2 a5 a5 "     
d4 = "a3 a3 a5 a5 a5"
d5 = "(a2 a3 a5)" 

nltk.download('inaugural')
# Define a list of words representing a new document
new_document_words = ['1973-Nixon.txt', '1977-Carter.txt', '1981-Reagan.txt']

# Call the query method to calculate cosine similarity
similarity_scores = myCorpus.query(new_document_words)
print(similarity_scores)
# # Print or inspect the similarity scores
# for document, cosine_sim in similarity_scores:
#     print(f"Document: {document}, Cosine Similarity: {cosine_sim}")

[nltk_data] Downloading package inaugural to
[nltk_data]     /Users/tango.tew/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!


[('1981-Reagan.txt', 0.806535043760335), ('1973-Nixon.txt', 0.6298438621110759), ('1977-Carter.txt', 0.6128445229593812), ('1985-Reagan.txt', 0.3804170226756742), ('1969-Nixon.txt', 0.36048397558746786), ('1997-Clinton.txt', 0.3569734924948946), ('2009-Obama.txt', 0.3463747092384255), ('1989-Bush.txt', 0.3413308665335078), ('2021-Biden.txt', 0.3277259432923014), ('1993-Clinton.txt', 0.3257257561335153), ('2013-Obama.txt', 0.32534622266433644), ('1961-Kennedy.txt', 0.3240205987936336), ('1953-Eisenhower.txt', 0.32224568409345583), ('2001-Bush.txt', 0.32030806331330797), ('1957-Eisenhower.txt', 0.30881939177373724), ('1937-Roosevelt.txt', 0.3071608522460664), ('1949-Truman.txt', 0.3042595553732829), ('1925-Coolidge.txt', 0.3019737973565337), ('1921-Harding.txt', 0.2947063195713381), ('1853-Pierce.txt', 0.29316184972591236), ('1929-Hoover.txt', 0.28826221458945833), ('1965-Johnson.txt', 0.28221528478242514), ('2005-Bush.txt', 0.28114567899173176), ('2017-Trump.txt', 0.2736464246479291), (

In [14]:
print(myCorpus.cosine_sim('1789-Washington.txt', '2021-Biden.txt'))

0.13522251841597055


In [18]:
print(myCorpus.cosine_sim_new(['citizens', 'economic', 'growth', 'economic'], '2021-Biden.txt')

0.0017670944271204185


In [37]:
from nltk.corpus import inaugural, PlaintextCorpusReader
from CorpusReader_TFIDF import *

print(len(inaugural.words()))
print(inaugural.sents())
print(len(inaugural.sents()))
print(inaugural.fileids())
print(inaugural.sents(['1789-washington.txt']))


myCorpus = CorpusReader_TFIDF(inaugural)
print(myCorpus.tfidf('1789-Washington.txt'))

# print("-----")

q = myCorpus.tfidfAll()
for x in q:
   print(x, q[x])

# print("-----")

print(myCorpus.cosine_sim('1789-Washington.txt', '2021-Biden.txt'))

# print("-----")

print(myCorpus.cosine_sim_new(['citizens', 'economic', 'growth', 'economic'], '2021-Biden.txt'))


#  This is for testing your own corpus
#
#  create a set of text files, store them in a directory specified from 'rootDir' variable
#
#  

rootDir = './test_files/'   # change that to the directory where the files are
newCorpus = PlaintextCorpusReader(rootDir, '.*')
tfidfCorpus = CorpusReader_TFIDF(newCorpus)

q = tfidfCorpus.tfidfAll()
for x in q:
   print(x, q[x])

# print("-----\n")




152901
[['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':'], ['Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ...]
5220
['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt',

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tango.tew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1789-Washington.txt {'fellow': 0.7163605787613494, '-': 0.12775554719837268, 'citizens': 0.6387777359918634, 'senate': 2.560714954474479, 'house': 5.425436095839058, 'representatives': 4.150576254608475, ':': 0.6731896837328915, 'among': 0.45637829465974333, 'vicissitudes': 3.5607149544744794, 'incident': 3.297680548640685, 'life': 0.26793320524663305, 'no': 0.39802428157679787, 'event': 5.425436095839058, 'could': 2.385540624334506, 'filled': 3.5607149544744794, 'me': 2.8726487464386263, 'with': 0.4192549219825736, 'greater': 1.024662054234269, 'anxieties': 4.2976805486406855, 'than': 0.9283355687918522, 'which': 1.7911092670955904, 'notification': 5.882643049361842, 'was': 1.1907221945627406, 'transmitted': 4.2976805486406855, 'your': 7.943787444256572, 'order': 2.049324108468538, 'received': 2.560714954474479, 'on': 0.746295527956496, '14th': 5.882643049361842, 'day': 1.194481660999186, 'present': 3.766800162084374, 'month': 4.882643049361842, 'one': 1.0717328209865322, 'hand': 4.07

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tango.tew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
