In [84]:
import nltk
from nltk.corpus import stopwords

In [85]:
with open("common_topics") as f:
    texts = [line.strip() for line in f.readlines()]
print(texts)

['Cats are the most popular pets in the world', 'Cats have nine lives', 'Cats are very independent animals', 'Cats are excellent hunters', 'Cats are very clean animals', 'Cats can purr to show contentment', 'Cats can hiss to show aggression', 'Cats have a very strong sense of smell', 'Cats can see in the dark', 'Cats are very playful animals']


In [86]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'wouldn', 'before', 'those', 'against', 'over', 'theirs', "aren't", "won't", 'needn', 'by', 'up', 'the', "it's", "that'll", 'didn', 'this', 'from', 'not', 'me', 'i', 'have', "weren't", 'now', 'under', 'shan', 'hers', "you're", 'how', "hasn't", 'any', 'had', 'while', 'both', 'that', 'so', "couldn't", 'should', 'been', 'why', 'its', 'yourself', 'out', 'only', 'an', "don't", 'hasn', 'shouldn', 'doesn', 'yours', 's', 'aren', 'him', 'very', 'couldn', 'down', "needn't", 'she', 'into', 'himself', 'herself', 'isn', 'hadn', 'here', 'doing', 'such', 'no', "wouldn't", 'most', "isn't", "didn't", "wasn't", 'a', 'you', 'being', 'he', "she's", 'or', 'did', 'don', 'has', 'his', 'there', 'nor', 'whom', 'just', 'as', 'then', 'and', 'through', 'haven', "hadn't", 'be', 'each', 'can', 'them', "you'll", 'off', 'they', 'my', 'weren', 'of', 'was', 'having', 't', 'on', 'will', 'o', 've', 'than', 'our', "haven't", 'are', 'y', 'during', "doesn't", 'in', 'mustn', 'some', 'myself', 'm', 'ours', 'own', 'do', 'her'

In [87]:
words_list = []
for text_item in texts:
    words = text_item.split()
    r_words = [word for word in words if word not in stop_words]
    words_list.append(r_words)

In [88]:
def calculate_tf(text):
    word_counts = {}
    total_words = len(text)
    for word in text:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    tf_scores = {word: count / total_words for word, count in word_counts.items()}
    return tf_scores


tf_scores_list = [calculate_tf(text) for text in words_list]
print(tf_scores_list)

[{'Cats': 0.25, 'popular': 0.25, 'pets': 0.25, 'world': 0.25}, {'Cats': 0.3333333333333333, 'nine': 0.3333333333333333, 'lives': 0.3333333333333333}, {'Cats': 0.3333333333333333, 'independent': 0.3333333333333333, 'animals': 0.3333333333333333}, {'Cats': 0.3333333333333333, 'excellent': 0.3333333333333333, 'hunters': 0.3333333333333333}, {'Cats': 0.3333333333333333, 'clean': 0.3333333333333333, 'animals': 0.3333333333333333}, {'Cats': 0.25, 'purr': 0.25, 'show': 0.25, 'contentment': 0.25}, {'Cats': 0.25, 'hiss': 0.25, 'show': 0.25, 'aggression': 0.25}, {'Cats': 0.25, 'strong': 0.25, 'sense': 0.25, 'smell': 0.25}, {'Cats': 0.3333333333333333, 'see': 0.3333333333333333, 'dark': 0.3333333333333333}, {'Cats': 0.3333333333333333, 'playful': 0.3333333333333333, 'animals': 0.3333333333333333}]


In [89]:
import pandas as pd
tf_matrix = pd.DataFrame(tf_scores_list).fillna(0)
tf_matrix.to_csv("tf_scores.csv")
print(tf_matrix)

       Cats  popular  pets  world      nine     lives  independent   animals  \
0  0.250000     0.25  0.25   0.25  0.000000  0.000000     0.000000  0.000000   
1  0.333333     0.00  0.00   0.00  0.333333  0.333333     0.000000  0.000000   
2  0.333333     0.00  0.00   0.00  0.000000  0.000000     0.333333  0.333333   
3  0.333333     0.00  0.00   0.00  0.000000  0.000000     0.000000  0.000000   
4  0.333333     0.00  0.00   0.00  0.000000  0.000000     0.000000  0.333333   
5  0.250000     0.00  0.00   0.00  0.000000  0.000000     0.000000  0.000000   
6  0.250000     0.00  0.00   0.00  0.000000  0.000000     0.000000  0.000000   
7  0.250000     0.00  0.00   0.00  0.000000  0.000000     0.000000  0.000000   
8  0.333333     0.00  0.00   0.00  0.000000  0.000000     0.000000  0.000000   
9  0.333333     0.00  0.00   0.00  0.000000  0.000000     0.000000  0.333333   

   excellent   hunters  ...  show  contentment  hiss  aggression  strong  \
0   0.000000  0.000000  ...  0.00         0

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
filtered_text = [" ".join(lst) for lst in words_list]
X = vectorizer.fit_transform(filtered_text)
tfidf_scores = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_scores.to_csv("tfidf_scores.csv")
print(tfidf_scores)

   aggression   animals      cats    clean  contentment      dark  excellent  \
0     0.00000  0.000000  0.208755  0.00000      0.00000  0.000000   0.000000   
1     0.00000  0.000000  0.252931  0.00000      0.00000  0.000000   0.000000   
2     0.00000  0.572129  0.284415  0.00000      0.00000  0.000000   0.000000   
3     0.00000  0.000000  0.252931  0.00000      0.00000  0.000000   0.684115   
4     0.00000  0.572129  0.284415  0.76927      0.00000  0.000000   0.000000   
5     0.00000  0.000000  0.218645  0.00000      0.59138  0.000000   0.000000   
6     0.59138  0.000000  0.218645  0.00000      0.00000  0.000000   0.000000   
7     0.00000  0.000000  0.208755  0.00000      0.00000  0.000000   0.000000   
8     0.00000  0.000000  0.252931  0.00000      0.00000  0.684115   0.000000   
9     0.00000  0.572129  0.284415  0.00000      0.00000  0.000000   0.000000   

      hiss   hunters  independent  ...     pets  playful  popular     purr  \
0  0.00000  0.000000      0.00000  ...  0

In [91]:
from nltk.util import bigrams
from collections import Counter
import math
texts

['Cats are the most popular pets in the world',
 'Cats have nine lives',
 'Cats are very independent animals',
 'Cats are excellent hunters',
 'Cats are very clean animals',
 'Cats can purr to show contentment',
 'Cats can hiss to show aggression',
 'Cats have a very strong sense of smell',
 'Cats can see in the dark',
 'Cats are very playful animals']

In [107]:
bigrams_list = [list(bigrams(text.split())) for text in texts]
bigram_counts = Counter(bigram for bigrams in bigrams_list for bigram in bigrams)
unigram_counts = Counter(word for words in texts for word in words.split())
print(bigrams_list)
print(bigram_counts)
print(unigram_counts)

[[('Cats', 'are'), ('are', 'the'), ('the', 'most'), ('most', 'popular'), ('popular', 'pets'), ('pets', 'in'), ('in', 'the'), ('the', 'world')], [('Cats', 'have'), ('have', 'nine'), ('nine', 'lives')], [('Cats', 'are'), ('are', 'very'), ('very', 'independent'), ('independent', 'animals')], [('Cats', 'are'), ('are', 'excellent'), ('excellent', 'hunters')], [('Cats', 'are'), ('are', 'very'), ('very', 'clean'), ('clean', 'animals')], [('Cats', 'can'), ('can', 'purr'), ('purr', 'to'), ('to', 'show'), ('show', 'contentment')], [('Cats', 'can'), ('can', 'hiss'), ('hiss', 'to'), ('to', 'show'), ('show', 'aggression')], [('Cats', 'have'), ('have', 'a'), ('a', 'very'), ('very', 'strong'), ('strong', 'sense'), ('sense', 'of'), ('of', 'smell')], [('Cats', 'can'), ('can', 'see'), ('see', 'in'), ('in', 'the'), ('the', 'dark')], [('Cats', 'are'), ('are', 'very'), ('very', 'playful'), ('playful', 'animals')]]
Counter({('Cats', 'are'): 5, ('are', 'very'): 3, ('Cats', 'can'): 3, ('in', 'the'): 2, ('Cats

In [112]:
def calculate_pmi(bigram, unigram_counts):
    p_bigram = bigram_counts[bigram] / len(bigrams_list)
    p_word1 = unigram_counts[bigram[0]] / len(bigrams_list)
    p_word2 = unigram_counts[bigram[1]] / len(bigrams_list)
    return math.log2(p_bigram / (p_word1 * p_word2))


pmi_scores = [(bigram, calculate_pmi(bigram, unigram_counts)) for bigram in bigram_counts]
pmi_scores.sort(key=lambda x: x[1], reverse=True)
print("Top 5 PMI Scores:")
print(pmi_scores[:5])

Top 5 PMI Scores:
[(('most', 'popular'), 3.321928094887362), (('popular', 'pets'), 3.321928094887362), (('nine', 'lives'), 3.321928094887362), (('excellent', 'hunters'), 3.321928094887362), (('strong', 'sense'), 3.321928094887362)]
