In [None]:
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import numpy as np
import warnings
import sys
import re
warnings.filterwarnings("ignore")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy
from nltk.util import ngrams
from nltk.lm import Vocabulary
from nltk.lm.preprocessing import pad_both_ends

In [None]:
# Read the CNBC headlines CSV into the working DataFrame `df` and preview it.
# NOTE(review): this is an absolute path (it looks like a Colab upload
# location); the notebook will not run elsewhere unless it is edited.
file_path = '/content/cnbc_headlines.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Headlines,Time,Description
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying..."
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin..."
2,,,
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be..."


In [None]:
def clean_text(text):
    """Normalise a raw headline into lowercase alphanumeric words.

    Non-string values (for example the NaN of an empty row) are returned
    unchanged. For strings the steps are, in order: remove anything shaped
    like an ``<...>`` tag, lowercase, delete every character that is not an
    ASCII letter, a digit or whitespace, then collapse each whitespace run to
    a single space and trim both ends.
    """
    if not isinstance(text, str):
        return text

    without_tags = re.sub(r'<.*?>', '', text)
    lowered = without_tags.lower()
    # Punctuation is deleted, not replaced by a space, so "cramer's" becomes
    # "cramers" and "covid-19" becomes "covid19".
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', alnum_only).strip()

In [None]:
# Add a normalised copy of every headline. clean_text returns non-string
# values unchanged, so missing headlines stay missing in the new column.
df['cleaned_headline'] = df['Headlines'].apply(clean_text)

In [None]:
df.head()

Unnamed: 0,Headlines,Time,Description,cleaned_headline
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying...",jim cramer a better way to invest in the covid...
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin...",cramers lightning round i would own teradyne
2,,,,
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co...",cramers week ahead big week for earnings even ...
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be...",iq capital ceo keith bliss says tech and healt...


In [None]:
df.shape

(3080, 4)

In [None]:
df.isnull().sum()

Headlines           280
Time                280
Description         280
cleaned_headline    280
dtype: int64

In [None]:
# Drop every row that has a missing value in any column, modifying `df` in
# place. The index is not reset, so labels of removed rows leave gaps.
df.dropna(inplace=True)

In [None]:
df.shape

(2800, 4)

In [None]:
df.duplicated().sum()

0

In [None]:
#df.drop_duplicates(inplace=True)

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# word_tokenize accepts the headline string directly, so it is handed to
# apply as-is to split each cleaned headline into a list of word tokens.
df['tokenized_headlines'] = df['cleaned_headline'].apply(word_tokenize)



In [None]:
df[['Headlines', 'tokenized_headlines']].head()

Unnamed: 0,Headlines,tokenized_headlines
0,Jim Cramer: A better way to invest in the Covi...,"[jim, cramer, a, better, way, to, invest, in, ..."
1,Cramer's lightning round: I would own Teradyne,"[cramers, lightning, round, i, would, own, ter..."
3,"Cramer's week ahead: Big week for earnings, ev...","[cramers, week, ahead, big, week, for, earning..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"[iq, capital, ceo, keith, bliss, says, tech, a..."
5,Wall Street delivered the 'kind of pullback I'...,"[wall, street, delivered, the, kind, of, pullb..."


In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Stop-word sets already loaded from the NLTK corpus, keyed by language name.
_STOP_WORD_CACHE = {}


def remove_stop_words(tokens, stop_words=None, language='english'):
    """Return ``tokens`` with stop words removed.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. Their order and original casing are preserved.
    stop_words : collection of str, optional
        Words to drop. Each token is lowercased before the membership test,
        so entries should be lowercase. When omitted, the NLTK stop-word
        list for ``language`` is used.
    language : str, default 'english'
        NLTK stop-word list to load when ``stop_words`` is not supplied.

    Returns
    -------
    list of str
        The tokens whose lowercase form is not in the stop-word collection.
    """
    if stop_words is None:
        # The corpus list is identical for every row, so build the set once
        # per language rather than on every call made by ``Series.apply``.
        if language not in _STOP_WORD_CACHE:
            _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
        stop_words = _STOP_WORD_CACHE[language]
    return [word for word in tokens if word.lower() not in stop_words]

In [None]:
# Filter stop words out of every tokenized headline, then preview the result
# next to the unfiltered tokens.
df['cleaned_tokens'] = df['tokenized_headlines'].apply(remove_stop_words)
df[['tokenized_headlines', 'cleaned_tokens']].head()

Unnamed: 0,tokenized_headlines,cleaned_tokens
0,"[jim, cramer, a, better, way, to, invest, in, ...","[jim, cramer, better, way, invest, covid19, va..."
1,"[cramers, lightning, round, i, would, own, ter...","[cramers, lightning, round, would, teradyne]"
3,"[cramers, week, ahead, big, week, for, earning...","[cramers, week, ahead, big, week, earnings, ev..."
4,"[iq, capital, ceo, keith, bliss, says, tech, a...","[iq, capital, ceo, keith, bliss, says, tech, h..."
5,"[wall, street, delivered, the, kind, of, pullb...","[wall, street, delivered, kind, pullback, ive,..."


In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def perform_lemmatization(tokens):
    """Return the WordNet lemma of every token, in the original order.

    Each word is passed to ``WordNetLemmatizer.lemmatize`` without a
    part-of-speech argument, so the lemmatizer's default is used.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

def perform_stemming(tokens):
    """Return the Porter stem of every token, in the original order."""
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

In [None]:
# Derive both normalised forms from the same stop-word-filtered tokens so the
# lemmatizer and stemmer results can be compared row by row.
df['lemmatized_tokens'] = df['cleaned_tokens'].apply(perform_lemmatization)
df['stemmed_tokens'] = df['cleaned_tokens'].apply(perform_stemming)

In [None]:
df[['cleaned_tokens', 'lemmatized_tokens', 'stemmed_tokens']]

Unnamed: 0,cleaned_tokens,lemmatized_tokens,stemmed_tokens
0,"[jim, cramer, better, way, invest, covid19, va...","[jim, cramer, better, way, invest, covid19, va...","[jim, cramer, better, way, invest, covid19, va..."
1,"[cramers, lightning, round, would, teradyne]","[cramers, lightning, round, would, teradyne]","[cramer, lightn, round, would, teradyn]"
3,"[cramers, week, ahead, big, week, earnings, ev...","[cramers, week, ahead, big, week, earnings, ev...","[cramer, week, ahead, big, week, earn, even, b..."
4,"[iq, capital, ceo, keith, bliss, says, tech, h...","[iq, capital, ceo, keith, bliss, say, tech, he...","[iq, capit, ceo, keith, bliss, say, tech, heal..."
5,"[wall, street, delivered, kind, pullback, ive,...","[wall, street, delivered, kind, pullback, ive,...","[wall, street, deliv, kind, pullback, ive, wai..."
...,...,...,...
3075,"[markets, lack, christmas, cheer]","[market, lack, christmas, cheer]","[market, lack, christma, cheer]"
3076,"[cramer, remix, biggest, mistake, make, taxes,...","[cramer, remix, biggest, mistake, make, tax, s...","[cramer, remix, biggest, mistak, make, tax, st..."
3077,"[cramer, says, owning, many, stocks, little, c...","[cramer, say, owning, many, stock, little, cas...","[cramer, say, own, mani, stock, littl, cash, s..."
3078,"[cramer, helped, investors, 2010, flash, crash...","[cramer, helped, investor, 2010, flash, crash,...","[cramer, help, investor, 2010, flash, crash, f..."


In [None]:
df

Unnamed: 0,Headlines,Time,Description,cleaned_headline,tokenized_headlines,cleaned_tokens,lemmatized_tokens,stemmed_tokens
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying...",jim cramer a better way to invest in the covid...,"[jim, cramer, a, better, way, to, invest, in, ...","[jim, cramer, better, way, invest, covid19, va...","[jim, cramer, better, way, invest, covid19, va...","[jim, cramer, better, way, invest, covid19, va..."
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin...",cramers lightning round i would own teradyne,"[cramers, lightning, round, i, would, own, ter...","[cramers, lightning, round, would, teradyne]","[cramers, lightning, round, would, teradyne]","[cramer, lightn, round, would, teradyn]"
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co...",cramers week ahead big week for earnings even ...,"[cramers, week, ahead, big, week, for, earning...","[cramers, week, ahead, big, week, earnings, ev...","[cramers, week, ahead, big, week, earnings, ev...","[cramer, week, ahead, big, week, earn, even, b..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be...",iq capital ceo keith bliss says tech and healt...,"[iq, capital, ceo, keith, bliss, says, tech, a...","[iq, capital, ceo, keith, bliss, says, tech, h...","[iq, capital, ceo, keith, bliss, say, tech, he...","[iq, capit, ceo, keith, bliss, say, tech, heal..."
5,Wall Street delivered the 'kind of pullback I'...,"7:36 PM ET Thu, 16 July 2020","""Look for the stocks of high-quality companies...",wall street delivered the kind of pullback ive...,"[wall, street, delivered, the, kind, of, pullb...","[wall, street, delivered, kind, pullback, ive,...","[wall, street, delivered, kind, pullback, ive,...","[wall, street, deliv, kind, pullback, ive, wai..."
...,...,...,...,...,...,...,...,...
3075,Markets lack Christmas cheer,"10:15 AM ET Tue, 26 Dec 2017","According to Kensho, here's how markets have f...",markets lack christmas cheer,"[markets, lack, christmas, cheer]","[markets, lack, christmas, cheer]","[market, lack, christmas, cheer]","[market, lack, christma, cheer]"
3076,Cramer Remix: The biggest mistake you can make...,"11:12 AM ET Thu, 20 Sept 2018",Jim Cramer revealed his top rule when it comes...,cramer remix the biggest mistake you can make ...,"[cramer, remix, the, biggest, mistake, you, ca...","[cramer, remix, biggest, mistake, make, taxes,...","[cramer, remix, biggest, mistake, make, tax, s...","[cramer, remix, biggest, mistak, make, tax, st..."
3077,Cramer says owning too many stocks and too lit...,"7:07 PM ET Fri, 22 Dec 2017",Jim Cramer broke down why owning fewer stocks ...,cramer says owning too many stocks and too lit...,"[cramer, says, owning, too, many, stocks, and,...","[cramer, says, owning, many, stocks, little, c...","[cramer, say, owning, many, stock, little, cas...","[cramer, say, own, mani, stock, littl, cash, s..."
3078,Cramer: I helped investors through the 2010 fl...,"7:07 PM ET Fri, 22 Dec 2017","Jim Cramer built on his ""nobody ever made a di...",cramer i helped investors through the 2010 fla...,"[cramer, i, helped, investors, through, the, 2...","[cramer, helped, investors, 2010, flash, crash...","[cramer, helped, investor, 2010, flash, crash,...","[cramer, help, investor, 2010, flash, crash, f..."


In [None]:
# Load the "en_core_web_sm" spaCy pipeline once; the functions in this
# notebook that call `nlp` all share this module-level object.
nlp = spacy.load("en_core_web_sm")
def label_entities(text):
    """Run the module-level spaCy pipeline ``nlp`` on ``text`` and list its entities.

    Returns a list of ``(entity_text, entity_label)`` tuples, one for each
    span in ``doc.ents``; the list is empty when no entity is found.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

In [None]:
# Run named-entity recognition on the cleaned headline text of every row.
# NOTE(review): 'cleaned_headline' is already lowercased and stripped of
# punctuation, which may hurt entity detection; consider comparing against
# the raw 'Headlines' column.
df['entities'] = df['cleaned_headline'].apply(label_entities)

In [None]:
df[['cleaned_headline', 'entities']]

Unnamed: 0,cleaned_headline,entities
0,jim cramer a better way to invest in the covid...,"[(jim cramer, PERSON), (covid19, PERSON)]"
1,cramers lightning round i would own teradyne,[]
3,cramers week ahead big week for earnings even ...,"[(big week, DATE), (bigger week, DATE)]"
4,iq capital ceo keith bliss says tech and healt...,"[(healthcare, ORG)]"
5,wall street delivered the kind of pullback ive...,"[(jim cramer, PERSON)]"
...,...,...
3075,markets lack christmas cheer,"[(christmas, DATE)]"
3076,cramer remix the biggest mistake you can make ...,[]
3077,cramer says owning too many stocks and too lit...,[]
3078,cramer i helped investors through the 2010 fla...,"[(2010, DATE), (one, CARDINAL)]"


In [None]:
# Reference note only: the snippets below sit inside a string literal, so none
# of these spacy.load calls is executed. Because the string is the cell's last
# expression, the notebook echoes it back as the cell output.
""" Load the small English model with basic NER
nlp = spacy.load("en_core_web_sm")

# Load the medium-sized English model with basic NER and word vectors
nlp = spacy.load("en_core_web_md")

# Load the large English model with basic NER and word vectors
nlp = spacy.load("en_core_web_lg")

# Load a multilingual model (e.g., Spanish)
nlp = spacy.load("es_core_news_sm")

# Load a domain-specific model (e.g., medical)
nlp = spacy.load("en_core_med7_lg")"""

' Load the small English model with basic NER\nnlp = spacy.load("en_core_web_sm")\n\n# Load the medium-sized English model with basic NER and word vectors\nnlp = spacy.load("en_core_web_md")\n\n# Load the large English model with basic NER and word vectors\nnlp = spacy.load("en_core_web_lg")\n\n# Load a multilingual model (e.g., Spanish)\nnlp = spacy.load("es_core_news_sm")\n\n# Load a domain-specific model (e.g., medical)\nnlp = spacy.load("en_core_med7_lg")'

In [None]:
def pos_tagging(tokens):
    """Tag a list of tokens with ``nltk.pos_tag`` and return its result.

    NOTE: later cells in this notebook define ``pos_tagging`` again with a
    different implementation. Re-running a caller after those cells will
    silently use the newer definition instead of this one.
    """
    return nltk.pos_tag(tokens)


In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Tag the unfiltered tokens (stop words still present).
# NOTE: `pos_tagging` is redefined in later cells; to reproduce 'pos_tags'
# this line must run while the nltk.pos_tag-based definition is the active one.
df['pos_tags'] = df['tokenized_headlines'].apply(pos_tagging)


In [None]:
df[['tokenized_headlines', 'pos_tags']].head()

Unnamed: 0,tokenized_headlines,pos_tags
0,"[jim, cramer, a, better, way, to, invest, in, ...","[(jim, NN), (cramer, VBZ), (a, DT), (better, J..."
1,"[cramers, lightning, round, i, would, own, ter...","[(cramers, NNS), (lightning, VBG), (round, NN)..."
3,"[cramers, week, ahead, big, week, for, earning...","[(cramers, NNS), (week, NN), (ahead, RB), (big..."
4,"[iq, capital, ceo, keith, bliss, says, tech, a...","[(iq, JJ), (capital, NN), (ceo, NN), (keith, N..."
5,"[wall, street, delivered, the, kind, of, pullb...","[(wall, JJ), (street, NN), (delivered, VBD), (..."


In [None]:
def pos_tagging(tokens):
    """Tag tokens with the spaCy pipeline ``nlp``; returns ``(text, pos_)`` pairs.

    The tokens are joined with single spaces and the resulting string is
    processed by ``nlp``, so the pairs describe spaCy's own tokens.
    NOTE(review): spaCy may split the joined string differently from the
    input list, so the output is not guaranteed to align one-to-one with
    ``tokens`` - confirm if alignment matters.

    NOTE: this replaces the earlier ``pos_tagging`` defined in this notebook.
    """
    joined = ' '.join(tokens)
    return [(tok.text, tok.pos_) for tok in nlp(joined)]

# Tag the same unfiltered tokens with the pos_tagging version defined just
# above and keep the result in its own column for comparison.
df['pos_tags_1'] = df['tokenized_headlines'].apply(pos_tagging)
df[['tokenized_headlines', 'pos_tags_1']].head()

Unnamed: 0,tokenized_headlines,pos_tags_1
0,"[jim, cramer, a, better, way, to, invest, in, ...","[(jim, PROPN), (cramer, PROPN), (a, DET), (bet..."
1,"[cramers, lightning, round, i, would, own, ter...","[(cramers, NOUN), (lightning, NOUN), (round, A..."
3,"[cramers, week, ahead, big, week, for, earning...","[(cramers, NOUN), (week, VERB), (ahead, ADV), ..."
4,"[iq, capital, ceo, keith, bliss, says, tech, a...","[(iq, PROPN), (capital, PROPN), (ceo, PROPN), ..."
5,"[wall, street, delivered, the, kind, of, pullb...","[(wall, PROPN), (street, PROPN), (delivered, V..."


In [None]:
def pos_tagging(text):
    """Tag a raw text string with spaCy; returns ``(text, pos_)`` pairs.

    The string is first split with ``nltk.word_tokenize``, the pieces are
    re-joined with single spaces, and that string is processed by the
    module-level ``nlp`` pipeline.

    NOTE: this replaces the earlier ``pos_tagging`` definitions in this
    notebook and takes a string rather than a token list.
    """
    rejoined = ' '.join(nltk.word_tokenize(text))
    return [(tok.text, tok.pos_) for tok in nlp(rejoined)]

# Tag the original headline text (not the cleaned column), so case and
# punctuation are still present when the tagger sees it.
df['pos_tags_2'] = df['Headlines'].apply(pos_tagging)
df[['Headlines', 'pos_tags_2']].head()

Unnamed: 0,Headlines,pos_tags_2
0,Jim Cramer: A better way to invest in the Covi...,"[(Jim, PROPN), (Cramer, PROPN), (:, PUNCT), (A..."
1,Cramer's lightning round: I would own Teradyne,"[(Cramer, PROPN), ('s, PART), (lightning, NOUN..."
3,"Cramer's week ahead: Big week for earnings, ev...","[(Cramer, PROPN), ('s, PART), (week, NOUN), (a..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"[(IQ, PROPN), (Capital, PROPN), (CEO, PROPN), ..."
5,Wall Street delivered the 'kind of pullback I'...,"[(Wall, PROPN), (Street, PROPN), (delivered, V..."


In [None]:
df

Unnamed: 0,Headlines,Time,Description,cleaned_headline,tokenized_headlines,cleaned_tokens,lemmatized_tokens,stemmed_tokens,entities,pos_tags,pos_tags_1,pos_tags_2
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying...",jim cramer a better way to invest in the covid...,"[jim, cramer, a, better, way, to, invest, in, ...","[jim, cramer, better, way, invest, covid19, va...","[jim, cramer, better, way, invest, covid19, va...","[jim, cramer, better, way, invest, covid19, va...","[(jim cramer, PERSON), (covid19, PERSON)]","[(jim, NN), (cramer, VBZ), (a, DT), (better, J...","[(jim, PROPN), (cramer, PROPN), (a, DET), (bet...","[(Jim, PROPN), (Cramer, PROPN), (:, PUNCT), (A..."
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin...",cramers lightning round i would own teradyne,"[cramers, lightning, round, i, would, own, ter...","[cramers, lightning, round, would, teradyne]","[cramers, lightning, round, would, teradyne]","[cramer, lightn, round, would, teradyn]",[],"[(cramers, NNS), (lightning, VBG), (round, NN)...","[(cramers, NOUN), (lightning, NOUN), (round, A...","[(Cramer, PROPN), ('s, PART), (lightning, NOUN..."
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co...",cramers week ahead big week for earnings even ...,"[cramers, week, ahead, big, week, for, earning...","[cramers, week, ahead, big, week, earnings, ev...","[cramers, week, ahead, big, week, earnings, ev...","[cramer, week, ahead, big, week, earn, even, b...","[(big week, DATE), (bigger week, DATE)]","[(cramers, NNS), (week, NN), (ahead, RB), (big...","[(cramers, NOUN), (week, VERB), (ahead, ADV), ...","[(Cramer, PROPN), ('s, PART), (week, NOUN), (a..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be...",iq capital ceo keith bliss says tech and healt...,"[iq, capital, ceo, keith, bliss, says, tech, a...","[iq, capital, ceo, keith, bliss, says, tech, h...","[iq, capital, ceo, keith, bliss, say, tech, he...","[iq, capit, ceo, keith, bliss, say, tech, heal...","[(healthcare, ORG)]","[(iq, JJ), (capital, NN), (ceo, NN), (keith, N...","[(iq, PROPN), (capital, PROPN), (ceo, PROPN), ...","[(IQ, PROPN), (Capital, PROPN), (CEO, PROPN), ..."
5,Wall Street delivered the 'kind of pullback I'...,"7:36 PM ET Thu, 16 July 2020","""Look for the stocks of high-quality companies...",wall street delivered the kind of pullback ive...,"[wall, street, delivered, the, kind, of, pullb...","[wall, street, delivered, kind, pullback, ive,...","[wall, street, delivered, kind, pullback, ive,...","[wall, street, deliv, kind, pullback, ive, wai...","[(jim cramer, PERSON)]","[(wall, JJ), (street, NN), (delivered, VBD), (...","[(wall, PROPN), (street, PROPN), (delivered, V...","[(Wall, PROPN), (Street, PROPN), (delivered, V..."
...,...,...,...,...,...,...,...,...,...,...,...,...
3075,Markets lack Christmas cheer,"10:15 AM ET Tue, 26 Dec 2017","According to Kensho, here's how markets have f...",markets lack christmas cheer,"[markets, lack, christmas, cheer]","[markets, lack, christmas, cheer]","[market, lack, christmas, cheer]","[market, lack, christma, cheer]","[(christmas, DATE)]","[(markets, NNS), (lack, VBP), (christmas, NNS)...","[(markets, NOUN), (lack, VERB), (christmas, PR...","[(Markets, NOUN), (lack, VERB), (Christmas, PR..."
3076,Cramer Remix: The biggest mistake you can make...,"11:12 AM ET Thu, 20 Sept 2018",Jim Cramer revealed his top rule when it comes...,cramer remix the biggest mistake you can make ...,"[cramer, remix, the, biggest, mistake, you, ca...","[cramer, remix, biggest, mistake, make, taxes,...","[cramer, remix, biggest, mistake, make, tax, s...","[cramer, remix, biggest, mistak, make, tax, st...",[],"[(cramer, NN), (remix, VBZ), (the, DT), (bigge...","[(cramer, PROPN), (remix, VERB), (the, DET), (...","[(Cramer, PROPN), (Remix, PROPN), (:, PUNCT), ..."
3077,Cramer says owning too many stocks and too lit...,"7:07 PM ET Fri, 22 Dec 2017",Jim Cramer broke down why owning fewer stocks ...,cramer says owning too many stocks and too lit...,"[cramer, says, owning, too, many, stocks, and,...","[cramer, says, owning, many, stocks, little, c...","[cramer, say, owning, many, stock, little, cas...","[cramer, say, own, mani, stock, littl, cash, s...",[],"[(cramer, NN), (says, VBZ), (owning, VBG), (to...","[(cramer, PROPN), (says, VERB), (owning, VERB)...","[(Cramer, PROPN), (says, VERB), (owning, VERB)..."
3078,Cramer: I helped investors through the 2010 fl...,"7:07 PM ET Fri, 22 Dec 2017","Jim Cramer built on his ""nobody ever made a di...",cramer i helped investors through the 2010 fla...,"[cramer, i, helped, investors, through, the, 2...","[cramer, helped, investors, 2010, flash, crash...","[cramer, helped, investor, 2010, flash, crash,...","[cramer, help, investor, 2010, flash, crash, f...","[(2010, DATE), (one, CARDINAL)]","[(cramer, NN), (i, NN), (helped, VBD), (invest...","[(cramer, PROPN), (i, PRON), (helped, VERB), (...","[(Cramer, NOUN), (:, PUNCT), (I, PRON), (helpe..."


In [None]:
def generate_word_ngrams(tokens, n):
    """Build the word n-grams of ``tokens`` as space-joined strings.

    Every tuple produced by ``ngrams(tokens, n)`` is joined with a single
    space, and the resulting strings are returned as a list in the order
    they were produced.
    """
    return [' '.join(gram_tuple) for gram_tuple in ngrams(tokens, n)]


In [None]:
# Size of the n-grams to build (2 = bigrams).
n=2
# Build the n-grams from the stop-word-filtered tokens of each headline.
df['word_ngrams'] = df['cleaned_tokens'].apply(lambda x: generate_word_ngrams(x, n))

In [None]:
df[['cleaned_tokens', 'word_ngrams']].head()

Unnamed: 0,cleaned_tokens,word_ngrams
0,"[jim, cramer, better, way, invest, covid19, va...","[jim cramer, cramer better, better way, way in..."
1,"[cramers, lightning, round, would, teradyne]","[cramers lightning, lightning round, round wou..."
3,"[cramers, week, ahead, big, week, earnings, ev...","[cramers week, week ahead, ahead big, big week..."
4,"[iq, capital, ceo, keith, bliss, says, tech, h...","[iq capital, capital ceo, ceo keith, keith bli..."
5,"[wall, street, delivered, kind, pullback, ive,...","[wall street, street delivered, delivered kind..."


In [None]:
# Convert the 'word_ngrams' column to a plain Python list, one entry per row.
word_ngrams_data = df['word_ngrams'].tolist()

In [None]:
from nltk.lm import MLE
from nltk.util import ngrams
from nltk.lm.preprocessing import pad_both_ends

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import math


In [None]:
# Unigram perplexity of the headline corpus under its own maximum-likelihood
# word distribution.
corpus = [' '.join(tokens) for tokens in df['cleaned_tokens']]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

vocabulary = vectorizer.get_feature_names_out()
# Sum the sparse count matrix directly instead of materialising the dense
# (documents x vocabulary) array first.
word_counts = np.asarray(X.sum(axis=0)).ravel()
total_words = word_counts.sum()

# For a unigram model p(w) = count(w) / N evaluated on the same corpus:
#   perplexity = (prod_i 1 / p(w_i)) ** (1 / N)
#              = prod_w (1 / p(w)) ** p(w)
#              = exp(-sum_w p(w) * log p(w))
# The 1 / N normalisation is therefore already inside the exponent p(w);
# taking a further N-th root of the result would normalise twice and push
# the value towards 1. The sum is taken in log space to avoid multiplying
# thousands of factors together.
word_probabilities = word_counts / total_words
entropy = -np.sum(word_probabilities * np.log(word_probabilities))
perplexity = float(np.exp(entropy))

perplexity

1.0002795460449323

In [None]:
from collections import Counter

# Tally how often each n-gram string occurs across all rows of the
# 'word_ngrams' column (each row holds a list of n-gram strings).
bigram_counts = Counter()
for bigram_list in df['word_ngrams']:
    bigram_counts.update(bigram_list)

In [None]:
bigram_counts

Counter({'jim cramer': 480,
         'cramer better': 1,
         'better way': 2,
         'way invest': 1,
         'invest covid19': 1,
         'covid19 vaccine': 1,
         'vaccine gold': 1,
         'gold rush': 1,
         'cramers lightning': 474,
         'lightning round': 474,
         'round would': 6,
         'would teradyne': 1,
         'cramers week': 38,
         'week ahead': 41,
         'ahead big': 2,
         'big week': 1,
         'week earnings': 4,
         'earnings even': 1,
         'even bigger': 1,
         'bigger week': 1,
         'week vaccines': 1,
         'iq capital': 1,
         'capital ceo': 1,
         'ceo keith': 1,
         'keith bliss': 1,
         'bliss says': 1,
         'says tech': 4,
         'tech healthcare': 1,
         'healthcare rally': 1,
         'wall street': 46,
         'street delivered': 1,
         'delivered kind': 1,
         'kind pullback': 1,
         'pullback ive': 1,
         'ive waiting': 1,
         'wai