<a href="https://colab.research.google.com/github/Siqi-SN/2025-Fall---Language-Analytics/blob/main/Dict/MOOC_tf_gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Term Frequency



In [None]:
!pip install gensim # the package is not available in colab
import gensim



In [None]:
from google.colab import drive
import pandas as pd
# Mount Google Drive inside the Colab filesystem.
# The call below will prompt for authorization.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load the dataframe and take a look at it to make sure everything looks good.

In [None]:
# Location of the review dataset on the mounted Google Drive.
REVIEWS_CSV = "/content/drive/MyDrive/Colab Notebooks/Data/MOOC_reviews/balanced_review_final.csv"

# Read the CSV as UTF-8 and keep only the 'reviews' (text) and 'label' columns.
# If decoding ever fails for a file saved on Windows, "cp1252" is the usual
# alternative encoding to try.
df = pd.read_csv(REVIEWS_CSV, encoding="utf-8")[['reviews', 'label']]

# Sanity check: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19461 entries, 0 to 19460
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   reviews  19461 non-null  object
 1   label    19461 non-null  object
dtypes: object(2)
memory usage: 304.2+ KB


In [None]:
# Count how many reviews fall under each value of the outcome variable.
# A serious class imbalance would be a problem for later analysis.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
NEG,9744
POS,9717


In [None]:
import spacy

# Name of the spaCy pipeline to load.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

# preprocess words
def get_words(docs):
    """Return the lowercased lemmas of the content words in one parsed text.

    Args:
        docs: An iterable of spaCy-style tokens (in this notebook, a single
            parsed ``Doc``). Each token must expose ``is_punct``, ``is_space``,
            ``is_stop``, ``is_alpha`` and ``lemma_``.

    Returns:
        A list of lowercase lemma strings, in token order, for every token
        that is alphabetic and is not punctuation, whitespace or a stopword.
    """
    def is_content_word(token):
        # A token is rejected as soon as any exclusion flag is set.
        if token.is_punct or token.is_space or token.is_stop:
            return False
        return token.is_alpha

    return [token.lemma_.lower() for token in docs if is_content_word(token)]

# get the lemmas, reduce the matrix, aggregate the semantic words


In [None]:
# Run every review through the spaCy pipeline and keep the parsed documents
# in a list so the later cells can iterate over them more than once.
review_texts = df['reviews']
mooc_docs = list(nlp.pipe(review_texts))

In [None]:
# Reduce each parsed review to its list of filtered, lowercased lemmas.
processed_docs = list(map(get_words, mooc_docs))

# Preview the first two processed reviews.
for i, doc in enumerate(processed_docs[:2]):
    print(f"Doc {i}: {doc}")

Doc 0: ['great', 'course', 'learn', 'ton', 'presenter', 'great', 'job', 'reason', 'course', 'star', 'lot', 'topic', 'cover', 'detail', 'help', 'understand', 'lot', 'topic', 'give', 'high', 'level', 'overview', 'research', 'topic', 'website', 'understand', 'great', 'course']
Doc 1: ['course', 'lay', 'need', 'familiar', 'class', 'service', 'fundamentals', 'gcp', 'infrastructure', 'provide', 'lab', 'help', 'navigate', 'adjust', 'google', 'interface', 'focus', 'responsive', 'pod', 'time', 'work', 'internal', 'question', 'give', 'sense', 'comfort', 'critical', 'piece']


## Filter Tokens by POS Tags

In [None]:
def _get_pos_words(docs, pos_tags):
    """Return lowercase lemmas of the content tokens whose POS is in ``pos_tags``.

    Shared implementation behind the ``get_*_words`` helpers, which previously
    repeated this filter verbatim and differed only in the tag they matched.

    Args:
        docs: An iterable of spaCy-style tokens (in this notebook, a single
            parsed ``Doc``). Each token must expose ``is_punct``, ``is_space``,
            ``is_stop``, ``is_alpha``, ``pos_`` and ``lemma_``.
        pos_tags: Collection of coarse POS tag strings to keep (e.g. ``{'ADJ'}``).

    Returns:
        A list of lowercase lemma strings, in token order.
    """
    return [
        token.lemma_.lower()
        for token in docs
        if not token.is_punct
        and not token.is_space
        and not token.is_stop
        and token.is_alpha
        and token.pos_ in pos_tags
    ]


def get_adj_words(docs):
    """Return the lowercase lemmas of the non-stopword adjectives (``ADJ``) in ``docs``."""
    return _get_pos_words(docs, ('ADJ',))


def get_adv_words(docs):
    """Return the lowercase lemmas of the non-stopword adverbs (``ADV``) in ``docs``."""
    return _get_pos_words(docs, ('ADV',))


def get_verb_words(docs):
    """Return the lowercase lemmas of the non-stopword verbs (``VERB``) in ``docs``."""
    return _get_pos_words(docs, ('VERB',))

# Build one lemma list per review for each part of speech of interest.
adj_docs = list(map(get_adj_words, mooc_docs))
adv_docs = list(map(get_adv_words, mooc_docs))
verb_docs = list(map(get_verb_words, mooc_docs))


In [None]:
# Print the first four verb lists to verify the POS filtering.
for i, doc in enumerate(verb_docs[:4]):
    print(f"Doc {i}: {doc}")

Doc 0: ['learn', 'cover', 'help', 'understand', 'give', 'research', 'understand']
Doc 1: ['lay', 'need', 'provide', 'help', 'navigate', 'adjust', 'focus', 'work', 'give']
Doc 2: ['cod', 'spend', 'lay', 'learn', 'cod', 'wait', 'provide', 'summarize', 'cover', 'provide', 'cod']
Doc 3: ['find', 'suppose', 'read', 'go']


## ADJ

In [None]:
import gensim

# Start from an empty gensim Dictionary; it will hold the adjective vocabulary.
adj_dictionary = gensim.corpora.Dictionary()

# Convert each adjective list to bag-of-words form. allow_update=True lets
# doc2bow add previously unseen tokens to the dictionary as it goes.
adj_bow_corpus = []
for adj_tokens in adj_docs:
    adj_bow_corpus.append(adj_dictionary.doc2bow(adj_tokens, allow_update=True))

print(f'number of texts: {len(adj_bow_corpus)}')
print(f'number of words: {len(adj_dictionary)}')

# Peek at the first two documents as [token, count] pairs.
for bow in adj_bow_corpus[:2]:
    print([[adj_dictionary[token_id], freq] for token_id, freq in bow])

number of texts: 19461
number of words: 3579
[['great', 3], ['high', 1]]
[['critical', 1], ['familiar', 1], ['internal', 1], ['responsive', 1]]


In [None]:
import numpy as np
import pandas as pd

# Vocabulary tokens in dictionary order; one DataFrame column per token.
pos_vocab = pd.Series(adj_dictionary.token2id).index

print(f'how many words: {len(pos_vocab)}')

# One row per document.
num_docs_pos = len(adj_bow_corpus)
index_pos = range(num_docs_pos)
print(f'how many documents: {num_docs_pos}')

# Map each gensim token id to the position of its column in pos_vocab.
column_of_id = {token_id: col for col, token_id in enumerate(adj_dictionary.token2id.values())}

# Fill a dense count matrix directly in NumPy: this avoids one slow `.loc`
# write per (document, token) pair. float32 is used instead of float16, which
# triggered overflow warnings when pandas rendered the frame and lost precision
# in the normalised frequencies.
bow_counts = np.zeros((num_docs_pos, len(pos_vocab)), dtype=np.float32)
for row, bow in enumerate(adj_bow_corpus):
    for token_id, freq in bow:
        bow_counts[row, column_of_id[token_id]] = freq

# Wrap the matrix in a DataFrame with documents as rows and tokens as columns.
pos_bow_df = pd.DataFrame(data=bow_counts, index=index_pos, columns=pos_vocab)

# Display the DataFrame head
display(pos_bow_df.head())

how many words: 3579
how many documents: 19461


  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,great,high,critical,familiar,internal,responsive,decent,dirty,good,introductory,...,heinous,jelly,bt,uninstall,cardinal,overcooked,risky,lecturersno,pseudo,invasive
0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Term frequency: divide every count by the total number of counted tokens in
# its document (row). A document with no counted tokens has a row total of 0,
# so its whole row becomes NaN (0 / 0).
row_totals = pos_bow_df.sum(axis=1)
adj_tf_df = pos_bow_df.div(row_totals, axis=0)

display(adj_tf_df.head())

  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,great,high,critical,familiar,internal,responsive,decent,dirty,good,introductory,...,heinous,jelly,bt,uninstall,cardinal,overcooked,risky,lecturersno,pseudo,invasive
0,0.75,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.25,0.25,0.25,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.166626,0.166626,0.166626,0.166626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.142822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.090881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090881,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pandas as pd

# Attach each review's class label to the adjective TF table.
# NOTE: if the vocabulary itself contained a token spelled 'label', this
# assignment would overwrite that word's column.
adj_tf_df['label'] = df['label']

# Mean TF of every word within each label. NaN rows (documents with no
# adjectives) are skipped by mean().
mean_tf_by_label = adj_tf_df.groupby('label').mean()

# For each label, keep the 20 words with the highest mean TF. The grouping key
# becomes the index of mean_tf_by_label, so each row holds word columns only.
top_words_by_label = {
    label_value: mean_tf_by_label.loc[label_value].sort_values(ascending=False).head(20)
    for label_value in mean_tf_by_label.index
}

# Show words by label. The heading names the POS actually analysed in this
# section, matching the ADV and VERB sections.
print("Top 20 words ADJ for each label based on mean term frequency:")
for label_value, top_words in top_words_by_label.items():
    print(f"\nLabel: {label_value}")
    display(top_words)

Top 20 words (verbs, adjectives, adverbs) for each label based on mean term frequency:

Label: NEG


Unnamed: 0,NEG
good,0.041655
bad,0.022554
well,0.017781
difficult,0.016982
final,0.016557
great,0.016241
basic,0.016159
little,0.016029
hard,0.013916
useful,0.012275



Label: POS


Unnamed: 0,POS
good,0.069848
great,0.057939
easy,0.02463
new,0.022213
interesting,0.019366
excellent,0.019109
helpful,0.016878
useful,0.015135
amazing,0.014455
well,0.01435


## ADV

In [None]:
import gensim

# Start from an empty gensim Dictionary; it will hold the adverb vocabulary.
adv_dictionary = gensim.corpora.Dictionary()

# Convert each adverb list to bag-of-words form. allow_update=True lets
# doc2bow add previously unseen tokens to the dictionary as it goes.
adv_bow_corpus = []
for adv_tokens in adv_docs:
    adv_bow_corpus.append(adv_dictionary.doc2bow(adv_tokens, allow_update=True))

print(f'number of texts: {len(adv_bow_corpus)}')
print(f'number of words: {len(adv_dictionary)}')

# Peek at the first two documents as [token, count] pairs.
for bow in adv_bow_corpus[:2]:
    print([[adv_dictionary[token_id], freq] for token_id, freq in bow])

number of texts: 19461
number of words: 1018
[]
[]


In [None]:
import numpy as np
import pandas as pd

# Vocabulary tokens in dictionary order; one DataFrame column per token.
pos_vocab = pd.Series(adv_dictionary.token2id).index

print(f'how many words: {len(pos_vocab)}')

# One row per document.
num_docs_pos = len(adv_bow_corpus)
index_pos = range(num_docs_pos)
print(f'how many documents: {num_docs_pos}')

# Map each gensim token id to the position of its column in pos_vocab.
column_of_id = {token_id: col for col, token_id in enumerate(adv_dictionary.token2id.values())}

# Fill a dense count matrix directly in NumPy: this avoids one slow `.loc`
# write per (document, token) pair. float32 is used instead of float16, which
# triggered overflow warnings when pandas rendered the frame and lost precision
# in the normalised frequencies.
bow_counts = np.zeros((num_docs_pos, len(pos_vocab)), dtype=np.float32)
for row, bow in enumerate(adv_bow_corpus):
    for token_id, freq in bow:
        bow_counts[row, column_of_id[token_id]] = freq

# Wrap the matrix in a DataFrame with documents as rows and tokens as columns.
pos_bow_df = pd.DataFrame(data=bow_counts, index=index_pos, columns=pos_vocab)

# Display the DataFrame head
display(pos_bow_df.head())

how many words: 1018
how many documents: 19461


  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,ideally,overall,pretty,soon,usually,instead,surely,currently,forward,honestly,...,minimally,monotonously,unforutanately,free,centrally,soooo,patronizingly,left,entertainingly,fore
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Term frequency: divide every count by the total number of counted tokens in
# its document (row). A document with no counted tokens has a row total of 0,
# so its whole row becomes NaN (0 / 0).
row_totals = pos_bow_df.sum(axis=1)
adv_tf_df = pos_bow_df.div(row_totals, axis=0)

display(adv_tf_df.head())

  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,ideally,overall,pretty,soon,usually,instead,surely,currently,forward,honestly,...,minimally,monotonously,unforutanately,free,centrally,soooo,patronizingly,left,entertainingly,fore
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,0.199951,0.199951,0.199951,0.199951,0.199951,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,,,,,,,,,...,,,,,,,,,,
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pandas as pd

# Attach each review's class label to the adverb TF table.
adv_tf_df['label'] = df['label']

# Mean TF of every word within each label. NaN rows (documents with no
# adverbs) are skipped by mean().
mean_tf_by_label = adv_tf_df.groupby('label').mean()

# For each label, keep the 20 words with the highest mean TF. The grouping key
# becomes the index of mean_tf_by_label, so each row holds word columns only.
top_words_by_label = {
    label_value: mean_tf_by_label.loc[label_value].sort_values(ascending=False).head(20)
    for label_value in mean_tf_by_label.index
}

# Show the ranked words, one table per label.
print("Top 20 words ADV for each label based on mean term frequency:")
for label_value, top_words in top_words_by_label.items():
    print(f"\nLabel: {label_value}")
    display(top_words)

Top 20 words ADV for each label based on mean term frequency:

Label: NEG


Unnamed: 0,NEG
instead,0.044345
actually,0.040278
extremely,0.031449
far,0.027802
poorly,0.024089
especially,0.020802
completely,0.020196
well,0.019795
unfortunately,0.01838
maybe,0.017189



Label: POS


Unnamed: 0,POS
highly,0.056473
definitely,0.0452
especially,0.039684
overall,0.037755
forward,0.033795
well,0.030223
actually,0.028544
far,0.020553
extremely,0.020416
clearly,0.018044


## Verb

In [None]:
import gensim

# Start from an empty gensim Dictionary; it will hold the verb vocabulary.
verb_dictionary = gensim.corpora.Dictionary()

# Convert each verb list to bag-of-words form. allow_update=True lets
# doc2bow add previously unseen tokens to the dictionary as it goes.
verb_bow_corpus = []
for verb_tokens in verb_docs:
    verb_bow_corpus.append(verb_dictionary.doc2bow(verb_tokens, allow_update=True))

print(f'number of texts: {len(verb_bow_corpus)}')
print(f'number of words: {len(verb_dictionary)}')

# Peek at the first two documents as [token, count] pairs.
for bow in verb_bow_corpus[:2]:
    print([[verb_dictionary[token_id], freq] for token_id, freq in bow])

number of texts: 19461
number of words: 3274
[['cover', 1], ['give', 1], ['help', 1], ['learn', 1], ['research', 1], ['understand', 2]]
[['give', 1], ['help', 1], ['adjust', 1], ['focus', 1], ['lay', 1], ['navigate', 1], ['need', 1], ['provide', 1], ['work', 1]]


In [None]:
import numpy as np
import pandas as pd

# Vocabulary tokens in dictionary order; one DataFrame column per token.
pos_vocab = pd.Series(verb_dictionary.token2id).index

print(f'how many words: {len(pos_vocab)}')

# One row per document.
num_docs_pos = len(verb_bow_corpus)
index_pos = range(num_docs_pos)
print(f'how many documents: {num_docs_pos}')

# Map each gensim token id to the position of its column in pos_vocab.
column_of_id = {token_id: col for col, token_id in enumerate(verb_dictionary.token2id.values())}

# Fill a dense count matrix directly in NumPy: this avoids one slow `.loc`
# write per (document, token) pair. float32 is used instead of float16, which
# triggered overflow warnings when pandas rendered the frame and lost precision
# in the normalised frequencies.
bow_counts = np.zeros((num_docs_pos, len(pos_vocab)), dtype=np.float32)
for row, bow in enumerate(verb_bow_corpus):
    for token_id, freq in bow:
        bow_counts[row, column_of_id[token_id]] = freq

# Wrap the matrix in a DataFrame with documents as rows and tokens as columns.
pos_bow_df = pd.DataFrame(data=bow_counts, index=index_pos, columns=pos_vocab)

# Display the DataFrame head
display(pos_bow_df.head())

how many words: 3274
how many documents: 19461


  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,cover,give,help,learn,research,understand,adjust,focus,lay,navigate,...,flex,snip,submitwarne,recommender,encode,degrade,victimize,rear,oopsy,showboat
0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Term frequency: divide every count by the total number of counted tokens in
# its document (row). A document with no counted tokens has a row total of 0,
# so its whole row becomes NaN (0 / 0).
row_totals = pos_bow_df.sum(axis=1)
verb_tf_df = pos_bow_df.div(row_totals, axis=0)

display(verb_tf_df.head())

  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,cover,give,help,learn,research,understand,adjust,focus,lay,navigate,...,flex,snip,submitwarne,recommender,encode,degrade,victimize,rear,oopsy,showboat
0,0.142822,0.142822,0.142822,0.142822,0.142822,0.285645,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.111084,0.111084,0.0,0.0,0.0,0.111084,0.111084,0.111084,0.111084,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.090881,0.0,0.0,0.090881,0.0,0.0,0.0,0.0,0.090881,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.099976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pandas as pd

# Attach each review's class label to the verb TF table.
verb_tf_df['label'] = df['label']

# Mean TF of every word within each label. NaN rows (documents with no
# verbs) are skipped by mean().
mean_tf_by_label = verb_tf_df.groupby('label').mean()

# For each label, keep the 20 words with the highest mean TF. The grouping key
# becomes the index of mean_tf_by_label, so each row holds word columns only.
top_words_by_label = {
    label_value: mean_tf_by_label.loc[label_value].sort_values(ascending=False).head(20)
    for label_value in mean_tf_by_label.index
}

# Show the ranked words, one table per label.
print("Top 20 words VERB for each label based on mean term frequency:")
for label_value, top_words in top_words_by_label.items():
    print(f"\nLabel: {label_value}")
    display(top_words)

Top 20 words VERB for each label based on mean term frequency:

Label: NEG


Unnamed: 0,NEG
learn,0.038652
need,0.019171
find,0.018598
take,0.018184
teach,0.017765
understand,0.017583
feel,0.017423
explain,0.016808
think,0.015497
complete,0.01533



Label: POS


Unnamed: 0,POS
learn,0.070004
understand,0.028945
thank,0.027318
help,0.024015
recommend,0.023948
give,0.020428
take,0.019485
think,0.019127
explain,0.016336
provide,0.016252
