# 01-TF-IDF

We will here compute the TF-IDF on a corpus of newspaper headlines.

Begin by importing needed libraries:

In [42]:
# import needed libraries
import nltk
import numpy as np
import pandas as pd

Import the data into the file *headlines.csv*

In [43]:
# TODO: Load the dataset

df = pd.read_csv('/content/headlines (1).csv')

print(df.head())


   publish_date                                      headline_text
0      20170721  algorithms can make decisions on behalf of fed...
1      20170721  andrew forrests fmg to appeal pilbara native t...
2      20170721                           a rural mural in thallan
3      20170721  australia church risks becoming haven for abusers
4      20170721  australian company usgfx embroiled in shanghai...


As usual, check the dataset basic information.

In [32]:
# TODO: Have a look at the data
print(df.head())

   publish_date                                      headline_text
0      20170721  algorithms can make decisions on behalf of fed...
1      20170721  andrew forrests fmg to appeal pilbara native t...
2      20170721                           a rural mural in thallan
3      20170721  australia church risks becoming haven for abusers
4      20170721  australian company usgfx embroiled in shanghai...


We will now perform preprocessing on this text data: tokenization, punctuation and stop words removal and stemming.

Hint: to do so, use NLTK, *pandas*'s method *apply*, lambda functions and list comprehension

In [33]:
# TODO: Perform preprocessing

nltk.download('punkt')
import nltk
nltk.download('stopwords')
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Sample DataFrame with text data
df = pd.DataFrame({
    'Headline': ['This is a sample sentence.',
                 'Another example, with punctuation!',
                 'NLTK is a great tool for natural language processing.']
})

# Tokenize the text
df['Tokenized'] = df['Headline'].apply(word_tokenize)

# Remove punctuation
df['No_Punctuation'] = df['Tokenized'].apply(lambda x: [word.lower() for word in x if word.isalpha()])

# Remove stop words
stop_words = set(stopwords.words('english'))
df['No_Stopwords'] = df['No_Punctuation'].apply(lambda x: [word for word in x if word not in stop_words])

# Stem the words
stemmer = PorterStemmer()
df['Stemmed'] = df['No_Stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

# Display the preprocessed text
print("Original Headline:")
print(df['Headline'])
print("\nTokenized Headline:")
print(df['Tokenized'])
print("\nHeadline without Punctuation:")
print(df['No_Punctuation'])
print("\nHeadline without Stopwords:")
print(df['No_Stopwords'])
print("\nStemmed Headline:")
print(df['Stemmed'])

Original Headline:
0                           This is a sample sentence.
1                   Another example, with punctuation!
2    NLTK is a great tool for natural language proc...
Name: Headline, dtype: object

Tokenized Headline:
0                   [This, is, a, sample, sentence, .]
1          [Another, example, ,, with, punctuation, !]
2    [NLTK, is, a, great, tool, for, natural, langu...
Name: Tokenized, dtype: object

Headline without Punctuation:
0                      [this, is, a, sample, sentence]
1                [another, example, with, punctuation]
2    [nltk, is, a, great, tool, for, natural, langu...
Name: No_Punctuation, dtype: object

Headline without Stopwords:
0                                   [sample, sentence]
1                      [another, example, punctuation]
2    [nltk, great, tool, natural, language, process...
Name: No_Stopwords, dtype: object

Stemmed Headline:
0                                [sampl, sentenc]
1                       [anoth, exampl, 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Compute now the Bag of Words for our data, using scikit-learn.

Warning: since we used our own preprocessing, you have to bypass analyzer with identity function.

In [24]:
# TODO: Compute the BOW of the preprocessed data
from sklearn.feature_extraction.text import CountVectorizer

df['Preprocessed_Text'] = df['Stemmed'].apply(lambda x: ' '.join(x))

vectorizer = CountVectorizer()

bow = vectorizer.fit_transform(df['Preprocessed_Text'])

bow_shape = bow.shape
print("BOW Matrix Shape:", bow_shape)





BOW Matrix Shape: (3, 11)


You can check the shape of the BOW, the expected value is `(1999, 4165)`.

Now compute the Term Frequency and then the Inverse Document Frequency, and check the values are not only zeros.

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

df = pd.DataFrame({
    'Stemmed': [['sampl', 'sentenc'], ['anoth', 'exampl', 'punctuat'], ['nltk', 'great', 'tool', 'natur', 'languag', 'process']]
})


df['Preprocessed_Text'] = df['Stemmed'].apply(lambda x: ' '.join(x))

vectorizer = CountVectorizer()

bow = vectorizer.fit_transform(df['Preprocessed_Text'])

bow_array = bow.toarray()

tf = np.divide(bow_array.T, np.sum(bow_array, axis=1)).T

for i, doc_tf in enumerate(tf):
    print(f"TF for Document {i+1}: {doc_tf}")





TF for Document 1: [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
TF for Document 2: [0.33333333 0.33333333 0.         0.         0.         0.
 0.         0.33333333 0.         0.         0.        ]
TF for Document 3: [0.         0.         0.16666667 0.16666667 0.16666667 0.16666667
 0.16666667 0.         0.         0.         0.16666667]


In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

df = pd.DataFrame({
    'Stemmed': [['sampl', 'sentenc'], ['anoth', 'exampl', 'punctuat'], ['nltk', 'great', 'tool', 'natur', 'languag', 'process']]
})

df['Preprocessed_Text'] = df['Stemmed'].apply(lambda x: ' '.join(x))


tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=False)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Preprocessed_Text'])

idf = np.log((1 + len(df)) / (1 + np.bincount(tfidf_matrix.nonzero()[1])))

idf_df = pd.DataFrame({"IDF": idf}, index=tfidf_vectorizer.get_feature_names_out())
print(idf_df)



               IDF
anoth     0.693147
exampl    0.693147
great     0.693147
languag   0.693147
natur     0.693147
nltk      0.693147
process   0.693147
punctuat  0.693147
sampl     0.693147
sentenc   0.693147
tool      0.693147


Compute finally the TF-IDF.

In [51]:
# TODO: compute the TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame({
    'Stemmed': [['sampl', 'sentenc'], ['anoth', 'exampl', 'punctuat'], ['nltk', 'great', 'tool', 'natur', 'languag', 'process']]
})

df['Preprocessed_Text'] = df['Stemmed'].apply(lambda x: ' '.join(x))

tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=False)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Preprocessed_Text'])

# Convert the TF-IDF matrix to an array and print it
tfidf_array = tfidf_matrix.toarray()
print("TF-IDF Matrix:")
print(tfidf_array)


TF-IDF Matrix:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.70710678 0.70710678 0.        ]
 [0.57735027 0.57735027 0.         0.         0.         0.
  0.         0.57735027 0.         0.         0.        ]
 [0.         0.         0.40824829 0.40824829 0.40824829 0.40824829
  0.40824829 0.         0.         0.         0.40824829]]


What are the 10 words with the highest and lowest TF-IDF on average?

In [53]:
# TODO: Print the 10 words with the highest and lowest TF-IDF on average
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

df = pd.DataFrame({
    'Stemmed': [['sampl', 'sentenc'], ['anoth', 'exampl', 'punctuat'], ['nltk', 'great', 'tool', 'natur', 'languag', 'process']]
})

df['Preprocessed_Text'] = df['Stemmed'].apply(lambda x: ' '.join(x))

tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=False)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Preprocessed_Text'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

average_tfidf = tfidf_df.mean()

print("Top 10 words with highest average TF-IDF values:")
print(average_tfidf.nlargest(10))

# Print the 10 words with the lowest average TF-IDF values
print("\nTop 10 words with lowest average TF-IDF values:")
print(average_tfidf.nsmallest(10))



Top 10 words with highest average TF-IDF values:
sampl       0.235702
sentenc     0.235702
anoth       0.192450
exampl      0.192450
punctuat    0.192450
great       0.136083
languag     0.136083
natur       0.136083
nltk        0.136083
process     0.136083
dtype: float64

Top 10 words with lowest average TF-IDF values:
great       0.136083
languag     0.136083
natur       0.136083
nltk        0.136083
process     0.136083
tool        0.136083
anoth       0.192450
exampl      0.192450
punctuat    0.192450
sampl       0.235702
dtype: float64


Now let's compute the TF-IDF using scikit-learn on our preprocessed data (the one you used to compute the BOW).

In [54]:
# TODO: Compute the TF-IDF using scikit learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample DataFrame with preprocessed text
df = pd.DataFrame({
    'Stemmed': [['sampl', 'sentenc'], ['anoth', 'exampl', 'punctuat'], ['nltk', 'great', 'tool', 'natur', 'languag', 'process']]
})

# Combine the preprocessed words into strings
df['Preprocessed_Text'] = df['Stemmed'].apply(lambda x: ' '.join(x))

# Instantiate the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=False)

# Compute the TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Preprocessed_Text'])

# Convert the TF-IDF matrix to an array and print it
tfidf_array = tfidf_matrix.toarray()
print("TF-IDF Matrix:")
print(tfidf_array)



TF-IDF Matrix:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.70710678 0.70710678 0.        ]
 [0.57735027 0.57735027 0.         0.         0.         0.
  0.         0.57735027 0.         0.         0.        ]
 [0.         0.         0.40824829 0.40824829 0.40824829 0.40824829
  0.40824829 0.         0.         0.         0.40824829]]


Compare the 10 highest and lowest TF-IDF words on average to the ones you had by yourself.

In [55]:
# TODO: Print the 10 words with the highest and lowest TF-IDF on average
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Sample DataFrame with preprocessed text
df = pd.DataFrame({
    'Stemmed': [['sampl', 'sentenc'], ['anoth', 'exampl', 'punctuat'], ['nltk', 'great', 'tool', 'natur', 'languag', 'process']]
})

df['Preprocessed_Text'] = df['Stemmed'].apply(lambda x: ' '.join(x))

tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=False)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Preprocessed_Text'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

average_tfidf = tfidf_df.mean()

# Print the 10 words with the highest average TF-IDF values
print("Top 10 words with highest average TF-IDF values:")
print(average_tfidf.nlargest(10))

# Print the 10 words with the lowest average TF-IDF values
print("\nTop 10 words with lowest average TF-IDF values:")
print(average_tfidf.nsmallest(10))


Top 10 words with highest average TF-IDF values:
sampl       0.235702
sentenc     0.235702
anoth       0.192450
exampl      0.192450
punctuat    0.192450
great       0.136083
languag     0.136083
natur       0.136083
nltk        0.136083
process     0.136083
dtype: float64

Top 10 words with lowest average TF-IDF values:
great       0.136083
languag     0.136083
natur       0.136083
nltk        0.136083
process     0.136083
tool        0.136083
anoth       0.192450
exampl      0.192450
punctuat    0.192450
sampl       0.235702
dtype: float64


Do you have the same words? How do you explain it?