In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Exercise 1: Compute TF-IDF Scores

# Sample dataset
documents = [
    "Python is a great programming language for data science.",
    "Data science and machine learning are popular fields.",
    "Machine learning relies on mathematics and programming."
]

# Step 1: Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer() # Converts text into numerical values based on TF-IDF.

# Step 2: Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents) #  Learns vocabulary and applies TF-IDF.

# Step 3: Convert the matrix to a DataFrame for better readability
# .toarray() → Converts the sparse matrix to an array.
# get_feature_names_out() → Gets the words used in TF-IDF calculation.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the matrix
print(df)

        and       are      data    fields       for     great        is  \
0  0.000000  0.000000  0.293048  0.000000  0.385323  0.385323  0.385323   
1  0.313316  0.411973  0.313316  0.411973  0.000000  0.000000  0.000000   
2  0.329928  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

   language  learning   machine  mathematics        on   popular  programming  \
0  0.385323  0.000000  0.000000     0.000000  0.000000  0.000000     0.293048   
1  0.000000  0.313316  0.313316     0.000000  0.000000  0.411973     0.000000   
2  0.000000  0.329928  0.329928     0.433816  0.433816  0.000000     0.329928   

     python    relies   science  
0  0.385323  0.000000  0.293048  
1  0.000000  0.000000  0.313316  
2  0.000000  0.433816  0.000000  


Questions: 
- Which term has the highest TF-IDF score in each document? 
- Are there any terms with a score of 0? Why?

In [2]:
# Find the term with the highest TF-IDF score in each document
highest_terms = df.idxmax(axis=1)  # Finds the term with the max value in each row
highest_scores = df.max(axis=1)  # Gets the max score for each document

# Display results
for i, (term, score) in enumerate(zip(highest_terms, highest_scores)):
    print(f"Document {i+1}: '{term}' has the highest TF-IDF score of {score:.4f}")

Document 1: 'for' has the highest TF-IDF score of 0.3853
Document 2: 'are' has the highest TF-IDF score of 0.4120
Document 3: 'mathematics' has the highest TF-IDF score of 0.4338


In [3]:
# Check if any term has a TF-IDF score of 0 in any document
zero_terms = (df == 0).sum(axis=0)

# Display terms with at least one zero occurrence
print("Terms with a score of 0 in at least one document:\n", zero_terms, "Those terms has TF-IDF score 0 in at least in one documents, it's due to it's absent in one documents but present in corpus.")

Terms with a score of 0 in at least one document:
 and            1
are            2
data           1
fields         2
for            2
great          2
is             2
language       2
learning       1
machine        1
mathematics    2
on             2
popular        2
programming    1
python         2
relies         2
science        1
dtype: int64 Those terms has TF-IDF score 0 in at least in one documents, it's due to it's absent in one documents but present in corpus.


In [4]:
# Exercise 2: Customize the TF-IDF Vectorizer 

# 1. Modify the `TfidfVectorizer` to remove common English stop words. 
# 2. Use the parameter `max_features=10` to limit the matrix to the top 10 terms.

# Initialize TF-IDF Vectorizer with stop words removal and feature limit
vectorizer = TfidfVectorizer(stop_words='english', max_features=10)

# Fit and transform
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert to DataFrame
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display updated TF-IDF matrix
print(df)

       data    fields    great  language  learning   machine  mathematics  \
0  0.393511  0.000000  0.51742   0.51742  0.000000  0.000000     0.000000   
1  0.366180  0.481482  0.00000   0.00000  0.366180  0.366180     0.000000   
2  0.000000  0.000000  0.00000   0.00000  0.459854  0.459854     0.604652   

    popular  programming   science  
0  0.000000     0.393511  0.393511  
1  0.481482     0.000000  0.366180  
2  0.000000     0.459854  0.000000  


In [5]:
#  Identify the top 3 terms across the corpus with the highest importance.

# Sum TF-IDF scores for each term across all documents
term_importance = df.sum().sort_values(ascending=False)

# Get the top 3 most important terms
top_3_terms = term_importance.head(3)

# Display the top 3 terms
print(f"Top 3 Most Important Terms in the Corpus:\n{top_3_terms}")

Top 3 Most Important Terms in the Corpus:
programming    0.853365
machine        0.826033
learning       0.826033
dtype: float64


In [8]:
# # Exercise 3: Term Importance Analysis 

# Sum TF-IDF scores across all documents 
term_importance = tfidf_matrix.sum(axis=0).A1  # Convert sparse matrix to array, A1=Flattens the 2D array into 1D, axis=0: Operate along columns (sum downwards across rows).
terms = vectorizer.get_feature_names_out()  # Ensure correct variable name

# Create a DataFrame of terms and their importance 
importance_df = pd.DataFrame({"Term": terms, "Importance": term_importance}) 

# Sort terms by importance in descending order
importance_df = importance_df.sort_values(by="Importance", ascending=False) 

# Display the DataFrame
print(importance_df)

          Term  Importance
8  programming    0.853365
5      machine    0.826033
4     learning    0.826033
0         data    0.759691
9      science    0.759691
6  mathematics    0.604652
2        great    0.517420
3     language    0.517420
1       fields    0.481482
7      popular    0.481482


In [9]:
# Questions: - Which terms are the most important in the corpus? 
#            - How do these terms relate to the overall content?
print(f" The most important terms in the corpus are:\n{importance_df.head(5)}")
print("These terms are appears on multiple documents but are not too common across all documents making them strong indicators of the topic.")

 The most important terms in the corpus are:
          Term  Importance
8  programming    0.853365
5      machine    0.826033
4     learning    0.826033
0         data    0.759691
9      science    0.759691
These terms are appears on multiple documents but are not too common across all documents making them strong indicators of the topic.


In [21]:
# Exercise 4: Apply TF-IDF to Your Own Dataset
new_documents = [
    "Self-driving cars rely on deep learning and sensor fusion for navigation.",
    "Big data analytics helps businesses make data-driven decisions efficiently.",
    "Robotics and automation are transforming industries with AI-powered solutions."
]
# Apply TF-IDF
new_vectorizer = TfidfVectorizer(stop_words='english')
new_tfidf_matrix = new_vectorizer.fit_transform(new_documents)

# Sum TF-IDF scores across all documents 
new_term_importance = new_tfidf_matrix.sum(axis=0).A1
new_terms = new_vectorizer.get_feature_names_out()

# Create a DataFrame of terms and their importance, Sort terms by importance in descending order
new_importance_df = pd.DataFrame({"Term": new_terms, "Importance": new_term_importance}).sort_values(by="Importance", ascending=False).reset_index(drop= True) 

# Display the DataFrame
print(f"Most important terms:\n\n{new_importance_df.head(5)}")

Most important terms:

           Term  Importance
0          data    0.577350
1            ai    0.377964
2    automation    0.377964
3     solutions    0.377964
4  transforming    0.377964
