<a href="https://colab.research.google.com/github/PALBIBEK/Bengali.AI-Handwritten-Grapheme-Classification/blob/main/TF_IDf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Assignment 1: Compute TF-IDF Matrix Manually
Objective: Learn how to compute the TF-IDF matrix from scratch.
Task:
Create a small corpus of at least 5 documents with varying lengths.
Calculate the term frequency (TF) for each term in each document.
Compute the inverse document frequency (IDF) for each term.
Calculate the TF-IDF for each term in each document using the formula:
TF-IDF
(
𝑡
,
𝑑
)
=
TF
(
𝑡
,
𝑑
)
×
IDF
(
𝑡
)
TF-IDF(t,d)=TF(t,d)×IDF(t)
Display the resulting TF-IDF matrix.**

In [None]:
from functools import reduce
import numpy as np
import pandas as pd

def TF(term , document):
  words=document.lower().split()
  # count_of_term=len(reduce(lambda x,i: x + [words[i]] if words[i]==term else x,range(0,len(words)),[]))
  count_of_term=len(list(filter(lambda x: x==term, words)))
  return count_of_term/len(words)

corpus = [
    "this the the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]
all_tokens=set()
for doc in corpus:
  all_tokens.update(doc.split())
all_tokens= sorted(all_tokens,key=lambda x: x.lower())
TF_IDF_matrix=[ [ 0 for _ in corpus] for _ in all_tokens]
for i, word in enumerate(all_tokens):
  for j,doc in enumerate(corpus):
    TF_IDF_matrix[i][j]=TF(word,doc)

frequency=[0 for _ in all_tokens]

for i,word in enumerate(all_tokens):
  for doc in corpus:
    words=doc.lower().split()
    if word in words:
      frequency[i]+=1

# Calculate IDF for each word
for i, word in enumerate(all_tokens):
    frequency[i] = np.log(len(corpus) / (1 + frequency[i]))
    frequency[i]=round(frequency[i],3)
for i,word in enumerate(all_tokens):
  for j,doc in enumerate(corpus):
    TF_IDF_matrix[i][j]=TF_IDF_matrix[i][j]*frequency[i]

TF_IDF_matrix
df=pd.DataFrame(data=TF_IDF_matrix,columns=[f"doc{i+1}"for i in range(len(corpus))])
print(df)

     doc1      doc2      doc3    doc4
0  0.0000  0.000000  0.115500  0.0000
1  0.0000  0.000000  0.000000  0.0000
2  0.0576  0.000000  0.000000  0.0576
3  0.0000  0.000000  0.000000  0.0000
4  0.0000  0.000000  0.115500  0.0000
5  0.0000  0.115500  0.000000  0.0000
6 -0.0892 -0.037167 -0.037167 -0.0446
7  0.0000  0.000000  0.115500  0.0000
8 -0.0446 -0.037167 -0.037167 -0.0446


**Assignment 2: Using Scikit-Learn to Generate TF-IDF Matrix
Objective: Use Scikit-Learn's TfidfVectorizer to create a TF-IDF matrix.
Task:
Install the necessary libraries: scikit-learn.
Load a dataset of text documents, such as a collection of news articles or reviews.
Use TfidfVectorizer to convert the collection of text documents to a matrix of TF-IDF features.
Print the shape of the resulting matrix and display the first few rows.**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Example corpus
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]
Vectorizer=TfidfVectorizer()
X=Vectorizer.fit_transform(corpus)

df=pd.DataFrame(data=X.toarray(),columns=Vectorizer.get_feature_names_out())
print(df)

        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


**Assignment 3: Feature Selection Using TF-IDF
Objective: Use TF-IDF for feature selection in a text classification task.
Task:
Load a labeled text dataset (e.g., spam vs. ham emails).
Split the dataset into training and testing sets.
Use TfidfVectorizer to transform the text data into TF-IDF features.
Train a classification model (e.g., logistic regression, SVM) using the TF-IDF features.
Evaluate the model's performance on the testing set.
Analyze the importance of features (terms) in the model by examining the coefficients or feature importance scores.**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

data=fetch_20newsgroups()
X=data.data
y=data.target

vectorizer=TfidfVectorizer()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.7,random_state=42)


X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

model=LogisticRegression()
model.fit(X_train,y_train)


In [None]:
y_predict=model.predict(X_test)
print("Classification reort\n:",classification_report(y_test,y_predict))

Classification reort
:               precision    recall  f1-score   support

           0       0.91      0.81      0.86       338
           1       0.68      0.79      0.73       401
           2       0.78      0.79      0.78       405
           3       0.75      0.75      0.75       418
           4       0.86      0.76      0.81       410
           5       0.78      0.86      0.81       404
           6       0.58      0.88      0.70       389
           7       0.90      0.83      0.86       431
           8       0.90      0.90      0.90       423
           9       0.82      0.95      0.88       395
          10       0.95      0.94      0.94       403
          11       0.97      0.91      0.94       429
          12       0.88      0.69      0.77       423
          13       0.89      0.89      0.89       413
          14       0.93      0.87      0.90       431
          15       0.73      0.92      0.81       422
          16       0.90      0.87      0.88       398
    

In [None]:
feature_names=vectorizer.get_feature_names_out()
features_coef=list(zip(model.coef_[0],feature_names))

In [None]:
features_coef_sorted=sorted(features_coef,key=lambda x: abs(x[0]))[-10:]
print("Top 10 most important features:")
for f_c in features_coef_sorted:
  print(f"{f_c[1]}: {f_c[0]}")

Top 10 most important features:
that: 1.5803264130112498
sgi: 1.680745898223057
islam: 1.749634526740061
islamic: 1.7581761289974793
livesey: 1.768244265495613
is: 1.8116080485735389
caltech: 1.9970584137394265
atheism: 2.1948455907782005
god: 2.3629577901778505
keith: 3.11031855714001


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import numpy as np

# Example documents
documents = [
    "Machine learning is fun.",
    "Machine learning is fun. Machine learning algorithms are important. Machine learning can solve many problems. Machine learning is everywhere."
]

# Step 1: Compute the TF-IDF matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Step 2: Convert the sparse matrix to a dense matrix (optional, for easier understanding)
tfidf_dense = tfidf_matrix.toarray()

# Step 3: Normalize the TF-IDF matrix
normalized_tfidf = normalize(tfidf_dense, norm='l2')

# Print the results
print("TF-IDF Matrix:")
print(tfidf_dense)
print("\nNormalized TF-IDF Matrix:")
print(normalized_tfidf)


TF-IDF Matrix:
[[0.         0.         0.         0.         0.5        0.
  0.5        0.5        0.5        0.         0.         0.        ]
 [0.1934159  0.1934159  0.1934159  0.1934159  0.13761701 0.1934159
  0.27523402 0.55046803 0.55046803 0.1934159  0.1934159  0.1934159 ]]

Normalized TF-IDF Matrix:
[[0.         0.         0.         0.         0.5        0.
  0.5        0.5        0.5        0.         0.         0.        ]
 [0.1934159  0.1934159  0.1934159  0.1934159  0.13761701 0.1934159
  0.27523402 0.55046803 0.55046803 0.1934159  0.1934159  0.1934159 ]]
