## TF-IDF

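Term Frequency (TF) = [number of times the word appears in a document / total number of words in that document]
Inverse Document Frequency (IDF) = [log(total number of documents / number of documents that contain the word)]
TF-IDF = Term Frequency (TF) * Inverse Document Frequency (IDF)

Note: scikit-learn's `TfidfVectorizer` uses a smoothed variant by default, IDF = ln((1 + total number of documents) / (1 + number of documents that contain the word)) + 1, and then L2-normalizes each document vector, so the values printed below differ from the textbook formula above. For example, with 7 documents a word that appears in only one of them gets IDF = ln(8 / 2) + 1 ≈ 2.386.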

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus for the TF-IDF demo: one pizza sentence, five product
# announcements that differ only in company and product, and one more
# sentence about eating.
corpus = (
    ["Thor eating pizza, Loki is eating pizza, Ironman ate pizza already"]
    + [
        "Apple is announcing new iphone tomorrow",
        "Tesla is announcing new model-3 tomorrow",
        "Google is announcing new pixel-6 tomorrow",
        "Microsoft is announcing new surface tomorrow",
        "Amazon is announcing new eco-dot tomorrow",
    ]
    + ["I am eating biryani and you are eating grapes"]
)

In [3]:
# Fit the vectorizer on the toy corpus and build its TF-IDF matrix in one call.
v = TfidfVectorizer()
transformed_output = v.fit_transform(corpus)

# Show the term -> column-index mapping first, then the matrix itself
# (printed as "(row, column) value" entries).
for fitted_artifact in (v.vocabulary_, transformed_output):
    print(fitted_artifact)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}
  (0, 0)	0.2426654728284301
  (0, 7)	0.2426654728284301
  (0, 15)	0.2426654728284301
  (0, 16)	0.11527032701364152
  (0, 17)	0.2426654728284301
  (0, 22)	0.7279964184852903
  (0, 10)	0.40286636477562926
  (0, 25)	0.2426654728284301
  (1, 26)	0.30652086071532464
  (1, 14)	0.5680354003049032
  (1, 20)	0.30652086071532464
  (1, 4)	0.30652086071532464
  (1, 5)	0.5680354003049032
  (1, 16)	0.26982671076064085
  (2, 19)	0.5680354003049032
  (2, 24)	0.5680354003049032
  (2, 26)	0.30652086071532464
  (2, 20)	0.30652086071532464
  (2, 4)	0.30652086071532464
  (2, 16)	0.26982671076064085
  (3, 21)	0.5680354003049032
  (3, 12)	0.5680354003049032

In [4]:
# Print the learned IDF weight of every vocabulary term.
# `vocabulary_` maps a term to its column index, and `idf_` is indexed by that column.
term_to_column = v.vocabulary_
for feature in v.get_feature_names_out():
    ind = term_to_column.get(feature)
    print(f"{feature} : {v.idf_[ind]}")

already : 2.386294361119891
am : 2.386294361119891
amazon : 2.386294361119891
and : 2.386294361119891
announcing : 1.2876820724517808
apple : 2.386294361119891
are : 2.386294361119891
ate : 2.386294361119891
biryani : 2.386294361119891
dot : 2.386294361119891
eating : 1.9808292530117262
eco : 2.386294361119891
google : 2.386294361119891
grapes : 2.386294361119891
iphone : 2.386294361119891
ironman : 2.386294361119891
is : 1.1335313926245225
loki : 2.386294361119891
microsoft : 2.386294361119891
model : 2.386294361119891
new : 1.2876820724517808
pixel : 2.386294361119891
pizza : 2.386294361119891
surface : 2.386294361119891
tesla : 2.386294361119891
thor : 2.386294361119891
tomorrow : 1.2876820724517808
you : 2.386294361119891


In [5]:
# Put the first two raw documents next to their dense TF-IDF rows.
first_two = slice(0, 2)
print(corpus[first_two])
print(transformed_output.toarray()[first_two])

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already', 'Apple is announcing new iphone tomorrow']
[[0.24266547 0.         0.         0.         0.         0.
  0.         0.24266547 0.         0.         0.40286636 0.
  0.         0.         0.         0.24266547 0.11527033 0.24266547
  0.         0.         0.         0.         0.72799642 0.
  0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.30652086 0.5680354
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5680354  0.         0.26982671 0.
  0.         0.         0.30652086 0.         0.         0.
  0.         0.         0.30652086 0.        ]]


In [6]:
import pandas  as pd

In [7]:
# Load Ecommerce_data.csv straight from the tutorial's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/codebasics/nlp-tutorials/main/12_tf_idf/Ecommerce_data.csv'
df = pd.read_csv(DATA_URL)

# Class balance check: number of rows per label.
df['label'].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [8]:
# Encode the string labels as integers so the classifiers can use them as targets.
label_to_number = {
    'Household': 0,
    'Books': 1,
    'Electronics': 2,
    'Clothing & Accessories': 3,
}
df['label_no'] = df['label'].map(label_to_number)

df.head(5)

Unnamed: 0,Text,label,label_no
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in the train and test splits. A fixed
# random_state makes the split itself reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    df['Text'],
    df.label_no,
    test_size=0.2,
    stratify=df.label_no,
    random_state=42,
)

KNN classifier

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF features fed into a k-nearest-neighbours classifier, both with default settings.
knn_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('classifier', KNeighborsClassifier()),
]
pipeknn = Pipeline(knn_steps)
pipeknn.fit(x_train, y_train)

# Score the fitted pipeline on the held-out split: overall accuracy first,
# then the per-class precision / recall / F1 table.
y_pred = pipeknn.predict(x_test)
knn_accuracy = accuracy_score(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)
print(knn_accuracy)
print(knn_report)

0.9652083333333333
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      1200
           1       0.97      0.96      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.98      0.97      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



Random forest classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Same TF-IDF front end, this time followed by a random forest with default settings.
pipe_rf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier()),
])

# Train on the training split, then predict the held-out split.
pipe_rf.fit(x_train, y_train)
y_pred = pipe_rf.predict(x_test)

# Overall accuracy, then the per-class report, one per line.
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

0.9725
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.97      0.98      0.98      1200
           2       0.98      0.96      0.97      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



Gradient-boosted trees

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

# TF-IDF features followed by gradient-boosted trees.
# Note: despite the "xgb" in the variable name, the classifier is scikit-learn's
# GradientBoostingClassifier, not the XGBoost library.
tfidf_step = ('vectorizer_tfidf', TfidfVectorizer())
boosting_step = ('classifier', GradientBoostingClassifier())
pipe_xgb = Pipeline([tfidf_step, boosting_step])

pipe_xgb.fit(x_train, y_train)

# Evaluate on the held-out split.
y_pred = pipe_xgb.predict(x_test)
for metric in (accuracy_score, classification_report):
    print(metric(y_test, y_pred))

0.9397916666666667
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1200
           1       0.92      0.95      0.93      1200
           2       0.96      0.93      0.94      1200
           3       0.95      0.96      0.96      1200

    accuracy                           0.94      4800
   macro avg       0.94      0.94      0.94      4800
weighted avg       0.94      0.94      0.94      4800



Multinomial Naive Bayes

In [30]:
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features followed by a multinomial Naive Bayes classifier.
nb_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
]
clf = Pipeline(steps=nb_steps)
clf.fit(x_train, y_train)

# Evaluate on the held-out split: accuracy, then the per-class report.
y_pred = clf.predict(x_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print(nb_accuracy)
nb_report = classification_report(y_test, y_pred)
print(nb_report)

0.9610416666666667
              precision    recall  f1-score   support

           0       0.93      0.96      0.95      1200
           1       0.98      0.94      0.96      1200
           2       0.97      0.96      0.96      1200
           3       0.97      0.98      0.98      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



Results using pre-processed text
Remove stop words and punctuation, then lemmatize the remaining tokens

In [11]:
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The kept lemmas are collected in a list that is local to each call.
    A list shared between calls would keep growing, so every document's
    output would also contain the tokens of all documents processed before it.

    Args:
        text: Raw document string to clean.

    Returns:
        The lemmas of the kept tokens, joined by single spaces.
    """
    doc = nlp(text)
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(kept_lemmas)

In [None]:
# Run every raw description through `preprocess` and keep the cleaned text
# in its own column next to the original.
preprocessed_column = df['Text'].apply(preprocess)
df['preprocessed_text'] = preprocessed_column

df.head()

Apply gradient boosting to the preprocessed text

In [None]:
# Re-split using the preprocessed text as the feature column.
# The keyword is `test_size`; `train_test_split` has no `test_split` argument,
# so passing one raises a TypeError. Stratifying on the label keeps the class
# proportions the same in both splits, and a fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['preprocessed_text'],
    df['label_no'],
    test_size=0.2,
    stratify=df.label_no,
    random_state=42,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting again, this time trained on the preprocessed text split.
preprocessed_steps = [
    ('vectorized_tfidf', TfidfVectorizer()),
    ('classifier', GradientBoostingClassifier()),
]
pipe = Pipeline(preprocessed_steps)
pipe.fit(x_train, y_train)

# Evaluate on the held-out split: accuracy, then the per-class report.
y_pred = pipe.predict(x_test)
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)