In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [55]:
tv = TfidfVectorizer()
tf_ouput = tv.fit_transform(corpus)

In [56]:
tv.get_feature_names_out()

array(['already', 'am', 'amazon', 'and', 'announcing', 'apple', 'are',
       'ate', 'biryani', 'dot', 'eating', 'eco', 'google', 'grapes',
       'iphone', 'ironman', 'is', 'loki', 'microsoft', 'model', 'new',
       'pixel', 'pizza', 'surface', 'tesla', 'thor', 'tomorrow', 'you'],
      dtype=object)

In [57]:
# printing the words and their idf-score.
feature_names = tv.get_feature_names_out()
for word in feature_names:
    indx = tv.vocabulary_.get(word)
    print(f"{word} {tv.idf_[indx]}")

already 2.386294361119891
am 2.386294361119891
amazon 2.386294361119891
and 2.386294361119891
announcing 1.2876820724517808
apple 2.386294361119891
are 2.386294361119891
ate 2.386294361119891
biryani 2.386294361119891
dot 2.386294361119891
eating 1.9808292530117262
eco 2.386294361119891
google 2.386294361119891
grapes 2.386294361119891
iphone 2.386294361119891
ironman 2.386294361119891
is 1.1335313926245225
loki 2.386294361119891
microsoft 2.386294361119891
model 2.386294361119891
new 1.2876820724517808
pixel 2.386294361119891
pizza 2.386294361119891
surface 2.386294361119891
tesla 2.386294361119891
thor 2.386294361119891
tomorrow 1.2876820724517808
you 2.386294361119891


* we can see that the idf of words occuring in most of the documents:- 'is' is lower
as compared to the words occuring rarely in a doc like:- 'microsoft', 'tesla'

In [58]:
tf_ouput.toarray()[0]

array([0.24266547, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.24266547, 0.        , 0.        ,
       0.40286636, 0.        , 0.        , 0.        , 0.        ,
       0.24266547, 0.11527033, 0.24266547, 0.        , 0.        ,
       0.        , 0.        , 0.72799642, 0.        , 0.        ,
       0.24266547, 0.        , 0.        ])

# Ecommerce_Dataset

In [59]:
import pandas as pd
df = pd.read_csv("ecommerce_data.csv")

In [60]:
df.sample(4)

Unnamed: 0,Text,label
8810,Jompers Men's Cotton Kurta and Pyjama Set JOMP...,Clothing & Accessories
18170,EK UDAAN Unisex Woolen Thermal Wear Thumb Sock...,Clothing & Accessories
20612,Directors' Diaries: The Road to Their First Fi...,Books
22373,JBL C200SI in-Ear Headphones with Mic (Gun Met...,Electronics


In [61]:
df.shape, df.label.value_counts()

((24000, 2),
 label
 Household                 6000
 Electronics               6000
 Clothing & Accessories    6000
 Books                     6000
 Name: count, dtype: int64)

* we can see that the classes are balanced

### let's map the label to numerical values:

In [62]:
map_values = {"Household":0, "Electronics":1,"Clothing & Accessories":2, "Books":3}
df['map_label'] = df['label'].map(map_values)

In [63]:
df.sample(4)

Unnamed: 0,Text,label,map_label
2572,Kiran’s SSC CGL Combined Graduate Level Exams ...,Books,3
17013,Ahmedabad Cotton 144 TC Cotton Bedsheet with 2...,Household,0
17833,boAt Bass Heads 225 in-Ear Headphones with Mic...,Electronics,1
22150,boAt Nirvanaa Uno in-Ear Earphones with Mic (B...,Electronics,1


In [64]:
# Now let's do train-test-split:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(df['Text'],
                                                   df['map_label'],
                                                   test_size=0.2,
                                                   stratify=df['map_label'],
                                                   random_state=2024)

In [65]:
X_train.shape, X_test.shape

((19200,), (4800,))

In [66]:
y_train.value_counts() # equal division of classes due to stratifying them

map_label
3    4800
2    4800
1    4800
0    4800
Name: count, dtype: int64

# Model implementation: using pipeline

In [67]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [68]:
clf = Pipeline([
    ('tf-idf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10))
])

In [69]:
clf.fit(X_train, y_train)

In [70]:
y_pred= clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1200
           1       0.96      0.96      0.96      1200
           2       0.98      0.98      0.98      1200
           3       0.97      0.95      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [71]:
X_test[:4]

1758     Numex 58mm close up lens filter kit +1 +2 +4 +...
9790     Decals Design 'Tree with Birds and Cages' Wall...
1500     Lola Dola LDLDoll001(Options) Women Baby Doll ...
16413    Long Walk To Freedom Review A tale of anger an...
Name: Text, dtype: object

In [76]:
y_test[:3]

1758    1
9790    0
1500    2
Name: map_label, dtype: int64

In [77]:
y_pred[:3]

array([1, 0, 2], dtype=int64)

In [87]:
# using multnomial:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [84]:
clf = Pipeline([
    ('tf-idf', TfidfVectorizer()),
    ('knn', MultinomialNB())
])

In [85]:
clf.fit(X_train, y_train)


In [86]:
y_pred= clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1200
           1       0.97      0.96      0.97      1200
           2       0.98      0.98      0.98      1200
           3       0.98      0.95      0.97      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [88]:
# now let's see random forest:
clf = Pipeline([
    ('tf-idf', TfidfVectorizer()),
    ('knn', RandomForestClassifier())
])

In [89]:
clf.fit(X_train, y_train)

In [90]:
y_pred= clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1200
           1       0.99      0.96      0.97      1200
           2       0.98      0.99      0.99      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



# 
* We see that the random forest is performing well for this data.

## Now we'll try after doing some  basic text-pre-processing

In [91]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [92]:
def pre_process(text):
    doc = nlp(text)
    clean_text=[]
    for token in doc:
        if not token.is_stop or not token.is_punct:
            clean_text.append(token.lemma_)
    return " ".join(clean_text)

In [93]:
df['processed_text'] = df['Text'].apply(pre_process)

In [94]:
df['Text'][0]

'Urban Ladder Eisner Low Back Study-Office Computer Chair(Black) A study in simple. The Eisner study chair has a firm foam cushion, which makes long hours at your desk comfortable. The flexible meshed back is designed for air-circulation and support when you lean back. The curved arms provide ergonomic forearm support. Adjust the height using the gas lift to find that comfortable position and the nylon castors make it easy to move around your space. Chrome legs refer to the images for dimension details any assembly required will be done by the UL team at the time of delivery indoor use only.'

In [95]:
df['processed_text'][0]

'Urban Ladder Eisner low Back Study - Office Computer Chair(Black ) a study in simple . the Eisner study chair have a firm foam cushion , which make long hour at your desk comfortable . the flexible mesh back be design for air - circulation and support when you lean back . the curved arm provide ergonomic forearm support . adjust the height use the gas lift to find that comfortable position and the nylon castor make it easy to move around your space . chrome leg refer to the image for dimension detail any assembly require will be do by the UL team at the time of delivery indoor use only .'

In [96]:
# again train-test-split:
# Now let's do train-test-split:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(df['processed_text'],
                                                   df['map_label'],
                                                   test_size=0.2,
                                                   stratify=df['map_label'],
                                                   random_state=2024)

In [98]:
clf = Pipeline([
    ('tf-idf', TfidfVectorizer()),
    ('knn', MultinomialNB())
])


In [99]:
clf.fit(X_train, y_train)

In [100]:
y_pred= clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96      1200
           1       0.97      0.97      0.97      1200
           2       0.99      0.98      0.99      1200
           3       0.99      0.95      0.97      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



# Exercise::

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Emotion_classify_Data.csv")

In [4]:
df.sample(5)

Unnamed: 0,Comment,Emotion
3607,i am jealous of andreas growing belly and the ...,anger
4274,i feel agitated i become easily overwhelmed,fear
403,i also love seeing a star emerge and i feel li...,anger
3575,im being particular but id feel uncomfortable ...,fear
4628,i smoothly hand her a twenty feeling smug that...,joy


In [5]:
df.shape

(5937, 2)

In [8]:
df.Emotion.value_counts() # 3 types of label:- anger, joy fear.

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [9]:
#Add the new column "Emotion_num" which gives a unique number to each of these Emotions
#joy --> 0, fear --> 1, anger --> 2
#checking the results by printing top 5 rows

In [12]:
target_map = {"joy": 0, "fear":1 , "anger":2}
df['Emotion_num'] = df['Emotion'].map(target_map)

In [13]:
df.head(3)

Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,2
2,i sit here to write i start to dig out my feel...,fear,1


# Modelling without Pre-processing Text data

In [14]:
#import train-test split
#Do the 'train-test' splitting with test size of 20%
#Note: Give Random state 2022 and also do the stratify sampling

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(df['Comment'], df['Emotion_num'], test_size=0.2, stratify=df['Emotion_num'], random_state=2024)

In [17]:
X_train.shape, X_test.shape

((4749,), (1188,))

## Attempt 1 :

using the sklearn pipeline module create a classification pipeline to classify the Data.

## Note:

* using CountVectorizer with only trigrams.
* use RandomForest as the classifier.
* print the classification report.

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [25]:
clf = Pipeline([
    ('cv', CountVectorizer(ngram_range=(3,3))),  # only tri-grams
    ('rf', RandomForestClassifier() )
])

In [26]:
clf.fit(X_train, y_train)

In [27]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.22      0.32       400
           1       0.42      0.71      0.52       388
           2       0.50      0.47      0.49       400

    accuracy                           0.46      1188
   macro avg       0.50      0.47      0.44      1188
weighted avg       0.50      0.46      0.44      1188



## Worst performance of Random Forest with 3-gram

# Attempt 2:report.

* using CountVectorizer with both unigram and bigrams.
* use Multinomial Naive Bayes as the classifier.
* print the classification report.

In [28]:
clf = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,2))),  # only tri-grams
    ('rf', MultinomialNB() )
])

In [29]:
clf.fit(X_train, y_train)

In [30]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88       400
           1       0.86      0.87      0.87       388
           2       0.86      0.88      0.87       400

    accuracy                           0.87      1188
   macro avg       0.87      0.87      0.87      1188
weighted avg       0.87      0.87      0.87      1188



* Multinomial naive bayes performed better:

# attempt 3
* using CountVectorizer with both unigram and Bigrams.
* use RandomForest as the classifier.
* print the classification report.

In [33]:
clf = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,2))),  # uni-gram + bi- grams
    ('rf', RandomForestClassifier() )
])

In [35]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91       400
           1       0.95      0.87      0.91       388
           2       0.95      0.88      0.91       400

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



## Performace of random forest improved with uni-gram and bi-grams:

# Attempt 4:
* using TF-IDF vectorizer for Pre-processing the text.
* use RandomForest as the classifier.
* print the classification report.

In [36]:
clf = Pipeline([
    ('cv', TfidfVectorizer()),  
    ('rf', RandomForestClassifier())
])

In [37]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.97      0.91       400
           1       0.93      0.89      0.91       388
           2       0.95      0.87      0.91       400

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [38]:
# similar performace: with tf-idf 

## Use text pre-processing to remove stop words, punctuations and apply lemmatization

In [39]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [40]:
def pre_processing(text):
    doc = nlp(text)
    clean_text=list()
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        clean_text.append(token.lemma_)
    return " ".join(clean_text)


In [42]:
# create a new column "preprocessed_comment" and use the utility function above to get the clean data
# this will take some time, please be patient
df['preprocessed_comment'] = df['Comment'].apply(pre_processing)

In [55]:
print(df['preprocessed_comment'][3], '\n')
print(df['Comment'][3])

ve angry r feel like idiot trust place 

ive been really angry with r and i feel like an idiot for trusting him in the first place


#### Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
* Note: Use the preprocessed_Comment

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(df['preprocessed_comment'], df['Emotion_num'], test_size=0.2, stratify=df['Emotion_num'], random_state=2024)

#### Let's check the scores with our best model till now

## Note: attempt 1:
* using CountVectorizer with both unigrams and bigrams.
* use RandomForest as the classifier.
* print the classification report.

In [47]:
clf = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,2))),  
    ('rf', RandomForestClassifier())
]) 

In [48]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       400
           1       0.91      0.90      0.91       388
           2       0.93      0.90      0.92       400

    accuracy                           0.91      1188
   macro avg       0.92      0.91      0.91      1188
weighted avg       0.92      0.91      0.91      1188



In [51]:
# similar result is coming: not so much improvement even after text-pre-processing

# attempt 2:
* using TF-IDF vectorizer for pre-processing the text.
* use RandomForest as the classifier.
* print the classification report.

In [56]:
clf = Pipeline([
    ('cv', TfidfVectorizer()),  
    ('rf', RandomForestClassifier())
]) 

In [57]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.94       400
           1       0.91      0.91      0.91       388
           2       0.94      0.90      0.92       400

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



# observation:

1. with random forest: with 3-gram-- -result was worst.
2. with naive bayes and random forest (uni-and bi-gram) result improved above 90%(acc+precision+f1-score)
3. after pre-processing the text the performance of countvectorizer remained the same with those algorithms.
4. but with tf-idf vectorizer on processed text the result was the best::