In [199]:
import pandas as pd
import numpy as np

In [200]:
df = pd.read_csv("spam.csv")

In [201]:
df.head(3)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [202]:
## Applying function to convert the Category into 1 and 0
def convert(text):
    if text == "spam":
        return 1
    else:
        return 0

In [203]:
# column. apply(function_) ---> 
df['spam'] = df['Category'].apply(convert)

In [204]:
df.sample(2)

Unnamed: 0,Category,Message,spam
2571,ham,From 5 to 2 only my work timing.,0
370,ham,"Cool, text me when you're ready",0


In [205]:
# df['spam'] = df['Category'].apply(lambda x: 1 if x == "spam" else 0)

In [206]:
# df.sample(9)

## Basic sequence to follow:
* Loading the dataset
* Split the dataset into training and testing sets
* TF-IDF/CountVectorizer/Others Vectorization
* Naive Bayes classifier
* Predictions on the test set
* Evaluation

In [224]:
# train test split:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['spam'], test_size=0.2, random_state=3)

In [225]:
df.shape, X_train.shape

((5572, 3), (4457,))

In [226]:
type(X_train), type(X_train.values)

(pandas.core.series.Series, numpy.ndarray)

# Feature engineering: countvectorizer

In [227]:

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train.values)

In [228]:
type(X_train_cv)

scipy.sparse._csr.csr_matrix

In [229]:
X_train_cv.toarray().shape # 7701 unique words in our vocabulary.

(4457, 7694)

In [230]:
# getting the names of the differnet tokens.
cv.get_feature_names_out()[7690:7701]

array(['zyada', 'èn', 'ú1', '〨ud'], dtype=object)

In [231]:
cv.get_feature_names_out().shape

(7694,)

In [232]:
type(X_train_cv)

scipy.sparse._csr.csr_matrix

In [233]:
# let's convert this to numpy array:
X_train_np = X_train_cv.toarray()

In [234]:
X_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [235]:
y_train.head(4)

3075    0
1787    0
1614    0
4304    0
Name: spam, dtype: int64

# Model 

In [236]:
X_test_cv = cv.transform(X_test.values)
X_test_np = X_test_cv.toarray()

In [237]:
X_test_cv.shape

(1115, 7694)

In [238]:
# we've our data ready, now we can apply classification model.
from sklearn.naive_bayes import MultinomialNB
model  = MultinomialNB()
model.fit(X_train_cv, y_train)

In [239]:
y_pred = model.predict(X_test_cv)

In [241]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       960
           1       0.99      0.94      0.96       155

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



## Testing:

In [254]:
emails = ['Hi, want boom in your career, call 42532249445', 'You were right', 'offer for only']
email_cv = cv.transform(emails)
model.predict(email_cv)

array([0, 0, 1], dtype=int64)

# sklearn pipeline

In [261]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())])
clf.fit(X_train, y_train)


In [262]:
y_Pred = clf.predict(X_test)

In [264]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       960
           1       0.99      0.94      0.96       155

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



# Exercises

# Bag of words:
* In this Exercise, you are going to classify whether a given movie review is positive or neg
* ative.
* you are going to use Bag of words for pre-processing the text and apply different classification algorithms.
* Sklearn CountVectorizer has the inbuilt implementations for Bag of Words.

## About Data: IMDB Dataset
Credits: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?resource=download

This data consists of two columns. - review - sentiment
Reviews are the statements given by users after watching the movie.
sentiment feature tells whether the given review is positive or negative.

In [296]:
df = pd.read_csv("IMDB Dataset.csv")

In [297]:
df.head(4)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative


In [298]:
df.shape

(50000, 2)

In [309]:
df = df[:2000]

In [310]:
df.isnull().mean()
# no missing values in any column

review       0.0
sentiment    0.0
Category     0.0
dtype: float64

In [312]:
df['sentiment'].value_counts()
# balanced target labels:

sentiment
positive    1005
negative     995
Name: count, dtype: int64

In [313]:
len(df['review'][1])

998

In [314]:
#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative
df['Category'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [315]:
df.sample(5)

Unnamed: 0,review,sentiment,Category
1160,I have a 19-month old and got really tired of ...,positive,1
848,"First off, I have no idea how this movie made ...",negative,0
60,What happened? What we have here is basically ...,negative,0
1572,"I don't really consider myself a conservative,...",negative,0
739,"I enjoyed it. There you go, I said it again. I...",positive,1


In [316]:
#Do the 'train-test' splitting with test size of 20%
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['Category'], test_size=0.2)

### Exercise-1
1. using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.eport.

### Note:

* use CountVectorizer for pre-processing the 
* use Random Forest as the classifier with ```estimators as 50``` and criterion as ```entropy.```
* print the classification report.

In [317]:
from sklearn.ensemble import RandomForestClassifier

In [330]:
model1 = Pipeline([
    ('countvectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=50,criterion= 'entropy'))])
model1.fit(X_train, y_train)

In [331]:
y_pred = model1.predict(X_test)

In [332]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.81      0.79       191
           1       0.82      0.79      0.80       209

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400



## Exercise-2

* use CountVectorizer for pre-processing the text.
* use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean'.
* print the classification report.


In [333]:
from  sklearn.neighbors import KNeighborsClassifier

In [334]:
model_knn = Pipeline([
    ('cv', CountVectorizer()), 
    ('knn', KNeighborsClassifier())])
model_knn.fit(X_train, y_train)

In [335]:
y_pred= model_knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.46      0.49       191
           1       0.56      0.65      0.60       209

    accuracy                           0.56       400
   macro avg       0.55      0.55      0.55       400
weighted avg       0.55      0.56      0.55       400



## Exercise-3
* use CountVectorizer for pre-processing the text.
* use Multinomial Naive Bayes as the classifier.
* print the classification report.

In [336]:
from sklearn.naive_bayes import MultinomialNB


In [337]:
model_nb = Pipeline([
    ('cv', CountVectorizer()),
    ('nb', MultinomialNB())])
model_nb.fit(X_train, y_train)

In [338]:
y_pred = model_nb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.84      0.81       191
           1       0.84      0.79      0.81       209

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400



In [339]:
# We can see that the KNN performed very bad on the dataset; with acc, precision, recall lesser than 50%.

### Reason why kNN failed
* curse of dimesionality: with higher no of features model fail
* Count vectorizer creating sparse matrix