In [2]:
!pip install --quiet pandas


[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
!pip install --quiet scikit-learn


[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Bag of Words | Count Vectorizer

In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [89]:
df = pd.read_csv('assets/spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [90]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [91]:
df['spam'] = df['Category'].apply(lambda x:1 if x=='spam' else 0)

In [92]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [93]:
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [94]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [16]:
type(X_train), type(X_train.values)

(pandas.core.series.Series, numpy.ndarray)

In [17]:
count = CountVectorizer()

X_train_cv = count.fit_transform(X_train.values)

In [18]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
X_train_cv.shape

(4457, 7664)

In [23]:
count.get_feature_names_out()[1000:1050]

array(['apeshit', 'aphex', 'apo', 'apologise', 'apologize', 'apology',
       'app', 'apparently', 'appear', 'applausestore', 'applebees',
       'apples', 'application', 'apply', 'applyed', 'applying',
       'appointment', 'appointments', 'appreciate', 'appreciated',
       'approaching', 'appropriate', 'approve', 'approx', 'apps', 'appt',
       'appy', 'april', 'aproach', 'apt', 'aptitude', 'aquarius', 'ar',
       'arab', 'arabian', 'arcade', 'ard', 'are', 'area', 'aren', 'arent',
       'arestaurant', 'aretaking', 'areyouunique', 'argh', 'argue',
       'arguing', 'argument', 'arguments', 'arise'], dtype=object)

In [24]:
count.get_feature_names_out().shape

(7664,)

In [25]:
dir(count)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_

In [26]:
count.vocabulary_

{'fwiw': 3020,
 'the': 6752,
 'reason': 5569,
 'only': 4907,
 'around': 1059,
 'when': 7406,
 'it': 3719,
 'time': 6835,
 'to': 6869,
 'smoke': 6198,
 'is': 3709,
 'that': 6748,
 'because': 1262,
 'of': 4855,
 'gas': 3048,
 'can': 1613,
 'afford': 837,
 'be': 1252,
 'someone': 6239,
 'tells': 6693,
 'me': 4360,
 'and': 952,
 'apparently': 1007,
 'happens': 3294,
 'somebody': 6237,
 'wants': 7303,
 'light': 4067,
 'up': 7111,
 'orange': 4942,
 'brings': 1490,
 'you': 7625,
 'ringtones': 5741,
 'from': 2984,
 'all': 902,
 'chart': 1722,
 'heroes': 3375,
 'with': 7469,
 'free': 2950,
 'hit': 3406,
 'each': 2474,
 'week': 7359,
 'go': 3124,
 'pics': 5167,
 'on': 4895,
 'wap': 7304,
 'stop': 6437,
 'receiving': 5582,
 'these': 6771,
 'tips': 6842,
 'reply': 5664,
 'no': 4755,
 'da': 2126,
 'today': 6875,
 'also': 918,
 'forgot': 2917,
 'will': 7442,
 'in': 3604,
 'place': 5192,
 'man': 4291,
 'oh': 4871,
 'send': 5954,
 'address': 804,
 'new': 4717,
 'number': 4817,
 'just': 3833,
 'come': 

In [28]:
np.where(X_train_cv.toarray()[0] != 0)

(array([ 837,  952, 1007, 1059, 1252, 1262, 1613, 3020, 3048, 3294, 3709,
        3719, 4067, 4360, 4855, 4907, 5569, 6198, 6237, 6239, 6693, 6748,
        6752, 6835, 6869, 7111, 7303, 7406], dtype=int64),)

In [43]:
X_train[:1][2504]

"Fwiw the reason I'm only around when it's time to smoke is that because of gas I can only afford to be around when someone tells me to be and that apparently only happens when somebody wants to light up"

In [45]:
X_train_cv.toarray()[0][837]

1

In [48]:
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [49]:
X_test_cv = count.transform(X_test)

In [52]:
y_pred = model.predict(X_test_cv)

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       942
           1       0.95      0.91      0.93       173

    accuracy                           0.98      1115
   macro avg       0.96      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



**Note**  
For imbalanced dataset, consider f1-score.

In [86]:
emails = [
    "Hello Hinata, do you wanna go out with me? We could watch some movie or shit and maybe we could hangout at me place.",
    "Upto 20% discount on Graphics Cards! Hurry up, the sale is valid till this weekend!"
]

In [87]:
emails_count = count.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

**Easier way to do it:**

In [88]:
from sklearn.pipeline import Pipeline

classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [95]:
classifier.fit(X_train, y_train)

In [100]:
y_pred = classifier.predict(X_test)

In [101]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       973
           1       0.99      0.93      0.96       142

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [102]:
classifier.predict(emails)

array([0, 1], dtype=int64)

## Exercise

In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [103]:
df = pd.read_csv('assets/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [104]:
df.shape

(50000, 2)

In [105]:
df['Category'] = df['sentiment'].apply(lambda row: 1 if row=='positive' else 0)
df.head()

Unnamed: 0,review,sentiment,Category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [106]:
df['Category'].value_counts()

Category
1    25000
0    25000
Name: count, dtype: int64

In [107]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['Category'], test_size=0.2)
X_train.shape, X_test.shape

((40000,), (10000,))

**Random Forest**

In [111]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),                                                    
    ('random_forest', (RandomForestClassifier(n_estimators=50, criterion='entropy')))      
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      5023
           1       0.84      0.84      0.84      4977

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



**KNN**

In [112]:
clf = Pipeline([
     ('vectorizer', CountVectorizer()),   
      ('KNN', (KNeighborsClassifier(n_neighbors=10, metric = 'euclidean')))
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.66      0.65      5023
           1       0.65      0.63      0.64      4977

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000



**Naive Bayes**

In [113]:
clf = Pipeline([
                
     ('vectorizer', CountVectorizer()),   
      ('Multi NB', MultinomialNB())
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      5023
           1       0.88      0.83      0.85      4977

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



1) As Machine learning algorithms does not work on Text data directly, we need to convert them into numeric vector and feed that into models while training.
2) In this process, we convert text into a very high dimensional numeric vector using the technique of Bag of words.
3) Model like K-Nearest Neighbours(KNN) doesn't work well with high dimensional data because with large number of dimensions, it becomes difficult for the algorithm to calculate distance in each dimension. In higher dimensional space, the cost to calculate distance becomes expensive and hence impacts the performance of model.
4) The easy calculation of probabilities for the words in corpus(Bag of words) and storing them in contigency table is the major reason for the Multinomial NaiveBayes to be a text classification friendly algorithm.
5) As Random Forest uses Bootstrapping(Row and column Sampling) with many decision tree and overcomes the high variance and overfitting of high dimensional data and also uses feature importance of words for better classifing the categories.
6) Machine Learning is like trial and error scientific method, where we keep trying all the possible algorithms we have and select the one which give good results and satisfy the requirements like latency, interpretability etc.