In [2]:
import pandas as pd
import numpy as np

# import plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline 

import seaborn as sns
# Configure seaborn in a single call. Every sns.set(...) call re-applies the
# complete theme and resets any argument that is not passed to its default, so
# a separate sns.set(font_scale=1.5) call would discard style="white" again.
sns.set(style="white", color_codes=True, font_scale=1.5)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import  MultinomialNB

In [8]:
# Load the tab-separated SMS file. header=None tells pandas that the first
# line is data, so the two column names are supplied explicitly.
sms = pd.read_csv(
    r'U:\AI-DATASETS\01-MISC\sms.tsv',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [9]:
# show five randomly chosen rows (no random_state, so the rows differ per run)
sms.sample(n=5)

Unnamed: 0,label,message
4536,ham,IM LATE TELLMISS IM ON MY WAY
1314,ham,How abt making some of the pics bigger?
4613,ham,Sorry da. I gone mad so many pending works wha...
474,spam,Want 2 get laid tonight? Want real Dogging loc...
2612,spam,Knock Knock Txt whose there to 80082 to enter ...


In [10]:
# convert label to a numerical variable (ham -> 0, spam -> 1)
label_to_num = {'ham': 0, 'spam': 1}
sms['label_num'] = sms['label'].map(label_to_num)

# Series.map leaves NaN for any value missing from the mapping; stop here
# instead of passing NaN targets on to the model.
assert sms['label_num'].notna().all(), "found labels other than 'ham'/'spam'"

In [11]:
# features: the raw message text; target: the numeric label built above
X, y = sms['message'], sms['label_num']

In [14]:
from sklearn.model_selection import train_test_split

In [16]:
# split X and y into training and testing sets
# (25% held out for testing; random_state fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [17]:
# sizes of the two splits, as (train_shape, test_shape)
tuple(split.shape for split in (X_train, X_test))

((4179,), (1393,))

#### create bag-of-words (BOW) vectors

In [18]:
# create a bag-of-words vectorizer with default settings (nothing is learnt yet)
vect = CountVectorizer()

In [19]:
# Learn the vocabulary (the BOW dictionary of unique tokens) from the training
# messages only. CountVectorizer.fit ignores its y argument, so the labels are
# not passed.
vect.fit(X_train)

In [29]:
# display the learnt vocabulary: one token per column of the document-term matrix
vect.get_feature_names_out()

array(['00', '000', '008704050406', ..., 'zyada', 'èn', '〨ud'],
      dtype=object)

In [30]:
# how many unique tokens the vectorizer learnt from the training messages
vocab_terms = vect.get_feature_names_out()
len(vocab_terms)

7456

In [31]:
# inspect the raw training messages (pandas truncates the display)
X_train

710     4mths half price Orange line rental & latest c...
3740                           Did you stitch his trouser
2711    Hope you enjoyed your new content. text stop t...
3155    Not heard from U4 a while. Call 4 rude chat pr...
3748    Ü neva tell me how i noe... I'm not at home in...
                              ...                        
905     We're all getting worried over here, derek and...
5192    Oh oh... Den muz change plan liao... Go back h...
3980    CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235     Text & meet someone sexy today. U can find a d...
5157                              K k:) sms chat with me.
Name: message, Length: 4179, dtype: object

In [32]:
# convert the training messages into a document-term matrix:
# one row per message, one column per vocabulary token
X_train_dtm = vect.transform(X_train)

In [33]:
# summary of the matrix (shape, dtype, number of stored non-zero entries)
X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [34]:
# Peek at the dense form of the first few rows only. Densifying the whole
# 4179 x 7456 matrix allocates ~31 million int64 cells (~250 MB) just to print
# a truncated array that is almost entirely zeros.
X_train_dtm[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### train an ML model (KNN)

In [35]:
# k-nearest-neighbours classifier that looks at the 3 closest training messages
knn = KNeighborsClassifier(n_neighbors=3)

# fit on the training document-term matrix and its numeric labels
knn.fit(X=X_train_dtm, y=y_train)

#### convert the test data into the BOW representation

In [38]:
# transform only (no re-fit): the test messages are encoded with the vocabulary
# learnt from the training set, so both matrices share the same columns
X_test_dtm = vect.transform(X_test)   

In [39]:
# predicted class (0 = ham, 1 = spam) for every test message
y_pred = knn.predict(X=X_test_dtm)

In [40]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [41]:
# fraction of test messages whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

0.927494615936827

#### prediction on a hand-written message

In [50]:
# one hand-written message to classify, kept in a list because it is passed
# to vect.transform as a collection of documents
text = ["WIN: We have a winner in you. Certain prize!"]

In [51]:
# encode the hand-written message with the already-fitted vectorizer
X_val_dtm = vect.transform(text)  

In [52]:
# dense view of the single encoded message (one row)
X_val_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [53]:
# predicted class for the hand-written message (0 = ham, 1 = spam)
knn.predict(X_val_dtm)

array([0], dtype=int64)

In [54]:
# per-class probability estimates for the same message, one column per class
# NOTE(review): presumably the share of the 3 neighbours voting for each class — confirm
knn.predict_proba(X_val_dtm)

array([[1., 0.]])

In [49]:
# inspect the held-out test messages (pandas shows only the first and last rows)
X_test

1078                         Yep, by the pretty sculpture
4028        Yes, princess. Are you going to make me moan?
958                            Welp apparently he retired
4642                                              Havent.
4674    I forgot 2 ask ü all smth.. There's a card on ...
                              ...                        
3207                                        At home also.
4655                     Hope you are having a great day.
1140    Message:some text missing* Sender:Name Missing...
1793    WIN: We have a winner! Mr. T. Foley won an iPo...
1710    U meet other fren dun wan meet me ah... Muz b ...
Name: message, Length: 1393, dtype: object

#### try out logistic regression

#### try out Naive Bayes (NB)