In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [None]:
# Toy corpus of SMS-style messages used to fit the vectorizer and the model
simple_train = [
    'call you tonight',
    'Call me a cab',
    'please call me... PLEASE!',
]

# One class label per training message, in the same order as simple_train
y = [0, 0, 1]

# Single held-out message; it contains a word ("don't") that never appears
# in the training text
simple_test = ["please don't call me"]

In [None]:
# instantiate the vectorizer (default settings)
vect = CountVectorizer()

In [None]:
# learn the 'vocabulary' of the training data (no matrix is built yet;
# fit only records which terms exist)
vect.fit(simple_train)

CountVectorizer()

In [None]:
# examine the fitted vocabulary; these terms become the matrix columns
vect.get_feature_names_out()

array(['cab', 'call', 'me', 'please', 'tonight', 'you'], dtype=object)

In [None]:
# transform training data into a document-term matrix using the fitted vocabulary
simple_train_dtm = vect.transform(simple_train)

In [None]:
# print the sparse representation: one "(row, column)  count" line per non-zero entry
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [None]:
# convert sparse matrix to a dense matrix
# (one row per training message, one column per vocabulary term)
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]])

In [None]:
# show the document-term matrix as a labeled table, one column per vocabulary term
pd.DataFrame(
    data=simple_train_dtm.toarray(),
    columns=vect.get_feature_names_out(),
)

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [None]:
# Fit a 1-nearest-neighbour classifier on the training document-term matrix.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=1).fit(simple_train_dtm, y)
knn

KNeighborsClassifier(n_neighbors=1)

In [None]:
# transform testing data into a document-term matrix (using existing vocabulary)
# NOTE: this must be transform(), not fit_transform(). Refitting on the test
# text would replace the training vocabulary with the test vocabulary, so the
# columns would no longer line up with the features the model was trained on.
# Tokens that are not in the training vocabulary (here "don") are ignored.
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[1, 1, 1, 1]])

In [None]:
# show the test document-term matrix as a labeled table.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and is what the training-data cells already use.
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())



Unnamed: 0,call,don,me,please
0,1,1,1,1


**Summary:**

- `vect.fit(train)` **learns the vocabulary** of the training data
- `vect.transform(train)` uses the **fitted vocabulary** to build a document-term matrix from the training data
- `vect.transform(test)` uses the **fitted vocabulary** to build a document-term matrix from the testing data and **ignores tokens** it hasn't seen before

In [None]:
# load the SMS dataset; the file has no header row, so name the two columns explicitly
sms = pd.read_table(
    'sms.tsv',
    header=None,
    names=['label', 'message'],
)

In [None]:
# examine the shape: (number of messages, number of columns)
sms.shape

(5572, 2)

In [None]:
# examine the first 10 rows (label and raw message text)
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [None]:
# examine the class distribution (absolute number of messages per label)
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [None]:
# class distribution as proportions instead of raw counts
sms.label.value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [None]:
# encode the text labels as integers: ham -> 0, spam -> 1
label_to_num = {'ham': 0, 'spam': 1}
sms['label_num'] = sms['label'].map(label_to_num)

In [None]:
# confirm that the label_num column is present alongside label and message
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# define X and y (from the SMS data) for use with COUNTVECTORIZER:
# X is the 1-D Series of raw message text, y the numeric label for each message
X, y = sms['message'], sms['label_num']
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [None]:
# split X and y into training and testing sets;
# random_state pins the shuffle so the same split is produced on every run
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(4179,)
(1393,)
(4179,)
(1393,)


In [None]:
# instantiate a fresh vectorizer for the SMS data
# (rebinds `vect`, which previously held the toy-example vectorizer)
vect = CountVectorizer()

In [None]:
# learn the training vocabulary and build the training document-term matrix
# in a single step (equivalent to calling fit, then transform, on X_train)
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# examine the document-term matrix (sparse; rows are training messages,
# columns are vocabulary terms)
X_train_dtm

<4179x7450 sparse matrix of type '<class 'numpy.int64'>'
	with 55393 stored elements in Compressed Sparse Row format>

In [None]:
# transform testing data (using fitted vocabulary) into a document-term matrix;
# only transform here, so the columns match those of the training matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7450 sparse matrix of type '<class 'numpy.int64'>'
	with 17416 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# scale each feature without centering: the document-term matrix is sparse, and
# StandardScaler does not support mean-centering sparse input
ss = StandardScaler(with_mean=False)

In [None]:
# learn the per-feature scaling from the training matrix and apply it
X_train_scaled = ss.fit_transform(X_train_dtm)

In [None]:
# inspect the scaled training matrix (still sparse, now floating point)
X_train_scaled

<4179x7450 sparse matrix of type '<class 'numpy.float64'>'
	with 55393 stored elements in Compressed Sparse Row format>

In [None]:
# apply the scaling learned from the training data to the test matrix (no refit)
X_test_scaled = ss.transform(X_test_dtm)

In [None]:
# inspect the scaled test matrix
X_test_scaled

<1393x7450 sparse matrix of type '<class 'numpy.float64'>'
	with 17416 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# neural-network classifier: one hidden layer, trained with mini-batch SGD
nncl = MLPClassifier(
    hidden_layer_sizes=(100,),    # a single hidden layer of 100 units
    activation='logistic',
    solver='sgd',
    batch_size=20,
    learning_rate='adaptive',     # lower the step size when the training loss stalls
    learning_rate_init=0.01,
    tol=0.01,                     # minimum loss improvement that counts as progress
    max_iter=5000,
    random_state=1,               # fixed seed for reproducible training
    verbose=True,                 # print the loss after every iteration
)

In [None]:

# train the network on the scaled training matrix
# (verbose=True makes it print the loss after each iteration)
nncl.fit(X_train_scaled,y_train)

Iteration 1, loss = 0.20610888
Iteration 2, loss = 0.01635206
Iteration 3, loss = 0.00640968
Iteration 4, loss = 0.00412298
Iteration 5, loss = 0.00305457
Iteration 6, loss = 0.00243861
Iteration 7, loss = 0.00203682
Iteration 8, loss = 0.00175528
Iteration 9, loss = 0.00154784
Iteration 10, loss = 0.00138848
Iteration 11, loss = 0.00126190
Iteration 12, loss = 0.00115978
Iteration 13, loss = 0.00107561
Training loss did not improve more than tol=0.010000 for 10 consecutive epochs. Setting learning rate to 0.002000
Iteration 14, loss = 0.00102373
Iteration 15, loss = 0.00101040
Iteration 16, loss = 0.00099749
Iteration 17, loss = 0.00098513
Iteration 18, loss = 0.00097306
Iteration 19, loss = 0.00096136
Iteration 20, loss = 0.00095003
Iteration 21, loss = 0.00093903
Iteration 22, loss = 0.00092835
Iteration 23, loss = 0.00091796
Iteration 24, loss = 0.00090788
Training loss did not improve more than tol=0.010000 for 10 consecutive epochs. Setting learning rate to 0.000400
Iteration 25,

MLPClassifier(activation='logistic', batch_size=20, learning_rate='adaptive',
              learning_rate_init=0.01, max_iter=5000, random_state=1,
              solver='sgd', tol=0.01, verbose=True)

In [None]:
# accuracy on the training data
nncl.score(X_train_scaled,y_train)

1.0

In [None]:
# accuracy on the held-out test data
nncl.score(X_test_scaled,y_test)

0.9777458722182341

# Thank you! — Rudhra