IMPORTING DEPENDENCIES

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

DATA COLLECTION AND PRE-PROCESSING

In [None]:
# read the SMS csv file from disk into a pandas DataFrame
csv_path = '/content/train.csv'
raw_data = pd.read_csv(csv_path)

In [None]:
# print the whole frame; pandas abbreviates long frames to the first and last rows
print(raw_data)

                                                    sms  label
0     Go until jurong point, crazy.. Available only ...      0
1                       Ok lar... Joking wif u oni...\n      0
2     Free entry in 2 a wkly comp to win FA Cup fina...      1
3     U dun say so early hor... U c already then say...      0
4     Nah I don't think he goes to usf, he lives aro...      0
...                                                 ...    ...
5569  This is the 2nd time we have tried 2 contact u...      1
5570             Will ü b going to esplanade fr home?\n      0
5571  Pity, * was in mood for that. So...any other s...      0
5572  The guy did some bitching but I acted like i'd...      0
5573                       Rofl. Its true to its name\n      0

[5574 rows x 2 columns]


In [None]:
# count the missing values in each column
raw_data.isnull().sum()

sms      0
label    0
dtype: int64

In [None]:
# keep every non-null cell as it is and put an empty string wherever a value is missing
data = raw_data.where(raw_data.notna(), other='')

In [None]:
# preview the first 5 rows of the cleaned dataframe
data.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# (number of rows, number of columns) of the cleaned dataframe
data.shape

(5574, 2)

In [None]:
# split the frame into the message texts (X) and their 0/1 labels (Y)
X, Y = data['sms'], data['label']

In [None]:
# show the message texts
print(X)

0       Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...\n
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5569    This is the 2nd time we have tried 2 contact u...
5570               Will ü b going to esplanade fr home?\n
5571    Pity, * was in mood for that. So...any other s...
5572    The guy did some bitching but I acted like i'd...
5573                         Rofl. Its true to its name\n
Name: sms, Length: 5574, dtype: object


In [None]:
# show the labels (0 = non-spam, 1 = spam)
print(Y)

0       0
1       0
2       1
3       0
4       0
       ..
5569    1
5570    0
5571    0
5572    0
5573    0
Name: label, Length: 5574, dtype: int64


SPLITTING DATA INTO TRAIN & TEST



In [None]:
# hold out 20% of the messages for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# sizes of the full set, the training split and the test split, in that order
for split in (X, X_train, X_test):
    print(split.shape)

(5574,)
(4459,)
(1115,)


FEATURE EXTRACTION

In [None]:
# Turn the raw message text into TF-IDF feature vectors that the models below can consume.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# fit the vectorizer on the training split only, then reuse it unchanged for the test split
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# make sure both label vectors are integer typed
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# show the training messages (the index reflects the shuffled split)
print(X_train)

1379                          Ya tel, wats ur problem..\n
1742    I can do that! I want to please you both insid...
2212    Just gettin a bit arty with my collages at the...
4368    I like dis sweater fr mango but no more my siz...
3352                       At what time are you coming.\n
                              ...                        
3335    That's fine, have him give me a call if he kno...
1099    NO GIFTS!! You trying to get me to throw mysel...
2514    U have won a nokia 6230 plus a free digital ca...
3606                    Jordan got voted out last nite!\n
2575    Your next amazing xxx PICSFREE1 video will be ...
Name: sms, Length: 4459, dtype: object


In [None]:
# show the vectorised training data; it prints as (row, column) index pairs with their weights
print(X_train_features)

  (0, 5253)	0.4280631303519661
  (0, 6968)	0.2818952441915576
  (0, 7173)	0.5619001229363815
  (0, 6557)	0.4987798138495613
  (0, 7404)	0.41567292054659266
  (1, 1260)	0.5581909204207409
  (1, 4846)	0.5155372642290921
  (1, 3573)	0.5581909204207409
  (1, 7144)	0.3332673430008943
  (2, 2515)	0.2280544514081507
  (2, 5872)	0.23218052847213733
  (2, 4095)	0.2555708218847741
  (2, 4339)	0.2067237868122641
  (2, 5621)	0.30290626205024374
  (2, 3095)	0.16057089883447834
  (2, 7176)	0.18878001066200234
  (2, 4567)	0.30290626205024374
  (2, 6836)	0.3290967128508977
  (2, 4395)	0.2682843693190939
  (2, 1830)	0.3290967128508977
  (2, 1054)	0.3290967128508977
  (2, 1328)	0.21884923158074512
  (2, 3020)	0.27671581124958966
  (2, 3741)	0.14053404852263135
  (3, 3622)	0.4049447096763992
  :	:
  (4456, 664)	0.2873593236179065
  (4456, 1110)	0.23938839224470346
  (4456, 5081)	0.22171192927362404
  (4456, 7267)	0.18864768603549034
  (4456, 1584)	0.20768133128462812
  (4456, 2245)	0.2534189902336994
  (

TRAINING MODEL

LOGISTIC REGRESSION

In [None]:
# logistic regression with scikit-learn's default settings
# NOTE: the name `model` is rebound by every later model section of this notebook
model = LogisticRegression()

In [None]:
# train the logistic regression model on the TF-IDF features of the training split
model.fit(X_train_features, Y_train)

MODEL EVALUATION

In [None]:
# mean accuracy of the current model on each split, scaled to a percentage
train_accuracy = model.score(X_train_features, Y_train) * 100
test_accuracy = model.score(X_test_features, Y_test) * 100
print('Accuracy on training data:', train_accuracy)
print('Accuracy on test data:', test_accuracy)

Accuracy on training data: 96.92756223368468
Accuracy on test data: 95.33632286995515


In [None]:
from sklearn.metrics import classification_report

# per-class precision / recall / F1 of the current model on the held-out test split
y_pred = model.predict(X_test_features)
report = classification_report(Y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       951
           1       0.98      0.70      0.81       164

    accuracy                           0.95      1115
   macro avg       0.97      0.85      0.89      1115
weighted avg       0.95      0.95      0.95      1115



In [None]:
from sklearn.metrics import confusion_matrix

# confusion matrix of true vs. predicted labels on the test split
cm = confusion_matrix(Y_test, y_pred)
print(cm)

[[949   2]
 [ 50 114]]


RANDOM FOREST CLASSIFIER ALGORITHM

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is a randomised estimator, so an unseeded fit can give
# different scores on every run. Pin the seed (same value as the train/test
# split) so the accuracies reported below are reproducible.
model = RandomForestClassifier(random_state=2)

In [None]:
# train the random forest on the TF-IDF features of the training split
model.fit(X_train_features, Y_train)

In [None]:
# mean accuracy of the current model on each split, scaled to a percentage
train_accuracy = model.score(X_train_features, Y_train) * 100
test_accuracy = model.score(X_test_features, Y_test) * 100
print('Accuracy on training data:', train_accuracy)
print('Accuracy on test data:', test_accuracy)

Accuracy on training data: 100.0
Accuracy on test data: 97.48878923766816


DECISION TREE CLASSIFIER ALGORITHM

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tree construction involves randomness (candidate features are permuted at each
# split), so an unseeded tree can differ between runs. Pin the seed (same value as
# the train/test split) to make the scores below reproducible.
model = DecisionTreeClassifier(random_state=2)
model.fit(X_train_features, Y_train)

In [None]:
# mean accuracy of the current model on each split, scaled to a percentage
train_accuracy = model.score(X_train_features, Y_train) * 100
test_accuracy = model.score(X_test_features, Y_test) * 100
print('Accuracy on training data:', train_accuracy)
print('Accuracy on test data:', test_accuracy)

Accuracy on training data: 100.0
Accuracy on test data: 97.04035874439462


K NEAREST NEIGHBORS CLASSIFIER

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with scikit-learn's default settings
model = KNeighborsClassifier()
# train it on the TF-IDF features of the training split
model.fit(X_train_features, Y_train)

In [None]:
# mean accuracy of the current model on each split, scaled to a percentage
train_accuracy = model.score(X_train_features, Y_train) * 100
test_accuracy = model.score(X_test_features, Y_test) * 100
print('Accuracy on training data:', train_accuracy)
print('Accuracy on test data:', test_accuracy)

Accuracy on training data: 92.17313298945952
Accuracy on test data: 89.14798206278027


LINEAR REGRESSION MODEL

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares regression fitted directly on the 0/1 labels. This is a
# regressor, not a classifier: its predictions are continuous scores.
# NOTE(review): this rebinds `model`, so every later cell that uses `model`
# (including the predictive-system cells at the end of the notebook) runs
# against this regressor when the notebook is executed top to bottom.
model = LinearRegression()
model.fit(X_train_features, Y_train)

In [None]:
# A regressor's score() returns the R^2 coefficient of determination, not a
# classification accuracy, so it cannot be compared with the classifiers' scores.
# To report a real accuracy, threshold the continuous predictions at 0.5 to get
# 0/1 labels and compare those with the true labels.
train_labels_pred = (model.predict(X_train_features) >= 0.5).astype(int)
test_labels_pred = (model.predict(X_test_features) >= 0.5).astype(int)
print('Accuracy on training data:', accuracy_score(Y_train, train_labels_pred)*100)
print('Accuracy on testing data:', accuracy_score(Y_test, test_labels_pred) * 100)

Accuracy on training data: 99.98135967533113
Accuracy on testing data: 76.83034154160153


BUILDING A PREDICTIVE SYSTEM

0-> Non-Spam SMS

1-> Spam SMS

In [None]:
input_sms = ['Hi Sarah, this is Mike. Just wanted to remind you about the team meeting tomorrow at 3 PM. Your insights will be valuable. Looking forward to seeing you there!']

# convert the text to feature vectors with the vectorizer fitted on the training split
input_data_features = feature_extraction.transform(input_sms)

# making prediction
prediction = model.predict(input_data_features)
print(prediction)

# `model` is whichever estimator was fitted last. On a top-to-bottom run that is
# the LinearRegression model, whose output is a continuous score rather than an
# exact 0/1 label, so an `== 1` comparison would report almost every message as
# non-spam. Thresholding at 0.5 gives the same answer for the classifiers (which
# output 0 or 1) and a meaningful one for the regressor.
if prediction[0] >= 0.5:
  print('Spam SMS')
else:
  print('Non-Spam SMS')

[0]
Non-Spam SMS


In [None]:
input_sms = ['URGENT: Congratulations! You have won a $1000 gift card. Claim it now by clicking on the link: [fake-link.com]. Dont miss out on this amazing offer!']

# convert the text to feature vectors with the vectorizer fitted on the training split
input_data_features = feature_extraction.transform(input_sms)

# making prediction
prediction = model.predict(input_data_features)
print(prediction)

# `model` is whichever estimator was fitted last. On a top-to-bottom run that is
# the LinearRegression model, whose output is a continuous score rather than an
# exact 0/1 label, so an `== 1` comparison would report almost every message as
# non-spam. Thresholding at 0.5 gives the same answer for the classifiers (which
# output 0 or 1) and a meaningful one for the regressor.
if prediction[0] >= 0.5:
  print('Spam SMS')
else:
  print('Non-Spam SMS')

[1]
Spam SMS
