<a href="https://colab.research.google.com/github/SakethMattupalli/Spam-vs-Ham/blob/master/spam_classifier_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
data = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/NLP-Tutorial---Spam-Text-Message-Classification-using-NLP/master/spam.tsv', sep= '\t')
data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
#check for any null values
data.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
#check how many spam and ham msgs
data.label.value_counts()
#It seems like unbalanced data,

ham     4825
spam     747
Name: label, dtype: int64

In [37]:
#balance the data
ham = data[data['label'] == 'ham']
ham = ham.sample(747) # balace ham and spam == 747
ham.shape

(747, 4)

In [38]:
spam = data[data['label'] == 'spam']
spam.shape

(747, 4)

In [41]:
data = ham.append(spam, ignore_index=True)
data.shape

(1494, 4)

In [72]:
X = data.message
Y = data.label
X.shape, Y.shape

((1494,), (1494,))

In [0]:
#Split the data into Training and Testing

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size= 0.3, stratify = data.label,random_state = 0)

In [0]:
#Initialize a Tfid or CountVectorizer

vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)

In [78]:
# Initialize the classifier 

clf = RandomForestClassifier(n_estimators= 150)
clf.fit(x_train, y_train) #Training the model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
# Predict (Test) by using test set
x_test = vectorizer.transform(x_test) #The problem here is we should transform the values, we can overcome this by pipeline
y_pred = clf.predict(x_test)

In [62]:
confusion_matrix(y_pred, y_test)

array([[225,  26],
       [  0, 198]])

In [48]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         ham       1.00      0.91      0.95       165
        spam       0.90      1.00      0.95       134

    accuracy                           0.95       299
   macro avg       0.95      0.95      0.95       299
weighted avg       0.95      0.95      0.95       299



In [64]:
#custom message

msg = "Congratulation you have won free mobile and free laptop"
msg = vectorizer.transform([msg]) #The problem here is we should transform the values, we can overcome this by pipeline
clf.predict(msg)

array(['spam'], dtype=object)

In [0]:
#Pipeline model,  used Random Forest
# for testing(x_test) and for custom messages it can directly first vectorize and then classify it

clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', SVC(C= 1000, gamma='auto'))])

In [0]:
#Split the data into Training and Testing

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size= 0.3, stratify = data.label,random_state = 0)

In [95]:
clf.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1000, break_ties=False, cache_size=200,
                     class_weight=None, coef0=0.0,
                     decision_fun

In [0]:
y_pred = clf.predict(x_test)

In [97]:
confusion_matrix(y_pred, y_test)

array([[219,  19],
       [  6, 205]])

In [98]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.92      0.97      0.95       225
        spam       0.97      0.92      0.94       224

    accuracy                           0.94       449
   macro avg       0.95      0.94      0.94       449
weighted avg       0.95      0.94      0.94       449



In [105]:
#custom message

msg = "congratulations you have won free gifts"
# msg = vectorizer.transform([msg]) # not needed if using pipeline
clf.predict([msg])

array(['spam'], dtype=object)