In [1]:
#import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
%matplotlib inline

In [2]:
# Load the raw SMS dataset, decoding the file as ISO-8859-1.
train_data = pd.read_csv(
    "train.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Check column names, dtypes and non-null counts of the freshly loaded frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
v1            5572 non-null object
v2            5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 217.7+ KB


In [4]:
# Peek at the first rows to see what the raw columns contain.
train_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


The `v1` and `v2` columns have no null values. The three `Unnamed` columns, however, are almost entirely null (at most 50 of 5572 entries are populated) and are of no use here, so let's drop them.

In [5]:
def drop_features(features, data):
    """Remove the given columns from ``data`` in place.

    Args:
        features: Column label, or list of column labels, to remove.
        data: DataFrame to modify; the object passed in is mutated directly.

    Returns:
        None. The result is visible through the caller's own reference to ``data``.
    """
    data.drop(columns=features, inplace=True)

In [6]:
# Remove the three unnamed columns from train_data (the helper mutates the frame in place).
drop_features(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],train_data)

In [7]:
# Confirm that only `v1` and `v2` remain after the drop.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
v1    5572 non-null object
v2    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [8]:
# Give the two remaining columns descriptive names.
train_data = train_data.rename(
    columns={
        "v1": "target_label",
        "v2": "sms_text",
    }
)

In [9]:
# Verify that the columns now carry the new names.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
target_label    5572 non-null object
sms_text        5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [10]:
# Class balance: number of messages carrying each label.
train_data['target_label'].value_counts()

ham     4825
spam     747
Name: target_label, dtype: int64

Let's convert the categorical target label into a numerical one: replace 'ham' with 0 and 'spam' with 1.

In [11]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
train_data['numerical_target_label'] = train_data['target_label'].map({'ham': 0, 'spam': 1})
# Series.map yields NaN for any value missing from the dict. Fail loudly here,
# because the original `target_label` column is dropped later in the notebook and
# a silently incomplete mapping could no longer be repaired after that.
assert train_data['numerical_target_label'].notna().all(), \
    "target_label contains values other than 'ham'/'spam'"

In [12]:
# The string label is redundant now that the numeric encoding exists; remove it in place.
drop_features(['target_label'],train_data)

In [13]:
# Inspect the frame after encoding: message text plus the 0/1 target.
train_data.head()

Unnamed: 0,sms_text,numerical_target_label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train_data["sms_text"],
    train_data["numerical_target_label"],
    test_size=0.2,
    random_state=42,
)

Text preprocessing, tokenizing and filtering of stop words are all included in a single high-level component, `CountVectorizer`,
which builds a dictionary of features and transforms documents into feature vectors.

CountVectorizer supports counts of N-grams of words or consecutive characters. Once fitted, the vectorizer has built a dictionary of feature indices.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer, constructed with the library's default settings (no arguments passed).
count_vect = CountVectorizer()

In [17]:
# Learn the vocabulary from the training messages only and turn them into a
# document-by-term count matrix; the shape is (n_training_messages, vocabulary_size).
x_train_counts = count_vect.fit_transform(x_train)
x_train_counts.shape

(4457, 7735)

From occurrences to frequencies

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

In [19]:
# Convert raw counts to term frequencies (IDF weighting is switched off).
# The transformer is fitted on the training counts and kept for reuse on the test set.
tf_transformer = TfidfTransformer(use_idf=False).fit(x_train_counts)
x_train_tf = tf_transformer.transform(x_train_counts)
x_train_tf.shape

(4457, 7735)

In [20]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier, constructed without arguments, on the
# term-frequency features. `model.fit(...)` is left as the last expression so the
# cell displays the fitted estimator together with the settings it used.
model = LogisticRegression()
model.fit(x_train_tf,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Preparing the test data

In [21]:
# Reuse the vectorizer fitted on the training split (transform only, no refit) so the
# test matrix has the same columns as the training matrix.
x_test_counts = count_vect.transform(x_test)
x_test_counts.shape

(1115, 7735)

In [22]:
# Apply the already-fitted term-frequency transformer to the test counts.
x_test_tf = tf_transformer.transform(x_test_counts)
x_test_tf.shape

(1115, 7735)

In [23]:
# Predict a 0 (ham) / 1 (spam) label for every held-out message.
predictions = model.predict(x_test_tf)

Let's see how good our predictions are.

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [25]:
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

0.96771300448430497

In [26]:
# Break the results down by true label versus predicted label.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[964,   1],
       [ 35, 115]], dtype=int64)

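The confusion matrix above (rows are true labels, columns are predicted labels) shows that we classified 964 + 115 = 1079 of the 1115 texts correctly.
Moreover, only 1 legitimate (ham) message (~0.1% of the 965 ham messages) was wrongly flagged as spam. Note, however, that 35 of the 150 spam messages (~23%) were misclassified as ham.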