In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the labelled messages from the CSV file into a DataFrame.
DATA_FILE = "mail_data.csv"
df = pd.read_csv(DATA_FILE)

In [3]:
df.shape

(5572, 2)

In [4]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [5]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [7]:
# The null check above found no missing values, but if there were any, this is
# how they would be handled: every null cell is replaced with an empty string.

df = df.where(df.notna(), '')

In [8]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Label Encoding

In [9]:
# Encode the target column as integers: spam -> 0, ham -> 1.
labels = {"spam":0,"ham":1}

# Values that are not keys of `labels` pass through unchanged. A plain
# .map(labels) turns them into NaN, which silently wipes the whole column when
# this cell is run a second time on already-encoded data.
df["Category"] = df["Category"].map(lambda value: labels.get(value, value))

# Stop here if anything other than the two encoded labels is left over, rather
# than letting an unmapped category reach the model.
unexpected_categories = df.loc[~df["Category"].isin(list(labels.values())), "Category"].unique()
if len(unexpected_categories) > 0:
    raise ValueError(f"Unexpected values in 'Category': {unexpected_categories.tolist()}")

In [10]:
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Separate the message text (input) from the encoded category (target).
X, Y = df["Message"], df["Category"]

In [12]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [13]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int64


# Splitting the data

In [14]:
# Hold out 20% of the messages for testing. The split is stratified on Y and
# uses a fixed random_state so it is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=3,
)

In [15]:
print(X.shape,X_train.shape,X_test.shape)

(5572,) (4457,) (1115,)


# Feature Extraction

In [18]:
# TF-IDF vectorizer that turns the raw message text into numeric feature
# vectors usable as input to the logistic regression model.

feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

In [20]:
# Fit the vectorizer on the training messages only; the test messages are just
# transformed with the already-fitted vectorizer (it is never fit on test data).
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label series to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [21]:
print(X_train)

3501    Dorothy@kiefer.com (Bank of Granite issues Str...
617     He like not v shock leh. Cos telling shuhui is...
475     Nice line said by a broken heart- Plz don't cu...
5535    I know you are thinkin malaria. But relax, chi...
4747           Orh i tot u say she now still dun believe.
                              ...                        
4402         Many times we lose our best ones bcoz we are
3615                                         Ok c ü then.
4763                      Me too! Have a lovely night xxx
4339                          Yes when is the appt again?
1827    Dude. What's up. How Teresa. Hope you have bee...
Name: Message, Length: 4457, dtype: object


In [22]:
print(X_train_features)

  (0, 0)	0.23628394623676158
  (0, 1657)	0.28101404009316056
  (0, 6468)	0.26793132631329497
  (0, 4557)	0.28101404009316056
  (0, 421)	0.25144905621529934
  (0, 4306)	0.26793132631329497
  (0, 5029)	0.17467075796896542
  (0, 2644)	0.28101404009316056
  (0, 1540)	0.17407870571957915
  (0, 6330)	0.24059246244542992
  (0, 3627)	0.25144905621529934
  (0, 3113)	0.28101404009316056
  (0, 1193)	0.22908400928709988
  (0, 1857)	0.17073786814794129
  (0, 3806)	0.28101404009316056
  (0, 2353)	0.28101404009316056
  (1, 4076)	0.1543395674723974
  (1, 5416)	0.28967873139399253
  (1, 6977)	0.1293522168838017
  (1, 765)	0.21147006367289747
  (1, 1068)	0.1771111381363262
  (1, 3089)	0.13752009582621935
  (1, 3961)	0.20073435617244362
  (1, 3828)	0.13684128003316173
  (1, 2113)	0.19851614641109666
  :	:
  (4456, 3913)	0.23883125341667502
  (4456, 2138)	0.23883125341667502
  (4456, 6568)	0.23883125341667502
  (4456, 6646)	0.22771237505351186
  (4456, 5447)	0.2198234053076842
  (4456, 4878)	0.21370424497

# Training the model

In [23]:
# Logistic regression classifier created with no arguments, i.e. with the
# library's default settings.
model = LogisticRegression()

In [24]:
# Train the logistic regression model on the TF-IDF features of the training set.
model.fit(X=X_train_features, y=Y_train)

# Evaluating the trained model

In [26]:
# prediction on training data

X_train_prediction = model.predict(X_train_features)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy itself is the same either way, but keeping the documented
# order avoids wrong results if this call is swapped for an asymmetric metric.
X_train_accuracy = accuracy_score(Y_train, X_train_prediction)

print("Accuracy of model on training data is",X_train_accuracy*100,"%")

Accuracy of model on training data is 96.70181736594121 %


In [27]:
# prediction on test data

X_test_prediction = model.predict(X_test_features)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy itself is the same either way, but keeping the documented
# order avoids wrong results if this call is swapped for an asymmetric metric.
X_test_accuracy = accuracy_score(Y_test, X_test_prediction)

print("Accuracy of model on test data is",X_test_accuracy*100,"%")

Accuracy of model on test data is 97.21973094170404 %


# Making Predictions

In [29]:
def predictmodel(input_data, vectorizer=None, classifier=None):
    """Classify one or more messages and print a verdict for each.

    Parameters
    ----------
    input_data : str or iterable of str
        A single message, or a collection of messages. A bare string is
        treated as one message.
    vectorizer : object with a ``transform`` method, optional
        Fitted text vectorizer. Defaults to the notebook-level
        ``feature_extraction``.
    classifier : object with a ``predict`` method, optional
        Fitted classifier. Defaults to the notebook-level ``model``.

    Returns
    -------
    None
        One line is printed per message, in input order:
        ``"Not a spam,this is HAM"`` when the predicted label is 1,
        ``"SPAM"`` otherwise. Nothing is printed for empty input.
    """
    if vectorizer is None:
        vectorizer = feature_extraction
    if classifier is None:
        classifier = model

    # A lone string would otherwise be rejected by the vectorizer, which
    # expects an iterable of documents.
    messages = [input_data] if isinstance(input_data, str) else list(input_data)
    if not messages:
        return

    # converting the messages to feature vectors
    input_data_features = vectorizer.transform(messages)

    # make predictions
    prediction = classifier.predict(input_data_features)

    # Check each label on its own: comparing the whole prediction array inside
    # an `if` raises ValueError as soon as more than one message is passed.
    for label in prediction:
        if label == 1:
            print("Not a spam,this is HAM")
        else:
            print("SPAM")

In [30]:
predictmodel(["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."])

Not a spam,this is HAM


In [31]:
predictmodel(["URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"])

SPAM
