In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

**Data Collection and Preprocessing**

In [None]:
# Load the raw mail dataset; the path is resolved relative to the working directory.
MAIL_DATA_CSV = 'mail_data.csv'
df_mail = pd.read_csv(MAIL_DATA_CSV)

In [None]:
# Preview the first five rows of the raw frame.
df_mail.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Replace every missing cell with an empty string; the result is a new frame,
# so df_mail keeps the raw values.
has_value = df_mail.notna()
mail_data = df_mail.where(has_value, other='')

In [None]:
# Preview the first five rows after null replacement.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (number of rows, number of columns) of the cleaned frame
mail_data.shape

(5572, 2)

Label Encoding the category column

In [None]:
# Label-encode the Category column: spam -> 0, ham -> 1.
#
# The encoded column is built in one pass and assigned back as a whole column rather
# than writing integers into the existing string column through .loc. Element-wise
# assignment leaves an object-dtype column on older pandas and raises TypeError when
# the column uses the pandas 3 `str` dtype, which only accepts strings.
#
# Values that are not in the mapping are passed through unchanged (as the original
# .loc assignments did), so re-running this cell after encoding is a no-op.
category_codes = {'spam': 0, 'ham': 1}
mail_data['Category'] = mail_data['Category'].map(
    lambda label: category_codes.get(label, label)
)

In [None]:
# Separate the frame into model input (message text) and target (encoded label).
x, y = mail_data['Message'], mail_data['Category']

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Print the shape of the full message set, then the training and test splits.
for messages in (x, X_train, X_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


Feature Extraction


In [None]:
# Turn the raw message text into TF-IDF feature vectors that logistic regression can consume.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only ...
X_train_features = feature_extraction.fit_transform(X_train)
# ... and merely transform the test messages, so nothing is learned from the test data.
X_test_features = feature_extraction.transform(X_test)

# Cast both label series to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')


In [None]:
# Print the training feature matrix; each output line reads "(row, column)  weight"
print(X_train_features)

  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537024
  (1, 2758)	0.3226407885943799
  (1, 1839)	0.2784903590561455
  (1, 918)	0.22871581159877646
  (1, 2746)	0.3398297002864083
  (1, 2957)	0.3398297002864083
  (1, 3325)	0.31610586766078863
  (1, 3185)	0.29694482957694585
  (1, 4080)	0.18880584110891163
  (2, 6601)	0.6056811524587518
  (2, 2404)	0.45287711070606745
  (2, 3156)	0.4107239318312698
  (2, 407)	0.509272536051008
  (3, 7414)	0.8100020912469564
  (3, 2870)	0.5864269879324768
  (4, 2870)	0.41872147309323743
  (4, 487)	0.2899118421746198
  :	:
  (4454, 2855)	0.47210665083641806
  (4454, 2246)	0.47210665083641806
  (4455, 4456)	0.24920025316220423
  (4455, 3922)	0.31287563163368587
  (4455, 6916)	0.19636985317119715
  (4455, 4715)	0.30714144758811196
  (

Logistic Regression Model

In [None]:
# Logistic regression classifier, constructed without arguments (library defaults)
model = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF training features and their integer labels.
model.fit(X=X_train_features, y=Y_train)

**Model Evaluation**

In [None]:
# Score the model on the same messages it was trained on.
y_pred = model.predict(X_train_features)
acc_train_data = accuracy_score(y_true=Y_train, y_pred=y_pred)

In [None]:
# Report the accuracy stored in acc_train_data
print("Accuracy score on training data:", acc_train_data)

Accuracy score on training data: 0.9676912721561588


In [None]:
# Score the model on the held-out test messages.
y_pred_test = model.predict(X_test_features)
acc_test_data = accuracy_score(y_true=Y_test, y_pred=y_pred_test)

In [None]:
# Report the accuracy stored in acc_test_data
print("Accuracy score on test data:", acc_test_data)

Accuracy score on test data: 0.9668161434977578


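The model reaches about 96.8% accuracy on the training data and about 96.7% on the test data. Because the two scores are nearly identical, the model is not overfitting and generalizes well to unseen messages.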

**Building a predictive system**

In [None]:
# Sample message to classify, wrapped in a list because it is passed to transform() as a batch of one.
input_mail = ["XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL"]

# Vectorize the message with the already-fitted TF-IDF vectorizer, then classify it.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; any other label is reported as spam.
verdict = 'The mail is Ham' if prediction[0] == 1 else 'The mail is Spam'
print(verdict)

[0]
The mail is Spam
