#### Import dependecies

In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Load the dataset

In [90]:
df = pd.read_csv("mail_data.csv")

In [91]:
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Data Pre-processing to remove NA values and outliers

In [92]:
### Since the input data is a text data, we need to replace NA values with "NULL" string or an empty string
df_mail_data = df.where((pd.notnull(df)), '')

In [93]:
df_mail_data.shape

(5572, 2)

## Label Encoding - Changing all the text to numeric values

In [94]:
## Label category column to numeric like for "ham" == 1, "spam" == 0 
df_mail_data.loc[df_mail_data['Category'] == "spam", 'Category', ] = 0
df_mail_data.loc[df_mail_data['Category'] == "ham", 'Category', ] = 1

In [95]:
df_mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


### Spilt the dataset to features and target whereas features correspond to values for training the model and target values corresponds to the labels

In [96]:
x = df_mail_data['Message']

y = df_mail_data['Category']

#### Train_Test_Split

In [97]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=3) #random_state=3 is always to split the data in the same way

In [98]:
display(x.shape, X_train.shape, y.shape, Y_train.shape)

(5572,)

(4457,)

(5572,)

(4457,)

#### Transform the text data to feature vectors that can be used as an input to the Logistic Regression model

In [99]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase='true')

In [100]:
feature_extraction

TfidfVectorizer(lowercase='true', stop_words='english')

In [101]:
x_train_features = feature_extraction.fit_transform(X_train)
x_test_features = feature_extraction.transform(X_test)

In [102]:
print(x_test_features.size)

7687


In [103]:
print(x_test_features)

  (0, 7271)	0.1940327008179069
  (0, 6920)	0.20571591693537986
  (0, 5373)	0.2365698724638063
  (0, 5213)	0.1988547357502182
  (0, 4386)	0.18353336340308998
  (0, 1549)	0.2646498848307188
  (0, 1405)	0.3176863938914351
  (0, 1361)	0.25132445289897426
  (0, 1082)	0.2451068436245027
  (0, 1041)	0.28016206931555726
  (0, 405)	0.2381316303003606
  (0, 306)	0.23975986557206702
  (0, 20)	0.30668032384591537
  (0, 14)	0.26797874471323896
  (0, 9)	0.2852706805264544
  (0, 1)	0.2381316303003606
  (1, 7368)	0.29957800964520975
  (1, 6732)	0.42473488678029325
  (1, 6588)	0.3298937975962767
  (1, 6507)	0.26731535902873493
  (1, 6214)	0.3621564482127515
  (1, 4729)	0.22965776503163893
  (1, 4418)	0.3457696891316818
  (1, 3491)	0.496093956101028
  (2, 7205)	0.22341717215670331
  :	:
  (1110, 3167)	0.5718357066163949
  (1111, 7353)	0.4991205841293424
  (1111, 6787)	0.40050175714278885
  (1111, 6033)	0.4714849709283488
  (1111, 3227)	0.44384935772735523
  (1111, 2440)	0.4137350055985486
  (1112, 7071)

#### it is important to have Y_train and Y_test in labels in integer forms, so check the datatype, otherwise typecast them to integers

In [104]:
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

### Train the ML Model - Logistic Regression

In [105]:
## Call the instance of a logistic regression function
model = LogisticRegression()

In [106]:
## Train the model using training dataset
model.fit(x_train_features, Y_train)

LogisticRegression()

## Evaluating the trained model

In [107]:
## Prediction on the training data

prediction_on_tarining_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_tarining_data)

In [108]:
display(f"Accuracy on training data : {accuracy_on_training_data}")

'Accuracy on training data : 0.9670181736594121'

In [109]:
## Prediction on the test data

prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [110]:
display(f"Accuracy on test data : {accuracy_on_test_data}")

'Accuracy on test data : 0.9659192825112107'

## Building a predictive system

In [141]:
### Select a random input

input_data_new = ["Your message couldn't be delivered to dev.admissiongyan@gmail.com. Their inbox is full, or it's getting too much mail right now."]

In [142]:
## Convert text to feature vectors
input_data_features_new = feature_extraction.transform(input_data_new)

In [143]:
#print(input_data_features)

In [144]:
prediction = model.predict(input_data_features_new)

print(prediction)


if (prediction[0] == 1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail
