In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text data to numerical values so ML model can understand it
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the labelled messages from the CSV file into a DataFrame and preview them
mail_data_path = '/content/mail_data.csv'
raw_mail_data = pd.read_csv(mail_data_path)
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (number of rows, number of columns) of the raw dataset
raw_mail_data.shape

(5572, 2)

In [None]:
# Replace any missing (null) entries with empty strings so every cell holds a value
has_value = raw_mail_data.notna()
mail_data = raw_mail_data.where(has_value, '')

In [None]:
# Preview the first five rows after null replacement
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# NOTE(review): same preview as the previous cell; nothing has changed in between, so this cell is redundant
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Label Encoding

In [None]:
# Encode the target column in place: spam -> 0, ham -> 1
is_spam = mail_data['Category'] == 'spam'
mail_data.loc[is_spam, 'Category'] = 0

is_ham = mail_data['Category'] == 'ham'
mail_data.loc[is_ham, 'Category'] = 1

In [None]:
# Check the encoding: Category should now hold 0 (spam) or 1 (ham)
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


spam-0

ham-1

In [None]:
# Separate the data into message text (X) and encoded labels (Y)
X, Y = mail_data['Message'], mail_data['Category']

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Feature Extraction
# Converting text data to meaningful numerical values

# Transforming text data into TF-IDF feature vectors which can be used as input for the logistic regression model

feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# min_df=1 --> minimum document frequency: a term is dropped only if it appears in fewer than
#              1 document, so with this value every term seen in the training text is kept.
# stop_words='english' --> drop common English words that carry little meaning, like is, the, are etc.
# lowercase=True --> convert all text to lowercase before tokenizing

In [None]:
# Learn the vocabulary and IDF weights from the training messages only, then reuse
# that fitted vectorizer on the test messages so no test information leaks into it
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The Category column was encoded in place, so it may still carry a non-integer
# (object) dtype; cast both label series to integers for the classifier
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# Inspect the training feature matrix (sparse; one row per training message, one column per vocabulary term)
X_train_features

<4457x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 34775 stored elements in Compressed Sparse Row format>

In [None]:
# Training the model
# Fit a logistic regression classifier (default hyperparameters) on the TF-IDF training features
model = LogisticRegression()
model.fit(X_train_features,Y_train)

In [None]:
# Evaluating the model
# On training data
train_pred = model.predict(X_train_features)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value is
# unchanged, but the canonical order keeps this correct if the metric is later
# swapped for an asymmetric one such as precision or recall.
accuracy_on_training = accuracy_score(Y_train, train_pred)
print("Accuracy on Training Data: ",accuracy_on_training)

Accuracy on Training Data:  0.9670181736594121


In [None]:
# On testing data (messages the model has not seen during training)
test_pred = model.predict(X_test_features)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value is
# unchanged, but the canonical order keeps this correct if the metric is later
# swapped for an asymmetric one such as precision or recall.
accuracy_on_test = accuracy_score(Y_test, test_pred)
print("Accuracy on Testing Data: ",accuracy_on_test)

Accuracy on Testing Data:  0.9659192825112107


In [None]:
# Classify a single example message: vectorize it with the already-fitted TF-IDF
# vectorizer (transform only, no refit), then predict with the trained model
input_mail=["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]
input_features = feature_extraction.transform(input_mail)

# Labels follow the encoding applied to the Category column: 0 = spam, 1 = ham
prediction = model.predict(input_features)
print(prediction)

[0]
