In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [46]:
# Load the labelled messages from a CSV file; the relative path is resolved against
# the notebook's current working directory.
df=pd.read_csv("mail_data.csv")

In [47]:
# Preview the first five rows of the raw frame.
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [48]:
# (rows, columns) of the raw frame.
df.shape

(5572, 2)

In [49]:
# Build a cleaned copy of the raw frame: cells that hold a value are kept,
# missing cells are replaced by an empty string. `df` itself is left untouched.
has_value = df.notna()
data = df.where(has_value, other='')

In [50]:
# Preview the first rows of the cleaned frame (head() defaults to five rows).
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [51]:
# Count the non-null entries per column of the raw frame `df` (not the cleaned `data`).
pd.notnull(df).sum()

Category    5572
Message     5572
dtype: int64

In [52]:
# Summarise the cleaned frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [53]:
# (rows, columns) of the cleaned frame.
data.shape

(5572, 2)

In [54]:
# Encode the text labels in place: spam -> 0, ham -> 1.
# Rows whose Category is neither 'spam' nor 'ham' are left as they are.
is_spam = data['Category'] == 'spam'
is_ham = data['Category'] == 'ham'
data.loc[is_spam, 'Category'] = 0
data.loc[is_ham, 'Category'] = 1

In [55]:
# Split the frame into model input (message text) and target (encoded category).
x, y = data['Message'], data['Category']

In [56]:
# Display the message-text series that serves as the model input.
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [57]:
# Display the target series. Label encoding used in this notebook:
# 0 means spam and 1 means not spam (ham).
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [58]:
# test_size=0.3 holds out 30% of the rows for testing and trains on the remaining 70%;
# random_state=3 fixes the shuffle so the same split is produced on every run.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=3)

In [59]:
# Sanity check: the train and test row counts should add up to the full input.
x.shape,x_train.shape,x_test.shape

((5572,), (3900,), (1672,))

In [60]:
# Same check for the labels; the row counts should match those of x, x_train and x_test.
y.shape,y_train.shape,y_test.shape

((5572,), (3900,), (1672,))

In [76]:
# Turn the raw message text into TF-IDF vectors. The vectoriser is fitted on the
# training messages only; the test messages are then transformed with that same
# fitted vectoriser, so nothing from the test split is used during fitting.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# Make sure both label splits carry an integer dtype before they reach the model.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [None]:
# min_df: Minimum number of documents a word must appear in to be kept in the vocabulary. An integer is an absolute document count; a float in [0.0, 1.0] is a proportion of all documents.

# Example: min_df=1 means the word must appear in at least one document.

# stop_words: You can specify whether to remove common words like "the", "is", etc. by passing 'english' or providing a custom list.

# Example: stop_words='english' removes English stopwords.

# lowercase: If True, all characters are converted to lowercase before tokenization (default is True)

In [77]:
# Display the training labels after the integer cast.
y_train

1455    1
3460    0
2493    1
3378    1
3826    1
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 3900, dtype: int64

In [110]:
# print(x_train_features)

In [80]:
# Logistic-regression classifier; no arguments are passed, so scikit-learn's
# default hyperparameters apply.
model=LogisticRegression()

In [81]:
# Fit the classifier on the TF-IDF features and integer labels of the training split.
model.fit(x_train_features,y_train)

In [82]:
# Score the model on the same rows it was fitted on. This is an optimistic figure
# and is only meaningful when compared with the accuracy on the held-out split.
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [83]:
# Report the fraction of training messages that were classified correctly.
print("Accuracy on trainig data:",accuracy_on_training_data)

Accuracy on trainig data: 0.9661538461538461


In [84]:
# Both matrices should report the same number of columns, because the test split
# was transformed with the vectoriser that was fitted on the training split.
print("Shape of x_train_features:", x_train_features.shape)
print("Shape of x_test_features:", x_test_features.shape)

Shape of x_train_features: (3900, 6896)
Shape of x_test_features: (1672, 6896)


In [87]:
# Score the model on the held-out split, i.e. messages it was not fitted on.
prediction_on_testing_data = model.predict(x_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_testing_data,
)

In [91]:
# Report the fraction of held-out test messages that were classified correctly.
print("accuracy_on_testing_data:",accuracy_on_testing_data) 

accuracy_on_testing_data: 0.9647129186602871


In [113]:
# Classify new, unseen messages with the fitted vectoriser and model.
# `input_your_mail` is a list, so several messages can be checked in one run.
# Alternative example to try:
# input_your_mail = ["Congratulations! You've Won a $1000 gift card from walmart.Go to https://bit.ly to claim your reward."]
input_your_mail=["Hey Alex, just wanted to let you know tomorrow is an off. Thank you"]

# Only transform here (never re-fit): the messages must be encoded with the
# vectoriser that was fitted on the training split.
input_data_features = feature_extraction.transform(input_your_mail)

prediction = model.predict(input_data_features)
print(prediction)

# Label encoding used in this notebook: 1 = ham (not spam), 0 = spam.
# Report a verdict for every message instead of only the first one, so extra
# entries in `input_your_mail` are not silently ignored. The verdicts are printed
# in the same order as the messages.
for predicted_label in prediction:
    if predicted_label == 1:
        print("Ham Mail") # Not Spam
    else:
        print("Spam mail")

[1]
Ham Mail
