In [1]:
import pandas as pd
import numpy as np


In [2]:
#Importing Logistic Regression Model [Since only two categories (Binary)] we use Logisitc Regression
from sklearn.linear_model import LogisticRegression

In [3]:
#Importing Accuracy Score Function
from sklearn.metrics import accuracy_score


In [4]:
#Importing Train test split function
from sklearn.model_selection import train_test_split

In [5]:
# To convert all the strings into numeric formats
# For the machine to understand
from sklearn.feature_extraction.text import TfidfVectorizer


In [6]:
# Importing the Dataset
raw_mail_data = pd.read_csv("mail_data.csv")
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
#Checking the Null values
raw_mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [8]:
#replacing all the null values with the null string
# Even spaces are null values hence those are converted
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [9]:
#Printing the first 5 rows
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
#Printing the last 5 rows
raw_mail_data.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [11]:
# No of rows & columns of the dataset
raw_mail_data.shape

(5572, 2)

In [12]:
# Getting the info of the dataset
raw_mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [13]:
#Statistial measure of the data
raw_mail_data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [14]:
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [15]:
#label spam email as 0 and ham email as 1
# For the macine to understan what is ham & spam
raw_mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 1
raw_mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 0


In [16]:
#find the total no ham and spam email
mail_data['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [17]:
#separating the data as input feature and target column
X = mail_data['Message'] #input feature
Y = mail_data['Category'] #target
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [18]:
print(Y)
# For ham = 1
# For spam = 0

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5572, dtype: object


In [19]:
#spliting the data into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 2)


In [20]:
#transforming the text data to feature vectors that can be used as input in logistic regression
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)
# min_df - minimun document frequency
# stop_words - english words

In [21]:
X_train_num = feature_extraction.fit_transform(X_train)

In [22]:
X_test_num = feature_extraction.transform(X_test) # Since not ftting data in X_test_num not used fit

In [23]:
#@title  Feature Extraction
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

X_train_num = feature_extraction.fit_transform(X_train)

X_test_num = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')

Y_test = Y_test.astype('int')


ValueError: invalid literal for int() with base 10: 'ham'

In [24]:
#@title Logistic Regression
model = LogisticRegression()

In [25]:
#@title Training the Model
model = LogisticRegression()

In [26]:
# Training the Logistic regression model with the training data
model.fit(X_train_num , Y_train)

In [27]:
#@title Evaluating the Training Model

In [28]:
#Prediction on training data
X_train_prediction = model.predict(X_train_num)

In [29]:
#Accuracy score for training data
training_data_accuracy = accuracy_score(Y_train , X_train_prediction)
print("Accuracy Score of training data ",training_data_accuracy)

Accuracy Score of training data  0.9683643706529056


In [30]:
X_test_prediction = model.predict(X_test_num)


In [31]:
#Accuracy score for testing data
testing_data_accuracy = accuracy_score(Y_test , X_test_prediction)
print("Accuracy Score of testing data ",testing_data_accuracy)

Accuracy Score of testing data  0.9524663677130045


In [32]:
#@title Building a predictive System
#Since accuracy score is > 90

In [33]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# convert text to feature vectors

input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)

print(prediction)

if (prediction == 1):

  print('Ham mail')

else:

  print('Spam mail')






['ham']
Spam mail


In [34]:
input_mail = ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]

# convert text to feature vectors

input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)

print(prediction)



if (prediction[0]==1):

  print('Ham mail')

else:

  print('Spam mail')






['spam']
Spam mail


In [35]:
input_data = ['Ahhh. Work. I vaguely remember that! What does it feel like? Lol']
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshape = input_data_as_numpy_array.reshape(1,-1)


In [36]:
# Drafted own message to see the output
# You can give any data & it'll give the output
# it recognizes as spam only if it contains something as free free free or currency or jackpot in short spam words
input_mail = ["Hello sir , You are selected for free trial of Car & if you are lucky enough you can win a car also for free free free "]

# convert text to feature vectors

input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)

if (prediction[0]==1):
  print('Ham mail')
else:
  print('Spam mail')

['spam']
Spam mail


In [None]:
input_mail = ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]

# convert text to feature vectors

input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)

print(prediction)



if (prediction[0]==1):

  print('Ham mail')

else:

  print('Spam mail')




