Importing the Dependencies

In [169]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data Collection

In [170]:
# Path of the mail dataset CSV (presumably a mounted Google Drive in Colab --
# adjust when running elsewhere).
mail_data_path = '/content/drive/MyDrive/Copy of mail_data.csv'
mail_dataset = pd.read_csv(mail_data_path)

In [171]:
# Dimensions of the loaded DataFrame as (rows, columns).
mail_dataset.shape

(5572, 2)

In [172]:
# Peek at the first rows to check the column layout.
mail_dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Checking for Null Values

In [173]:
# Number of missing values in each column.
null_counts = mail_dataset.isnull().sum()
print(null_counts)

Category    0
Message     0
dtype: int64


Label Encoding


*   Replacing the textual values in the *Category* column with numerical values.







In [174]:
# Count how many messages fall into each category (class balance).
mail_dataset['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [175]:
# Encode the labels in place: 'spam' -> 0, 'ham' -> 1.
# Both row masks are built from the original text labels before any value is
# overwritten.
spam_rows = mail_dataset['Category'] == 'spam'
ham_rows = mail_dataset['Category'] == 'ham'

mail_dataset.loc[spam_rows, 'Category'] = 0
mail_dataset.loc[ham_rows, 'Category'] = 1

In [176]:
# Confirm that the Category column now holds the numeric codes.
mail_dataset.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [177]:
# Re-check the class counts after encoding; they should match the counts before it.
mail_dataset['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
1,4825
0,747


Separating Features and Labels

In [178]:
# Features are the message text; labels are the encoded category.
x, y = mail_dataset['Message'], mail_dataset['Category']

In [179]:
# Inspect the feature series (raw message text).
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [180]:
# Inspect the label series (encoded categories).
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


#Train-Test Split

In [181]:
# Hold out 20% of the messages for testing. stratify=y splits in a stratified
# fashion on the labels, and random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [182]:
# Sizes of the full, training, and test message series.
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457,) (1115,)


#Feature Extraction

In [183]:
# Convert the message text into numerical TF-IDF feature vectors so it can be
# fed to the Logistic Regression model. Only the features (messages) are
# vectorized; the labels are left as they are.
#
# min_df=1: keep every term that occurs in at least one document, i.e. no term
#   is discarded for being rare (1 is also the scikit-learn default).
# stop_words='english': drop words from scikit-learn's built-in English
#   stop-word list (very common words such as "is", "are", "not").
# lowercase=True: convert all text to lowercase before tokenizing, so that
#   "Free" and "free" map to the same feature.
vectorizer = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

In [184]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer to transform the test messages.
x_train_features = vectorizer.fit_transform(x_train)
x_test_features = vectorizer.transform(x_test)

In [185]:
# The label series still has dtype object at this point, so cast both splits
# to integers before using them with the model.
y_train, y_test = y_train.astype('int'), y_test.astype('int')


In [186]:
# Show the TF-IDF training matrix (sparse; printed as coordinate/value pairs).
print(x_train_features)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34895 stored elements and shape (4457, 7496)>
  Coords	Values
  (0, 4768)	0.2885879313347367
  (0, 7438)	0.2996693624522654
  (0, 2262)	0.49316930861935127
  (0, 3764)	0.22046319970004669
  (0, 2823)	0.5172500796081709
  (0, 7289)	0.5172500796081709
  (1, 3317)	0.3290434493347565
  (1, 4972)	0.49481520325330874
  (1, 1558)	0.42364007209989546
  (1, 6517)	0.49481520325330874
  (1, 4136)	0.4717788963273523
  (2, 3103)	0.17628376831968728
  (2, 841)	0.26799944639874834
  (2, 4099)	0.186263215205624
  (2, 3086)	0.27449720225122765
  (2, 2136)	0.180851695270251
  (2, 3398)	0.20665621299033204
  (2, 4269)	0.2543939099135892
  (2, 3118)	0.18009671431232455
  (2, 3935)	0.3671145612703168
  (2, 3722)	0.24768901862403342
  (2, 6641)	0.20096909705626312
  (2, 1430)	0.28509060215711635
  (2, 5837)	0.1845655907506494
  (2, 4943)	0.33789703751914013
  :	:
  (4454, 841)	0.21705430485365426
  (4454, 3514)	0.17954863693268575
  (4454, 7163)	

In [187]:
# Training labels after the integer cast.
print(y_train)

5426    1
4724    1
536     1
3488    1
2551    1
       ..
1697    1
422     0
4007    1
3474    1
3074    1
Name: Category, Length: 4457, dtype: int64


#Training the Model

Logistic Regression

In [188]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [189]:
# Fit the classifier on the TF-IDF training features and their integer labels.
model.fit(x_train_features, y_train)

#Evaluating the Model

In [190]:
# Predict on the training data (the same samples the model was fitted on).
prediction_on_training_data = model.predict(x_train_features)

In [191]:
# Fraction of training labels that were predicted correctly.
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [192]:
# Report the training accuracy computed in the previous cell.
print('Accuracy on training data:', accuracy_on_training_data)

Accuracy on training data: 0.9672425398249944


In [193]:
# Predict on the held-out test data.
prediction_on_test_data = model.predict(x_test_features)

In [194]:
# Fraction of test labels that were predicted correctly.
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

In [195]:
# Report the test accuracy computed in the previous cell.
print('Accuracy on test data:', accuracy_on_test_data)

Accuracy on test data: 0.9704035874439462


#Building a Predictive System

In [196]:
# Message to classify. The vectorizer expects an iterable of documents, so the
# single message is wrapped in a list.
mail_text = ["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]

# Turn the text into TF-IDF features with the already-fitted vectorizer.
input_data_features = vectorizer.transform(mail_text)

# Predict the label for the single input message.
prediction = model.predict(input_data_features)

# Labels were encoded as 1 = ham and 0 = spam.
predicted_is_ham = prediction[0] == 1
print('Ham Mail' if predicted_is_ham else 'Spam Mail')


Spam Mail
