In [2]:
import numpy as np 
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 

In [3]:

raw_mail_data=pd.read_csv('mail_data.csv')
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#replace the null values with null string
mail_data= raw_mail_data.where((pd.notnull(raw_mail_data)),'')
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [5]:
mail_data.shape

(5572, 2)

In [6]:
#labeling spam mail as 0 and ham mail as 1:

mail_data.loc[mail_data['Category']=='spam','Category',]=0
mail_data.loc[mail_data['Category']=='ham','Category',]=1


In [7]:
#seperating the values into target and the training values
X=mail_data['Message']
Y=mail_data['Category']

In [8]:
#test and train split 

X_train,X_test,Y_train,Y_test= train_test_split(X,Y,random_state=2,test_size=0.2)

In [9]:
X_train.shape,X_test.shape, X_test.shape,Y_test.shape

((4457,), (1115,), (1115,), (1115,))

In [10]:
#Features extraction step 
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

X_train_features=feature_extraction.fit_transform(X_train)
X_test_features=feature_extraction.transform(X_test)

#Convert Y_train and Y_test as integers
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')


In [11]:
print(X_train,X_train_features)

3890                    Unlimited texts. Limited minutes.
5553                          Hahaha..use your brain dear
4366    Ujhhhhhhh computer shipped out with address to...
3968    YOU HAVE WON! As a valued Vodafone customer ou...
3771    Love it! The girls at the office may wonder wh...
                              ...                        
3335    That's fine, have him give me a call if he kno...
1099    NO GIFTS!! You trying to get me to throw mysel...
2514    U have won a nokia 6230 plus a free digital ca...
3606                      Jordan got voted out last nite!
2575    Your next amazing xxx PICSFREE1 video will be ...
Name: Message, Length: 4457, dtype: object <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34768 stored elements and shape (4457, 7458)>
  Coords	Values
  (0, 6927)	0.48935591439341625
  (0, 6586)	0.44333254982109394
  (0, 3958)	0.6161071828926097
  (0, 4334)	0.42941702167641554
  (1, 3168)	0.5869421390016224
  (1, 6971)	0.4281243465155688
  (1, 

***TRAINING THE MODEL USING LOGISTIC REGRESSION**

In [12]:
model=LogisticRegression()

In [13]:
history=model.fit(X_train_features,Y_train)

In [14]:
#evaluating the trained model 
trained_data_prediction=model.predict(X_train_features)
trained_data_accuracy=accuracy_score(Y_train,trained_data_prediction)
print(f"the accuracy on the trained data is:{trained_data_accuracy} ")

the accuracy on the trained data is:0.9685887368184878 


In [15]:
#evaluating the trained model 
trained_data_prediction=model.predict(X_test_features)
trained_data_accuracy=accuracy_score(Y_test,trained_data_prediction)
print(f"the accuracy on the trained data is:{trained_data_accuracy} ")

the accuracy on the trained data is:0.9533632286995516 


Building a predictive system

In [21]:
input_mail=["You have been selected as the lucky winner of  in our annual lottery draw. To claim your prize, "]
input_mail_features=feature_extraction.transform(input_mail)

#making prediction
prediction=model.predict(input_mail_features)
print(prediction)

if prediction[0]:
    print("Ham mail, dont worry")
else :
    print("Spam mail, Be careful")

[0]
Spam mail, Be careful
