In [67]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [68]:
df = pd.read_csv(r"mail_data.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [69]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [70]:
df.groupby(df.Category).count()

Unnamed: 0_level_0,Message
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [71]:
df.loc[df['Category'] == 'spam','Category'] = 0
df.loc[df['Category'] == 'ham','Category'] = 1

In [72]:
df.Category

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [73]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [74]:
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [75]:
X = df.Message
Y=df.Category

In [76]:
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=2)

In [77]:
tfidf_encode= TfidfVectorizer(max_df=1,stop_words='english',lowercase=True)
x_train_encoded = tfidf_encode.fit_transform(x_train)
x_test_encoded = tfidf_encode.transform(x_test)

In [78]:
y_train = y_train.astype(int)

In [79]:
y_train.dtype

dtype('int32')

In [80]:
y_test = y_test.astype(int)

In [81]:
model = LogisticRegression()

In [82]:
model.fit(x_train_encoded,y_train)

LogisticRegression()

In [83]:
x_train_pred = model.predict(x_train_encoded)

In [84]:
x_train_acc = accuracy_score(y_train,x_train_pred)

In [85]:
print(x_train_acc)

0.8678483284720664


In [86]:
x_test_pred = model.predict(x_test_encoded)
print(x_test_encoded)

  (2, 1754)	1.0
  (4, 3652)	1.0
  (5, 2285)	1.0
  (8, 2447)	1.0
  (10, 2967)	0.7071067811865476
  (10, 1813)	0.7071067811865476
  (15, 1684)	0.7071067811865476
  (15, 249)	0.7071067811865476
  (21, 3888)	0.7071067811865476
  (21, 3332)	0.7071067811865476
  (22, 2131)	0.5
  (22, 1872)	0.5
  (22, 367)	0.5
  (22, 82)	0.5
  (32, 3025)	1.0
  (33, 2752)	0.7071067811865476
  (33, 2504)	0.7071067811865476
  (37, 4028)	0.7071067811865476
  (37, 2144)	0.7071067811865476
  (42, 3259)	1.0
  (43, 2460)	0.5
  (43, 1717)	0.5
  (43, 1467)	0.5
  (43, 913)	0.5
  (47, 3921)	1.0
  :	:
  (1072, 3736)	1.0
  (1075, 4051)	1.0
  (1081, 893)	1.0
  (1082, 2566)	0.5773502691896257
  (1082, 1954)	0.5773502691896257
  (1082, 199)	0.5773502691896257
  (1085, 2477)	1.0
  (1086, 693)	1.0
  (1088, 920)	1.0
  (1089, 3500)	1.0
  (1090, 3326)	0.7071067811865476
  (1090, 2415)	0.7071067811865476
  (1091, 3931)	1.0
  (1093, 4)	1.0
  (1095, 3244)	1.0
  (1100, 4085)	0.7071067811865476
  (1100, 1549)	0.7071067811865476
  (1101

In [87]:
x_test_acc = accuracy_score(y_test,x_test_pred)
print(x_test_acc)

0.8582959641255605


In [102]:
ip_mail = ["FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"]

ip_enc = tfidf_encode.transform(ip_mail)


prediction = model.predict(ip_enc)
print(prediction)
if (prediction[0] == 0):
    print("Spam")
elif (prediction[0] == 1):
    print("Ham")


[1]
Ham
