In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the mail dataset from combined_enron.csv (path is relative to the
# working directory).
raw_mail_data = pd.read_csv("combined_enron.csv")
# Preview the first five rows.
raw_mail_data.head()

Unnamed: 0,label,label_num,text
0,ham,0,subject fw weather sites this is the list of w...
1,spam,1,subject i cant believe the changma wow wow wow...
2,spam,1,subject 4 color printing special request addit...
3,spam,1,subject nobody can beat us poloponybreakaway i...
4,ham,0,subject energy extravaganza 2 weeks away energ...


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [4]:
# Count missing values in each column of the raw data.
raw_mail_data.isnull().sum()

label        0
label_num    0
text         0
dtype: int64

In [5]:
# Keep every non-null cell as-is and substitute an empty string for any
# missing entry.
df = raw_mail_data.where(raw_mail_data.notna(), '')

In [6]:
# Re-count missing values per column after the fill above.
df.isnull().sum()

label        0
label_num    0
text         0
dtype: int64

In [7]:
# (rows, columns) of the working frame.
df.shape

(33702, 3)

In [8]:
# Remove the pre-computed numeric label column; only `label` and `text` remain.
# NOTE: not re-runnable as written - a second run raises KeyError because the
# column is already gone.
df = df.drop("label_num", axis="columns")

In [9]:
# Preview the frame after dropping `label_num`.
df.head()

Unnamed: 0,label,text
0,ham,subject fw weather sites this is the list of w...
1,spam,subject i cant believe the changma wow wow wow...
2,spam,subject 4 color printing special request addit...
3,spam,subject nobody can beat us poloponybreakaway i...
4,ham,subject energy extravaganza 2 weeks away energ...


In [10]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
label_codes = {'spam': 0, 'ham': 1}

# Series.map turns every value that is not a key of the dict into NaN, so
# mapping an already-encoded column (e.g. when this cell is re-run) would
# silently wipe all labels. Only map when the column is not yet fully encoded.
if not df['label'].isin(list(label_codes.values())).all():
    encoded_labels = df['label'].map(label_codes)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        # Fail loudly instead of carrying NaN labels into training.
        unexpected = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values: {unexpected}")
    df['label'] = encoded_labels

In [11]:
# Preview the frame after encoding `label`.
df.head()

Unnamed: 0,label,text
0,1,subject fw weather sites this is the list of w...
1,0,subject i cant believe the changma wow wow wow...
2,0,subject 4 color printing special request addit...
3,0,subject nobody can beat us poloponybreakaway i...
4,1,subject energy extravaganza 2 weeks away energ...


In [12]:
# Alternative: encode the labels with the .loc indexer

In [13]:
# Boolean-mask version of the label encoding (spam -> 0, ham -> 1).
# NOTE(review): the earlier .map() cell has already converted `label` to
# integers, so on a top-to-bottom run these masks select no rows - confirm
# whether this cell is still needed.
spam_rows = df['label'] == 'spam'
df.loc[spam_rows, 'label'] = 0
ham_rows = df['label'] == 'ham'
df.loc[ham_rows, 'label'] = 1

In [14]:
# Preview the frame after the .loc-based encoding step.
df.head()

Unnamed: 0,label,text
0,1,subject fw weather sites this is the list of w...
1,0,subject i cant believe the changma wow wow wow...
2,0,subject 4 color printing special request addit...
3,0,subject nobody can beat us poloponybreakaway i...
4,1,subject energy extravaganza 2 weeks away energ...


In [15]:
# Features are the raw mail text; targets are the encoded labels.
X, Y = df['text'], df['label']

In [16]:
# Display the feature Series (mail text).
X

0        subject fw weather sites this is the list of w...
1        subject i cant believe the changma wow wow wow...
2        subject 4 color printing special request addit...
3        subject nobody can beat us poloponybreakaway i...
4        subject energy extravaganza 2 weeks away energ...
                               ...                        
33697    subject enron mentions enron jolt investments ...
33698    subject hpl meter 981295 hungerford grain comp...
33699    subject re expenses tammie has the receipts fr...
33700    subject comparison report ciearance bundle 1 w...
33701    subject some after thoughts to our meeting hi ...
Name: text, Length: 33702, dtype: object

In [17]:
# Display the target Series (encoded labels).
Y

0        1
1        0
2        0
3        0
4        1
        ..
33697    1
33698    1
33699    1
33700    0
33701    1
Name: label, Length: 33702, dtype: int64

In [18]:
# Hold out 20% of the mails for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Number of samples in the training split.
X_train.shape

(26961,)

In [20]:
# Number of samples in the test split.
X_test.shape

(6741,)

In [21]:
# TF-IDF vectorizer: English stop words are removed, min_df=1 keeps every
# remaining term, and binary=True makes the term-frequency part 0/1 (the
# resulting weights are still real-valued because of the IDF factor and
# normalisation).
feature_extraction = TfidfVectorizer(
    stop_words="english",
    binary=True,
    min_df=1,
)

# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted transform to the test split.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [22]:
# Summarise column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33702 entries, 0 to 33701
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   33702 non-null  int64 
 1   text    33702 non-null  object
dtypes: int64(1), object(1)
memory usage: 526.7+ KB


In [23]:
# Cast both label splits to an integer dtype before they are passed to the
# classifier.
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [24]:
# Display the training texts; the index shows the original row positions
# selected by the split.
X_train

26498    subject bosses day hey kids why don t we pool ...
33049    subject bigger is better hello ochoa i m not s...
26305    subject h g h as seen on oprah and cnbc amazin...
18026    subject re do you know what i need e n o u g h...
5648     subject etc local event rodeo and carnival tic...
                               ...                        
16850    subject re opportunities lloyd yes i would be ...
6265     subject budget follow up after our call today ...
11284    subject schedule jerseys hello team i m going ...
860      subject officialized book listing below is a l...
15795    subject software store http infinity realoemsa...
Name: text, Length: 26961, dtype: object

In [28]:
# Print the TF-IDF training matrix in sparse form: each entry is a
# (row, column) index pair followed by its weight.
print(X_train_features)

  (0, 120658)	0.029447991251520533
  (0, 25781)	0.2824870953923004
  (0, 39307)	0.09165376808223585
  (0, 62995)	0.15857957040980633
  (0, 74157)	0.19426879760878474
  (0, 43897)	0.09901193223124825
  (0, 99820)	0.1804281348465734
  (0, 108293)	0.1296304602773103
  (0, 47081)	0.14460773597747525
  (0, 71373)	0.2975298838920138
  (0, 89404)	0.1433271867815024
  (0, 78037)	0.08142712942603456
  (0, 139311)	0.10008632166858916
  (0, 136988)	0.1376896454297909
  (0, 83609)	0.1735639882437255
  (0, 87953)	0.1375605817115022
  (0, 43936)	0.17172730729013833
  (0, 67788)	0.1499113722098969
  (0, 123332)	0.11280625668137795
  (0, 41924)	0.14857114565420604
  (0, 134584)	0.0907117785712729
  (0, 67390)	0.11316027033032522
  (0, 108384)	0.1490745423782204
  (0, 107149)	0.2651779588265023
  (0, 56849)	0.1766342655636334
  :	:
  (26959, 131695)	0.14842963377700014
  (26959, 70291)	0.1833259348190427
  (26959, 28374)	0.17485568110185026
  (26959, 68048)	0.18810466598968373
  (26959, 56514)	0.184213

In [27]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()
# Fit on the TF-IDF training features and the integer training labels.
model.fit(X_train_features, y_train)

In [30]:
# Score the model on the data it was trained on (an optimistic estimate; it is
# useful mainly for comparison against the test accuracy).
prediction_train_data = model.predict(X_train_features)
accuracy_train_Data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_train_data,
)

In [36]:
# Report the training-set accuracy.
print('Accuracy of train data : ', accuracy_train_Data)

Accuracy of train data :  0.9915433403805497


In [32]:
# Score the model on the held-out test split.
prediction_test_data = model.predict(X_test_features)
accuracy_test_Data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_test_data,
)

In [37]:
# Report the test-set accuracy.
print('Accuracy of test data : ', accuracy_test_Data)

Accuracy of test data :  0.9832369084705533


In [38]:
# Build the prediction system

In [42]:
# One raw mail to classify, wrapped in a list because the vectorizer expects
# an iterable of documents.
input_user_mail = [
    "subject re moving roy ibasco hi vanessa you will need to fill out a churn request and forward it to the move team they will take care of everything that you put on the form i am attaching a copy of the form for your information the receiving department is responsible for doing this he will need boxes to pack his belongings ask him how many he needs he will have to do the packing the only item that moves other than his personal items is his telephone the computer chair etc belongs to the research group if you have any questions or need help please let me know thanks vanessa carranza enron 06 08 2000 04 47 pm to shirley crenshaw hou ect ect cc subject moving roy ibasco shirley i need to have roy ibasco s things moved to eb 2930 c from ebl 948 and have it charged to co rc 413 1708 please give me a call if you have any questions thanks vanessa c 3 5030"
]

# Vectorise with the already-fitted TF-IDF transform, then classify.
input_data_feature = feature_extraction.transform(input_user_mail)
predictions = model.predict(input_data_feature)

# Label 1 is reported as ham; any other prediction is reported as spam.
verdict = "This is a Ham mail..." if predictions[0] == 1 else "Alert ! this is a Spam email !!!"
print(verdict)

This is a Ham mail...


In [43]:
import pickle

# Persist the fitted classifier and the fitted vectorizer side by side: the
# vectorizer is needed to turn new mail text into the features the model
# expects. The `with` blocks guarantee each file is flushed and closed even
# if pickling fails part-way.
with open("logistic_regression.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("feature_extraction.pkl", "wb") as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)