In [261]:
import pandas as pd
from sklearn import preprocessing

In [262]:
# Load the Enron spam dataset from disk and preview it
enron_csv_path = "...Datasets\\enron_spam_data.csv"
enron = pd.read_csv(enron_csv_path)
enron

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


In [263]:
# Only the body of each message is analysed, so the ID, subject and date columns are dropped
enron = enron.drop(columns=["Message ID", "Subject", "Date"])
enron.head()

Unnamed: 0,Message,Spam/Ham
0,,ham
1,"gary , production from the high island larger ...",ham
2,- calpine daily gas nomination 1 . doc,ham
3,fyi - see note below - already done .\nstella\...,ham
4,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham


In [264]:
# Count the missing entries in each column
enron.isnull().sum()

Message     371
Spam/Ham      0
dtype: int64

In [265]:
# Discard every row that contains a missing value, then report the new size
enron = enron.dropna()
enron.shape

(33345, 2)

In [266]:
# Count rows that exactly repeat an earlier row
enron.duplicated(keep="first").sum()

3565

In [267]:
# Keep only the first occurrence of each repeated row, then report the new size
enron = enron.drop_duplicates(keep="first")
enron.shape

(29780, 2)

In [268]:
# Encode the class column as integers: ham -> 0, spam -> 1
label_encoder = preprocessing.LabelEncoder()
enron['label'] = label_encoder.fit_transform(enron['Spam/Ham'])

# LabelEncoder numbers the sorted unique values, so the ham/spam -> 0/1 mapping
# only holds if those are the only two values present. Fail loudly otherwise
# (e.g. a stray "Ham" or a third category would silently shift the codes).
found_classes = list(label_encoder.classes_)
assert found_classes == ['ham', 'spam'], f"unexpected Spam/Ham values: {found_classes}"

print(enron.head())

# Drop the original string column and rename 'Message' to 'text' so the frame
# has 'text'/'label' columns for merging
enron = enron.drop(columns='Spam/Ham').rename(columns={'Message': 'text'})
enron

                                             Message Spam/Ham  label
1  gary , production from the high island larger ...      ham      0
2             - calpine daily gas nomination 1 . doc      ham      0
3  fyi - see note below - already done .\nstella\...      ham      0
4  fyi .\n- - - - - - - - - - - - - - - - - - - -...      ham      0
5  jackie ,\nsince the inlet to 3 river plant is ...      ham      0


Unnamed: 0,text,label
1,"gary , production from the high island larger ...",0
2,- calpine daily gas nomination 1 . doc,0
3,fyi - see note below - already done .\nstella\...,0
4,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
5,"jackie ,\nsince the inlet to 3 river plant is ...",0
...,...,...
33709,"hello ,\ndid you ejaculate before or within a ...",1
33711,"hello , welcome to gigapharm onlinne shop .\np...",1
33712,i got it earlier than expected and it was wrap...,1
33713,are you ready to rock on ? let the man in you ...,1


In [269]:
# Load the second e-mail dataset from disk and preview it
trec_csv_path = "...Datasets\\email_text.csv"
trec = pd.read_csv(trec_csv_path)
trec

Unnamed: 0,label,text
0,1,do you feel the pressure to perform and not ri...
1,0,hi i've just updated from the gulus and i chec...
2,1,mega authenticv i a g r a discount pricec i a ...
3,1,hey billy it was really fun going out the othe...
4,1,system of the home it will have the capabiliti...
...,...,...
53663,1,versuchen sie unser produkt und sie werden fuh...
53664,1,while we may have high expectations of our ass...
53665,0,for those who are interested i just cook a lit...
53666,0,hello as i wrote i call sqlfetch channel t stu...


In [270]:
# Count the missing entries in each column
trec.isnull().sum()

label    0
text     0
dtype: int64

In [271]:
# Count rows that exactly repeat an earlier row
trec.duplicated(keep="first").sum()

0

In [272]:
# Stack both datasets into a single frame, renumbering the rows from 0
frames = [trec, enron]
combined = pd.concat(frames, ignore_index=True)
combined

Unnamed: 0,label,text
0,1,do you feel the pressure to perform and not ri...
1,0,hi i've just updated from the gulus and i chec...
2,1,mega authenticv i a g r a discount pricec i a ...
3,1,hey billy it was really fun going out the othe...
4,1,system of the home it will have the capabiliti...
...,...,...
83443,1,"hello ,\ndid you ejaculate before or within a ..."
83444,1,"hello , welcome to gigapharm onlinne shop .\np..."
83445,1,i got it earlier than expected and it was wrap...
83446,1,are you ready to rock on ? let the man in you ...


In [273]:
# Count fully identical rows in the merged frame
combined.duplicated(keep="first").sum()

0

In [274]:
# Randomly reorder all rows; the fixed seed keeps the order reproducible
shuffled = combined.sample(random_state=1, frac=1)
combined = shuffled
combined

Unnamed: 0,label,text
48104,1,ounce feather bowl hummingbird opec moment ala...
41270,1,wulvob get your medircations online qnb ikud v...
17204,0,computer connection from cnn com wednesday es...
33996,1,university degree obtain a prosperous future m...
29591,0,thanks for all your answers guys i know i shou...
...,...,...
21440,0,hi given a date how do i get the last date of ...
73349,1,now you can order software on cd or download i...
50057,1,dear valued member canadianpharmacy provides a...
5192,0,subscribe change profile contact us long term ...


In [275]:
# Save the combined dataframe as a CSV file (row index omitted).
# Plain string with an escaped backslash, matching the read_csv paths; the
# previous raw string (r"...\\...") put two literal backslashes in the path.
combined.to_csv("...Datasets\\combined_data.csv", index=False)