### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

### Reading initial raw data

In [2]:
# Read the raw CSV (path is relative to the notebook's working directory).
raw_csv_path = 'data/spam_ham_enron_dataset.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Preview the freshly loaded frame.
df

Unnamed: 0,Message_ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


In [4]:
# Column dtypes and non-null counts, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33716 entries, 0 to 33715
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Message_ID  33716 non-null  int64 
 1   Subject     33427 non-null  object
 2   Message     33345 non-null  object
 3   Spam/Ham    33716 non-null  object
 4   Date        33716 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


### Dropping unnecessary columns

In [5]:
# Message_ID and Date are not used further; keep the text fields and the class.
df = df.drop(columns=["Message_ID", "Date"])
df

Unnamed: 0,Subject,Message,Spam/Ham
0,christmas tree farm pictures,,ham
1,"vastar resources , inc .","gary , production from the high island larger ...",ham
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham
3,re : issue,fyi - see note below - already done .\nstella\...,ham
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham
...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam


### Label encoding target labels
#### 0 : Not Spam (ham)
#### 1 : Spam

In [6]:
# Class distribution of the raw string labels.
df.value_counts(subset="Spam/Ham")

Spam/Ham
spam    17171
ham     16545
dtype: int64

In [7]:
# Integer codes for the target classes: spam is the positive class (1).
label_encoding_dict = dict(spam=1, ham=0)

In [8]:
# Translate the string classes into integer codes via label_encoding_dict.
df['label'] = df['Spam/Ham'].map(label_encoding_dict)
# Series.map yields NaN for any value that is missing from the dict; fail
# loudly here instead of silently carrying unlabeled rows forward.
assert df['label'].notna().all(), "unmapped values found in 'Spam/Ham'"

In [9]:
# Check the new 'label' column next to the original 'Spam/Ham' values.
df

Unnamed: 0,Subject,Message,Spam/Ham,label
0,christmas tree farm pictures,,ham,0
1,"vastar resources , inc .","gary , production from the high island larger ...",ham,0
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,0
3,re : issue,fyi - see note below - already done .\nstella\...,ham,0
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,0
...,...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,1
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,1
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,1
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,1


In [10]:
# Counts per encoded class; should match the counts of the string labels.
df.value_counts(subset='label')

label
1    17171
0    16545
dtype: int64

In [11]:
# The string label is redundant now that 'label' holds the encoded class.
df = df.drop(columns='Spam/Ham')
df

Unnamed: 0,Subject,Message,label
0,christmas tree farm pictures,,0
1,"vastar resources , inc .","gary , production from the high island larger ...",0
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0
3,re : issue,fyi - see note below - already done .\nstella\...,0
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1


### Checking for NaN values in Subject and Message

In [12]:
# Number of emails without a message body.
df['Message'].isna().sum()

371

In [13]:
# Number of emails without a subject line.
df['Subject'].isna().sum()

289

### Dropping rows where both Subject and Message are NaN

In [14]:
# how='all': a row is removed only when Subject AND Message are both missing.
df = df.dropna(subset=['Subject', 'Message'], how='all')
df

Unnamed: 0,Subject,Message,label
0,christmas tree farm pictures,,0
1,"vastar resources , inc .","gary , production from the high island larger ...",0
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0
3,re : issue,fyi - see note below - already done .\nstella\...,0
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1


In [15]:
# Remaining missing values, reported as (Message, Subject).
tuple(df[col].isna().sum() for col in ('Message', 'Subject'))

(320, 238)

### Replacing NaN values in Subject with "(no_subject)"
### Replacing NaN values in Message with "(no_message_text)"

In [16]:
# Fill placeholders on an independent copy so `df` keeps its NaN values.
df_new = df.copy(deep=True)
df_new

Unnamed: 0,Subject,Message,label
0,christmas tree farm pictures,,0
1,"vastar resources , inc .","gary , production from the high island larger ...",0
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0
3,re : issue,fyi - see note below - already done .\nstella\...,0
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1


In [17]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df_new['Subject'] selection. The in-place form is chained assignment: pandas
# 2.x warns about it, and under Copy-on-Write (pandas 3.0) it operates on a
# temporary object and leaves df_new unchanged.
df_new['Subject'] = df_new['Subject'].fillna("(no_subject)")
df_new

Unnamed: 0,Subject,Message,label
0,christmas tree farm pictures,,0
1,"vastar resources , inc .","gary , production from the high island larger ...",0
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0
3,re : issue,fyi - see note below - already done .\nstella\...,0
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1


In [18]:
# (Message, Subject) missing-value counts after filling Subject.
df_new['Message'].isna().sum(), df_new['Subject'].isna().sum()

(320, 0)

In [19]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df_new['Message'] selection. The in-place form is chained assignment: pandas
# 2.x warns about it, and under Copy-on-Write (pandas 3.0) it operates on a
# temporary object and leaves df_new unchanged.
df_new['Message'] = df_new['Message'].fillna("(no_message_text)")
df_new

Unnamed: 0,Subject,Message,label
0,christmas tree farm pictures,(no_message_text),0
1,"vastar resources , inc .","gary , production from the high island larger ...",0
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0
3,re : issue,fyi - see note below - already done .\nstella\...,0
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1


In [20]:
# Both counts should now be zero: (Message, Subject).
tuple(df_new[col].isna().sum() for col in ('Message', 'Subject'))

(0, 0)

### Resetting index

In [21]:
# reset_index() renumbers the rows from 0 and moves the old row labels into a
# new 'index' column; the following cell drops that helper column.
df_new = df_new.reset_index()
df_new

Unnamed: 0,index,Subject,Message,label
0,0,christmas tree farm pictures,(no_message_text),0
1,1,"vastar resources , inc .","gary , production from the high island larger ...",0
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0
3,3,re : issue,fyi - see note below - already done .\nstella\...,0
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
...,...,...,...,...
33660,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1
33661,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1
33662,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1
33663,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1


In [22]:
# Discard the old row labels that reset_index() stored in the 'index' column.
df_new = df_new.drop(columns='index')
df_new

Unnamed: 0,Subject,Message,label
0,christmas tree farm pictures,(no_message_text),0
1,"vastar resources , inc .","gary , production from the high island larger ...",0
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0
3,re : issue,fyi - see note below - already done .\nstella\...,0
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
...,...,...,...
33660,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1
33661,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1
33662,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1
33663,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1


### Exporting DataFrame to CSV

In [23]:
# Persist the cleaned frame; index=False keeps the row numbers out of the file.
clean_csv_path = "data/enron_spam_ham_email_clean.csv"
df_new.to_csv(clean_csv_path, index=False)