In [None]:
%pip install pandas
%pip install nltk
%pip install scikit-learn
%pip install sklearn

In [1]:
import pandas as pd
import nltk
import numpy as np
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Initially Viewing the Data

In [9]:
df = pd.read_csv("emails.csv")
df.head()
print(df.shape)

(5172, 3002)
ect
1      2587
2       875
3       351
4       229
5       161
       ... 
344       1
81        1
101       1
194       1
85        1
Name: count, Length: 90, dtype: int64


## Preprocessing Pipeline

In [3]:
# Import the list of stop words from the 'stopwords' module
stopwordList = stopwords.words('english')

# Identify the columns in the DataFrame that have names matching any of the stop words
# Convert the stop word list and DataFrame columns to sets, find the intersection, and convert back to a list
columnsToDrop = list(set(stopwordList) & set(df.columns))

# Drop the identified columns from the DataFrame
# 'axis=1' specifies that the operation should be performed on columns (as opposed to rows)
df = df.drop(columnsToDrop, axis=1)

# Print the first few rows of the resulting DataFrame
df.head()

Unnamed: 0,Email No.,ect,hou,enron,com,gas,deal,meter,hpl,please,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,24,27,1,3,1,0,0,0,2,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,1,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,22,10,0,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,17,9,0,0,2,0,3,0,1,...,0,0,0,0,0,0,0,1,0,0


In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print("Train shape: ", train_df.shape)
print("Test shape: ", test_df.shape)
print("Train spam ratio: ", train_df['spam'].mean())
print("Test spam ratio: ", test_df['spam'].mean())
print(train_df.head())
print(test_df.head())

Train shape:  (4137, 2868)
Test shape:  (1035, 2868)
Train spam ratio:  0.019579405366207395
Test spam ratio:  0.010628019323671498
       Email No.  ect  hou  enron  com  gas  deal  meter  hpl  please  ...   
3164  Email 3165    2    2      0    3    0     0      0    0       0  ...  \
2067  Email 2068    1    0      0    0    0     0      0    3       0  ...   
4717  Email 4718    4    4      0    6    0     0      0    0       0  ...   
2505  Email 2506    1    0      0    0    1     0      1    0       0  ...   
2268  Email 2269    3    1      0    1    0     0      0    0       0  ...   

      connevey  jay  valued  lay  infrastructure  military  allowing  ff  dry   
3164         0    0       0    0               0         0         0   1    0  \
2067         0    0       0    0               0         0         0   0    0   
4717         0    0       0    0               0         0         0   3    0   
2505         0    0       0    0               0         0         0   0   

In [14]:
new_train_df = train_df.drop(columns=["Email No.", "Prediction"])
new_test_df = test_df.drop(columns=["Email No.", "Prediction"])
doc_freq = new_train_df.astype(bool).sum(axis=0)
print(doc_freq.sort_values(ascending=False).head(10))
print(len(doc_freq))

ect    4137
j      4137
u      4137
b      4137
ct     4137
e      4137
c      4137
r      4099
n      4093
l      4072
dtype: int64
2866


In [21]:
number_doc = new_train_df.shape[0]
print(number_doc)
# calculating idf for each word
idf_train = np.log(number_doc / doc_freq)
print(idf_train)



4137
ect               0.000000
hou               0.938780
enron             1.241825
com               0.909545
gas               1.447342
                    ...   
infrastructure    5.929831
military          5.762777
allowing          5.555137
ff                1.045653
dry               5.108850
Length: 2866, dtype: float64


In [25]:
# calculating tf-idf for each word
tf_idf_train = new_train_df * idf_train
print(tf_idf_train)

siuuuuuuuu
ect                 0.000000
hou               146.449691
enron             181.306407
com               660.329920
gas                41.972920
                     ...    
infrastructure     17.789493
military           23.051107
allowing           16.665412
ff                119.204386
dry                20.435401
Length: 2866, dtype: float64
