In [None]:
%pip install pandas
%pip install nltk
%pip install scikit-learn
%pip install sklearn

In [1]:
import pandas as pd
import nltk
import numpy as np
import math
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/faheem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Initially Viewing the Data

In [2]:
df = pd.read_csv("emails.csv")
df.head()
print(df.shape)

(5172, 3002)


## Preprocessing Pipeline

In [4]:
# Import the list of stop words from the 'stopwords' module
stopwordList = stopwords.words('english')

# Identify the columns in the DataFrame that have names matching any of the stop words
# Convert the stop word list and DataFrame columns to sets, find the intersection, and convert back to a list
columnsToDrop = list(set(stopwordList) & set(df.columns))

# Drop the identified columns from the DataFrame
# 'axis=1' specifies that the operation should be performed on columns (as opposed to rows)
df = df.drop(columnsToDrop, axis=1)

# Print the first few rows of the resulting DataFrame
df.head()

Unnamed: 0,Email No.,ect,hou,enron,com,gas,deal,meter,hpl,please,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,24,27,1,3,1,0,0,0,2,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,1,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,22,10,0,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,17,9,0,0,2,0,3,0,1,...,0,0,0,0,0,0,0,1,0,0


In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print("Train shape: ", train_df.shape)
print("Test shape: ", test_df.shape)
print("Train spam ratio: ", train_df['spam'].mean())
print("Test spam ratio: ", test_df['spam'].mean())
print(train_df.head())
print(test_df.head())

Train shape:  (4137, 2868)
Test shape:  (1035, 2868)
Train spam ratio:  0.019579405366207395
Test spam ratio:  0.010628019323671498
       Email No.  ect  hou  enron  com  gas  deal  meter  hpl  please  ...  \
3164  Email 3165    2    2      0    3    0     0      0    0       0  ...   
2067  Email 2068    1    0      0    0    0     0      0    3       0  ...   
4717  Email 4718    4    4      0    6    0     0      0    0       0  ...   
2505  Email 2506    1    0      0    0    1     0      1    0       0  ...   
2268  Email 2269    3    1      0    1    0     0      0    0       0  ...   

      connevey  jay  valued  lay  infrastructure  military  allowing  ff  dry  \
3164         0    0       0    0               0         0         0   1    0   
2067         0    0       0    0               0         0         0   0    0   
4717         0    0       0    0               0         0         0   3    0   
2505         0    0       0    0               0         0         0   0   

In [6]:
new_train_df = train_df.drop(columns=["Email No.", "Prediction"])
new_test_df = test_df.drop(columns=["Email No.", "Prediction"])
doc_freq_train = new_train_df.astype(bool).sum(axis=0)
print(doc_freq_train.sort_values(ascending=False).head(10))
print(len(doc_freq_train))
doc_freq_test = new_test_df.astype(bool).sum(axis=0)
print(doc_freq_test.sort_values(ascending=False).head(10))
print(len(doc_freq_test))

ect    4137
j      4137
u      4137
b      4137
ct     4137
e      4137
c      4137
r      4099
n      4093
l      4072
dtype: int64
2866
ect    1035
j      1035
c      1035
ct     1035
u      1035
e      1035
b      1035
r      1025
n      1025
l      1020
dtype: int64
2866


In [7]:
number_doc_train = new_train_df.shape[0]
number_doc_test = new_test_df.shape[0]
print(number_doc_test)
# calculating idf for each word
idf_train = np.log(number_doc_train / doc_freq_train)
idf_test = np.log(number_doc_test / doc_freq_test)
print(idf_train)
print(idf_test)

1035
ect               0.000000
hou               0.938780
enron             1.241825
com               0.909545
gas               1.447342
                    ...   
infrastructure    5.929831
military          5.762777
allowing          5.555137
ff                1.045653
dry               5.108850
Length: 2866, dtype: float64
ect               0.000000
hou               0.965806
enron             1.296710
com               0.892423
gas               1.440898
                    ...   
infrastructure    6.249010
military          5.555862
allowing          5.843544
ff                0.955705
dry               4.996247
Length: 2866, dtype: float64


In [10]:
# calculating tf-idf for each word
tf_idf_train = new_train_df * idf_train
tf_idf_test = new_test_df * idf_test
print(tf_idf_train)
print(tf_idf_test)

      ect      hou     enron       com       gas      deal     meter  \
3164  0.0  1.87756  0.000000  2.728636  0.000000  0.000000  0.000000   
2067  0.0  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   
4717  0.0  3.75512  0.000000  5.457272  0.000000  0.000000  0.000000   
2505  0.0  0.00000  0.000000  0.000000  1.447342  0.000000  1.796849   
2268  0.0  0.93878  0.000000  0.909545  0.000000  0.000000  0.000000   
...   ...      ...       ...       ...       ...       ...       ...   
4426  0.0  0.00000  2.483649  1.819091  0.000000  0.000000  0.000000   
466   0.0  0.00000  2.483649  0.000000  1.447342  1.468111  0.000000   
3092  0.0  0.00000  2.483649  0.000000  1.447342  0.000000  0.000000   
3772  0.0  0.00000  0.000000  2.728636  0.000000  0.000000  0.000000   
860   0.0  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000   

           hpl    please    e  ...  enhancements  connevey  jay  valued  \
3164  0.000000  0.000000  0.0  ...           0.0       0.0  

In [36]:
def vectorMagnitude(vector):
    Sum=0
    for value in vector:
        Sum+=math.pow(value,2)
    return math.sqrt(Sum)

In [40]:
tf_idf_train_array=tf_idf_train.to_numpy()
print(len(tf_idf_train_array[0]))
tf_idf_test_array=tf_idf_test.to_numpy()
print(len(tf_idf_test_array))
cosineSim=[]
index=0
newVector=tf_idf_test_array[0]*tf_idf_train_array[0]
newVector=np.nan_to_num(newVector.data)
for testVector in tf_idf_test_array:
    cosineSim.append([])
    testVector=np.nan_to_num(testVector)
    for trainVector in tf_idf_train_array:
        trainVector=np.nan_to_num(trainVector)
        dotProduct=testVector*trainVector
        sumOfDotProduct=sum(dotProduct)
        cosineSim[index].append(sumOfDotProduct/(vectorMagnitude(testVector)*vectorMagnitude(trainVector)))
    cosineSim[index]=sorted(cosineSim[index])
    cosineSim[index]=cosineSim[index][:10]
    print(cosineSim[index])
    index+=1
print(cosineSim)

2866
1035


  cosineSim[index].append(sumOfDotProduct/(vectorMagnitude(testVector)*vectorMagnitude(trainVector)))


[2.291866434596544e-05, 4.912977940408937e-05, 4.9282995465296986e-05, 5.134350500367491e-05, 6.12367390485945e-05, 6.492647297977805e-05, 7.39744808195453e-05, 0.00010514630980662224, 0.00011323084942494626, 0.0001295892655582259]
[0.0021885867621375442, 0.0027417432196163902, 0.0027587014301995757, 0.004383920528899822, 0.005255032459429904, 0.005343541647600151, 0.005346642752861974, 0.0053622299580543065, 0.0053622299580543065, 0.0053622299580543065]
[0.006926157708269927, 0.008500795077816009, 0.009611953281227834, 0.010501387920326474, 0.017721673275409716, 0.018077270154678318, 0.018077270154678318, 0.018077270154678318, 0.01833427414492495, 0.01837810193422268]
[0.0, 1.2835161468572274e-05, 3.494708572695473e-05, 3.7642701367298295e-05, 3.9298671281998747e-05, 4.402901571067458e-05, 4.4995046351992054e-05, 4.6088396142517894e-05, 5.530815394843027e-05, 6.343879835751672e-05]
[0.0021825151977608467, 0.0034720579681982154, 0.004072157111341504, 0.006374386021978951, 0.00706882255

KeyboardInterrupt: 