In [None]:
%pip install pandas
%pip install nltk
%pip install scikit-learn
%pip install sklearn

In [18]:
import pandas as pd
import nltk
import numpy as np
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to C:\Users\Dell Latitude
[nltk_data]     E5470\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Initially Viewing the Data

In [7]:
df = pd.read_csv("emails.csv")
df.head()
print(df.shape)


(5172, 3002)


## Preprocessing Pipeline

In [8]:
# Import the list of stop words from the 'stopwords' module
stopwordList = stopwords.words('english')

# Identify the columns in the DataFrame that have names matching any of the stop words
# Convert the stop word list and DataFrame columns to sets, find the intersection, and convert back to a list
columnsToDrop = list(set(stopwordList) & set(df.columns))

# Drop the identified columns from the DataFrame
# 'axis=1' specifies that the operation should be performed on columns (as opposed to rows)
df = df.drop(columnsToDrop, axis=1)

# Print the first few rows of the resulting DataFrame
df.head()

Unnamed: 0,Email No.,ect,hou,enron,com,gas,deal,meter,hpl,please,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,24,27,1,3,1,0,0,0,2,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,1,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,22,10,0,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,17,9,0,0,2,0,3,0,1,...,0,0,0,0,0,0,0,1,0,0


In [114]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=69)
print("Train shape: ", train_df.shape)
print("Test shape: ", test_df.shape)
print("Train spam ratio: ", train_df['spam'].mean())
print("Test spam ratio: ", test_df['spam'].mean())
print(train_df.head())
print(test_df.head())

Train shape:  (4137, 2868)
Test shape:  (1035, 2868)
Train spam ratio:  0.016678752719361856
Test spam ratio:  0.022222222222222223
       Email No.  ect  hou  enron  com  gas  deal  meter  hpl  please  ...  \
4591  Email 4592    1    0      3    2    0     0      0    0       1  ...   
4537  Email 4538    1    3      0    1    0     0      0    0       0  ...   
747    Email 748    9    5      7    0    0     3      0    0       1  ...   
4639  Email 4640    1    1      0    0    0     0      0    0       1  ...   
3397  Email 3398    2    0      0    1    0     0      0    0       0  ...   

      connevey  jay  valued  lay  infrastructure  military  allowing  ff  dry  \
4591         0    0       0    0               0         0         0   0    0   
4537         0    0       0    0               0         0         0   1    0   
747          0    0       0    0               0         0         0   0    0   
4639         0    0       0    0               0         0         0   0   

In [115]:
new_train_df = train_df.drop(columns="Email No.")
new_test_df = test_df.drop(columns="Email No.")
train_df_tf = new_train_df.apply(lambda row: row / row.max(), axis=1)
test_df_tf = new_test_df.apply(lambda row: row / row.max(), axis=1)
doc_freq_train = new_train_df.sum(axis=0)
doc_freq_test = new_test_df.sum(axis=0)
print(doc_freq)
print(doc_freq2)

ect           21054
hou            8164
enron          5335
com            7468
gas            2559
              ...  
military         25
allowing         18
ff             3797
dry              31
Prediction     1182
Length: 2867, dtype: int64
ect           5550
hou           2306
enron         1571
com           1672
gas            634
              ... 
military         9
allowing         3
ff             934
dry              5
Prediction     318
Length: 2867, dtype: int64


In [116]:
number_doc = train_df_tf.shape[0]
# calculating idf for each word
idf_train = np.log(number_doc / doc_freq_train)
idf_test = np.log(number_doc / doc_freq_test)
print(idf_train)
print(idf_test)



ect          -1.627120
hou          -0.679763
enron        -0.254318
com          -0.590656
gas           0.480354
                ...   
military      5.108850
allowing      5.437354
ff            0.085760
dry           4.893739
Prediction    1.252763
Length: 2867, dtype: float64
ect          -0.293827
hou           0.584456
enron         0.968259
com           0.905950
gas           1.875677
                ...   
military      6.130502
allowing      7.229114
ff            1.488250
dry           6.718288
Prediction    2.565675
Length: 2867, dtype: float64


In [117]:
# calculating tf-idf for each word
tf_idf_train = train_df_tf * idf_train
tf_idf_test = test_df_tf * idf_test
print(tf_idf_train)
print(tf_idf_test)

           ect       hou     enron       com  gas      deal  meter       hpl  \
4591 -0.031904 -0.000000 -0.014960 -0.023163  0.0  0.000000    0.0  0.000000   
4537 -0.011540 -0.014463 -0.000000 -0.004189  0.0  0.000000    0.0  0.000000   
747  -0.142176 -0.032998 -0.017284 -0.000000  0.0  0.008688    0.0  0.000000   
4639 -0.108475 -0.045318 -0.000000 -0.000000  0.0  0.000000    0.0  0.000000   
3397 -0.041721 -0.000000 -0.000000 -0.007573  0.0  0.000000    0.0  0.000000   
...        ...       ...       ...       ...  ...       ...    ...       ...   
3633 -0.232446 -0.000000 -0.000000 -0.000000  0.0  0.000000    0.0  0.195116   
439  -0.070744 -0.000000 -0.000000 -0.025681  0.0  0.000000    0.0  0.019794   
1626 -0.191426 -0.013329 -0.009973 -0.000000  0.0  0.000000    0.0  0.000000   
2667 -0.542373 -0.000000 -0.000000 -0.000000  0.0  0.000000    0.0  0.000000   
4041 -0.203390 -0.000000 -0.000000 -0.000000  0.0  0.000000    0.0  0.000000   

        please         e  ...  connevey

In [118]:
# length of each vector
length_train = np.sqrt(np.sum(tf_idf_train ** 2, axis=1))
length_test = np.sqrt(np.sum(tf_idf_test ** 2, axis=1))
print(length_train)
print(length_test)

4591    5.975207
4537    5.626328
747     6.033688
4639    5.895398
3397    5.666389
          ...   
3633    7.000643
439     6.757856
1626    5.710939
2667    6.278256
4041    5.501644
Length: 4137, dtype: float64
3514    3.737566
3725    3.988715
442     4.042201
1316    4.227459
5086    5.092304
          ...   
2074    4.947256
0       5.334521
3753    3.980297
4445    3.986752
3696    3.951620
Length: 1035, dtype: float64


In [119]:
#  normalizing tf-idf
tf_idf_train = tf_idf_train / length_train[:, np.newaxis]
tf_idf_test = tf_idf_test / length_test[:, np.newaxis]
print(tf_idf_train)
print(tf_idf_test)

           ect       hou     enron       com  gas     deal  meter       hpl  \
4591 -0.005339 -0.000000 -0.002504 -0.003877  0.0  0.00000    0.0  0.000000   
4537 -0.002051 -0.002571 -0.000000 -0.000745  0.0  0.00000    0.0  0.000000   
747  -0.023564 -0.005469 -0.002865 -0.000000  0.0  0.00144    0.0  0.000000   
4639 -0.018400 -0.007687 -0.000000 -0.000000  0.0  0.00000    0.0  0.000000   
3397 -0.007363 -0.000000 -0.000000 -0.001336  0.0  0.00000    0.0  0.000000   
...        ...       ...       ...       ...  ...      ...    ...       ...   
3633 -0.033203 -0.000000 -0.000000 -0.000000  0.0  0.00000    0.0  0.027871   
439  -0.010468 -0.000000 -0.000000 -0.003800  0.0  0.00000    0.0  0.002929   
1626 -0.033519 -0.002334 -0.001746 -0.000000  0.0  0.00000    0.0  0.000000   
2667 -0.086389 -0.000000 -0.000000 -0.000000  0.0  0.00000    0.0  0.000000   
4041 -0.036969 -0.000000 -0.000000 -0.000000  0.0  0.00000    0.0  0.000000   

        please         e  ...  connevey  jay  value

  tf_idf_train = tf_idf_train / length_train[:, np.newaxis]
  tf_idf_test = tf_idf_test / length_test[:, np.newaxis]


In [113]:
# calculate cosine similarity
cosine_similarity_train = np.dot(tf_idf_train, tf_idf_train.T)
cosine_similarity_test = np.dot(tf_idf_test, tf_idf_test.T)
print(cosine_similarity_train)
# here the cosine similarity is becoming nan for some reason 
print(cosine_similarity_test)

[[35.70310049 32.86078746 35.31351655 ... 33.03715742 30.37120052
  29.38610958]
 [32.86078746 31.65557094 33.2194943  ... 31.1996897  29.42434473
  28.21877557]
 [35.31351655 33.2194943  36.40539114 ... 33.2689734  31.31255408
  30.26072327]
 ...
 [33.03715742 31.1996897  33.2689734  ... 32.61482791 29.53718207
  28.70803462]
 [30.37120052 29.42434473 31.31255408 ... 29.53718207 39.41650278
  29.42456482]
 [29.38610958 28.21877557 30.26072327 ... 28.70803462 29.42456482
  30.26808985]]
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


ModuleNotFoundError: No module named 'sklearn.ensemble.HistGradientBoostingClassifier'