In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

In [2]:
# Read the cleaned-data CSV from the local data directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./data/CleanedData.csv")

In [3]:
# Pull the target column ("label") out as y; every remaining column stays in X.
X = df.drop(columns=['label'])
y = df['label']

In [4]:
# Hold out 20% of the rows for testing and keep the other 80% for training.
# random_state is fixed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

In [5]:
# Display the column index of the loaded DataFrame to see which fields are available.
df.columns

Index(['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls',
       'emailDomain', 'generalConsumer', 'govDomain', 'eduDomain', 'orgDomain',
       'netDomain', 'otherDomain', 'html', 'punctuationCount', 'fullContent',
       'subjectLength', 'bodyLength', 'totalLength'],
      dtype='object')

In [9]:
# Build the feature matrix X: the selected non-text columns of `df` stacked
# side by side with TF-IDF features computed from the 'fullContent' column.
#
# NOTE(review): the vectorizer is fit on every row of `df`, and `X` is rebuilt
# here *after* the earlier train/test split, so the existing X_train/X_test do
# not come from this matrix. Re-split this `X` (or fit the vectorizer on the
# training rows only) before modelling, otherwise test-set vocabulary/IDF
# statistics leak into training -- TODO confirm downstream cells handle this.
EXTRA_FEATURE_COLUMNS = ["urls", "totalLength", "generalConsumer", "govDomain", "eduDomain", "orgDomain", "netDomain", "otherDomain", "html", "punctuationCount"]

tfidf_vectorizer = TfidfVectorizer(max_features=10000, max_df=0.95, min_df=2)
# Empty text cells may come back from a CSV as NaN, which TfidfVectorizer
# rejects as an invalid document; treat them as empty documents instead.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['fullContent'].fillna(""))

# Labelled view of the TF-IDF matrix (one column per vocabulary term).
# Kept sparse: densifying n_rows x up-to-10000 floats can exhaust memory.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Convert the extra columns to a sparse matrix so they can be stacked with TF-IDF.
sparse_features = csr_matrix(df[EXTRA_FEATURE_COLUMNS].values)

# Column order: extra features first, then the TF-IDF terms.
# CSR output (hstack defaults to COO) so X supports row indexing/slicing.
X = hstack([sparse_features, tfidf_matrix], format="csr")