In [1]:
import pandas as pd
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
)
# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.metrics import classification_report, f1_score

from os import cpu_count


In [2]:
def split_feature_label(df, label_col):
    """Separate a frame into its predictor columns and its target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the predictors together with ``label_col``.
    label_col : str
        Name of the column holding the target values.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is ``df`` without
        ``label_col`` and ``labels`` is ``df[label_col]``. ``df`` itself is
        left unmodified.
    """
    features = df.drop(columns=label_col)
    labels = df[label_col]
    return features, labels

def test_lr_on(df_train, df_test, feature_col):
    """Fit an L1-penalised logistic regression on ``df_train`` and score it on ``df_test``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the predictor columns plus the target column.
    feature_col : str
        Name of the *target* column (despite the parameter name). It is
        removed from the predictors and used as ``y``.

    Returns
    -------
    float
        Result of ``LogisticRegression.score`` on the test split
        (mean accuracy for scikit-learn classifiers).
    """
    X_train, y_train = split_feature_label(df_train, feature_col)
    X_test, y_test = split_feature_label(df_test, feature_col)
    # penalty='l1' is not supported by the 'lbfgs' solver, which is the
    # default from scikit-learn 0.22 on (fit raises ValueError). Pin
    # 'liblinear' -- the default in earlier releases -- so the model is the
    # same one as before and the cell runs on current versions as well.
    lr = LogisticRegression(
        random_state=42, C=0.69, penalty='l1', solver='liblinear'
    )
    lr.fit(X_train, y_train)
    return lr.score(X_test, y_test)

def test_rf_on(df_train, df_test, feature_col):
    """Fit a random forest on ``df_train`` and score it on ``df_test``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the predictor columns plus the target column.
    feature_col : str
        Name of the *target* column (despite the parameter name). It is
        removed from the predictors and used as ``y``.

    Returns
    -------
    float
        Result of ``RandomForestClassifier.score`` on the test split
        (mean accuracy for scikit-learn classifiers).
    """
    train_features, train_labels = split_feature_label(df_train, feature_col)
    test_features, test_labels = split_feature_label(df_test, feature_col)
    # Fixed seed keeps the forest reproducible between runs.
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
    ).fit(train_features, train_labels)
    return forest.score(test_features, test_labels)


In [3]:
# "ae"-compressed spam features (presumably autoencoder output -- confirm)
# at 50, 100 and 500 dimensions; files are read in train/test order and
# indexed by the 'id_of_the_text' column.
(
    df_50_train, df_50_test,
    df_100_train, df_100_test,
    df_500_train, df_500_test,
) = (
    pd.read_csv(csv_path, index_col='id_of_the_text')
    for csv_path in (
        "spam_train_ae_compressed_to_50_features.csv",
        "spam_test_ae_compressed_to_50_features.csv",
        "spam_train_ae_compressed_to_100_features.csv",
        "spam_test_ae_compressed_to_100_features.csv",
        "spam_train_ae_compressed_to_500_features.csv",
        "spam_test_ae_compressed_to_500_features.csv",
    )
)

In [4]:
# Name of the target column; handed to the test_*_on helpers, which split it
# off from the predictor columns.
label = 'Label'

In [5]:
# Logistic-regression test score on the 50-, 100- and 500-feature
# "ae"-compressed sets, one line of output per set.
for ae_train, ae_test in (
    (df_50_train, df_50_test),
    (df_100_train, df_100_test),
    (df_500_train, df_500_test),
):
    print(test_lr_on(ae_train, ae_test, label))

0.5493716337522442
0.4781567923399162
0.38539796529024534




In [6]:
# Random-forest test score on the 50-, 100- and 500-feature
# "ae"-compressed sets, one line of output per set.
for ae_train, ae_test in (
    (df_50_train, df_50_test),
    (df_100_train, df_100_test),
    (df_500_train, df_500_test),
):
    print(test_rf_on(ae_train, ae_test, label))

0.6929982046678635
0.7498503889886295
0.6630760023937762


In [8]:
# Bag-of-words features reduced with SVD to 50, 100 and 500 components
# (per the file names); files are read in train/test order and indexed by
# the CSV's 'Unnamed: 0' column.
(
    df_50_svd_train, df_50_svd_test,
    df_100_svd_train, df_100_svd_test,
    df_500_svd_train, df_500_svd_test,
) = (
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in (
        "spam_train_bag_of_words_with_svd_50.csv",
        "spam_test_bag_of_words_with_svd_50.csv",
        "spam_train_bag_of_words_with_svd_100.csv",
        "spam_test_bag_of_words_with_svd_100.csv",
        "spam_train_bag_of_words_with_svd_500.csv",
        "spam_test_bag_of_words_with_svd_500.csv",
    )
)

In [9]:
# Logistic-regression test score on the 50-, 100- and 500-component SVD
# sets, one line of output per set.
for svd_train, svd_test in (
    (df_50_svd_train, df_50_svd_test),
    (df_100_svd_train, df_100_svd_test),
    (df_500_svd_train, df_500_svd_test),
):
    print(test_lr_on(svd_train, svd_test, label))

0.8659485338120886
0.8653500897666068
0.8641532016756434




In [10]:
# Random-forest test score on the 50-, 100- and 500-component SVD sets,
# one line of output per set.
for svd_train, svd_test in (
    (df_50_svd_train, df_50_svd_test),
    (df_100_svd_train, df_100_svd_test),
    (df_500_svd_train, df_500_svd_test),
):
    print(test_rf_on(svd_train, svd_test, label))

0.8539796529024536
0.8635547576301615
0.8713345302214243


# Original dataset

In [13]:
# Training split read from spam_train_bag_of_words.csv (no dimensionality
# reduction in the file name); the CSV's 'Unnamed: 0' column becomes the index.
spam_train = pd.read_csv("spam_train_bag_of_words.csv", index_col='Unnamed: 0')

In [14]:
# Test split read from spam_test_bag_of_words.csv; the CSV's 'Unnamed: 0'
# column becomes the index.
spam_test = pd.read_csv("spam_test_bag_of_words.csv", index_col='Unnamed: 0')

In [15]:
# Preview the first rows of the test split to check the column layout.
spam_test.head()

Unnamed: 0,Label,go,jurong,point,crazi,avail,bugi,n,great,world,...,guy_bitch,bitch_act,act_like,like_interest,interest_buy,buy_someth,els_next,week_gave,gave_us,us_free
text1,ham,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
text2,spam,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
text3,ham,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
text4,ham,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
text5,spam,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Target column name, restated so this section does not rely on the earlier cell.
label = 'Label'

# Score the uncompressed features: random forest first, then logistic
# regression, one line of output each.
for evaluate in (test_rf_on, test_lr_on):
    print(evaluate(spam_train, spam_test, label))

0.8659485338120886




0.8886894075403949
