In [1]:
import pandas as pd
import seaborn as sns

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report

In [6]:
# Load the preprocessed corpus and separate it by class label.
df = pd.read_csv("./data/preprocessed_spam_ham.csv")

label_col = df["label"]
df_spam = df.loc[label_col == "spam"]
df_ham = df.loc[label_col == "ham"]

# Balance the classes: draw, without replacement, as many ham rows as there
# are spam rows (seeded so the same subset is drawn on every run).
spam_count = len(df_spam)
df_ham_downsampled = resample(
    df_ham,
    replace=False,
    n_samples=spam_count,
    random_state=42,
)
resampled_df = pd.concat([df_ham_downsampled, df_spam])

# Inputs are the `text` column as a plain list; targets are `label_num`.
x = resampled_df["text"].tolist()
y = resampled_df["label_num"]

# NOTE(review): the vectorizer is fitted on every row of the balanced set, so
# any cross-validation run on `x_transformed` afterwards scores folds whose
# documents already contributed to the vocabulary/IDF weights. Consider a
# Pipeline(TfidfVectorizer -> classifier) fitted on the raw text instead.
vectorizer = TfidfVectorizer()
x_transformed = vectorizer.fit_transform(x)

In [14]:
# Randomized hyperparameter search for a random-forest classifier on the
# TF-IDF features.
#
# Both the forest and the search are stochastic. They are seeded with the same
# value already used for the downsampling step so that re-running the notebook
# samples the same candidates and reproduces best_params_ / best_score_.
cl = RandomForestClassifier(random_state=42)

# Search space: every candidate is one combination drawn from these values.
hp = {
    'criterion': ("gini", "entropy"),
    'max_depth': [1, 3, 5, 10, 20],
    'min_samples_split': [20, 40, 200, 1000],
    'min_samples_leaf': [10, 20, 100, 500]
}

# n_iter, cv and scoring are left at the library defaults
# (presumably 10 sampled candidates, 5-fold CV, estimator's own scorer --
# confirm against the installed scikit-learn version).
clf = RandomizedSearchCV(cl, hp, random_state=42)

# fit() returns the fitted search object itself, so `search` and `clf` refer
# to the same instance.
search = clf.fit(x_transformed, y)

In [16]:
# Show the hyperparameter combination that the fitted search selected as best.
search.best_params_

{'min_samples_split': 40,
 'min_samples_leaf': 20,
 'max_depth': 10,
 'criterion': 'entropy'}

In [None]:
# Show the mean cross-validated score of the best candidate. No `scoring` was
# passed to the search, so this is the estimator's default metric
# (presumably accuracy for a classifier -- confirm in the scikit-learn docs).
search.best_score_