In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the spam/ham e-mail dataset from a path relative to the working directory.
# NOTE: the file carries a saved row index that pandas surfaces as an
# 'Unnamed: 0' column (visible when the frame is displayed).
csv_path = "Spam_Ham_data.csv"
data = pd.read_csv(csv_path)

In [3]:
# Display the loaded frame (columns: Unnamed: 0, email, label, Subject, content).
data

Unnamed: 0,email,label,Subject,content
0,b'From exmh-workers-admin@redhat.com Thu Aug ...,0.0,new sequenc window,date wed number aug number number number chri ...
1,b'Return-Path: <Online#3.19578.34-UgGTgZFN19NA...,0.0,cnet newscom cabl compani crack wifi,htmlhead titlec compani crack nametopa logo ad...
2,b'Return-Path: <Online#3.19584.83-p1SYlJ1blFvQ...,0.0,save extra number ipaq number pda cnet shopper,htmlhead newslett hreftopa header tabl widthnu...
3,b'From Steve_Burt@cursor-system.com Thu Aug 2...,0.0,zzzzteana alexand,martin greek sculptor behind plan judg limesto...
4,b'Return-Path: <Online#3.19586.b5-9w0blztbvHPd...,0.0,week deck texedit plu boom,cnet download dispatchmac editionjuli number n...
...,...,...,...,...
3058,b'From 2002biz2biz2513@Flashmail.com Mon Oct ...,1.0,fwddirect market work number,stumbl greatest way market centuri undoubtedli...
3059,b'From biz2biz2446@Flashmail.com Mon Oct 7 2...,1.0,see compani sale sky rocket number,stumbl greatest way market centuri undoubtedli...
3060,b'From bounce2@u-answer.com Tue Oct 8 11:02:...,1.0,number hour watch emmerci joke,frontpag numbermeta nameprogid vlinknumb align...
3061,b'From beautyinfufuxxxmeb13mxy@aol.com Tue Oc...,1.0,make fortun ebay number,htmlbodi tr td p number rate work home busi bf...


In [4]:
# Fill missing values only in the two text columns that are later joined
# into the model input. A blanket fillna('') would also write '' into
# `label` (turning a missing target into a bogus string class) and would
# coerce any numeric column containing NaN to object dtype.
data = data.fillna({'Subject': '', 'content': ''})

In [5]:
# Features: subject line and body text joined by a single space.
subject_text = data['Subject']
body_text = data['content']
X = subject_text + ' ' + body_text

# Target: the `label` column (0.0 / 1.0 in the displayed rows).
y = data['label']

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
  pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [8]:
# Candidate hyperparameter values for the randomized search. Keys follow the
# `<pipeline step>__<parameter>` naming convention.
param_dist = dict(
    # TF-IDF vectorizer: unigrams only vs. unigrams + bigrams, vocabulary cap.
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    vectorizer__max_features=[500, 1000, 2000],
    # Random forest: number of trees, depth limit (None = unlimited),
    # and minimum samples required to split a node.
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 5, 10],
    classifier__min_samples_split=[2, 5, 10],
)

In [9]:
# Sample 10 parameter combinations from `param_dist` and score each with
# 5-fold cross-validation on the training split; the fixed seed makes the
# sampled combinations reproducible. (RandomizedSearchCV is imported with the
# other sklearn imports in the first cell.)
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

In [10]:
# Pull the winning parameter settings and the corresponding pipeline out of
# the finished search.
best_params = random_search.best_params_
best_model = random_search.best_estimator_


In [11]:
# No second fit is needed: the search was created with the default
# refit=True, so `best_estimator_` has already been trained on all of
# X_train using the best parameters. Re-fitting here only repeated that
# work. Show the fitted pipeline as the cell output instead.
best_model


In [12]:
# Predict labels for the held-out messages with the tuned pipeline.
y_pred = best_model.predict(X=X_test)

In [13]:
# Share of held-out messages whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [14]:
# Report the test-set accuracy and the hyperparameters chosen by the search.
for caption, value in (
    ("Accuracy:", accuracy),
    ("Best Hyperparameters:", best_params),
):
    print(caption, value)


Accuracy: 0.9690048939641109
Best Hyperparameters: {'vectorizer__ngram_range': (1, 2), 'vectorizer__max_features': 2000, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 5, 'classifier__max_depth': None}
