# Spam Detector

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Retrieve the Data

In [2]:
# Load the spam dataset from a CSV file located next to the notebook.
data = pd.read_csv('spam-data.csv')

In [3]:
# Preview the first rows to confirm the file loaded with the expected columns.
data.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


## Split the Data into Training and Testing Sets

In [4]:
# The `spam` column is the label; every remaining column is a feature.
y = data.loc[:, 'spam']
X = data.drop('spam', axis='columns')

In [5]:
# Inspect the class balance of the label before splitting.
y.value_counts()

spam
0    2788
1    1813
Name: count, dtype: int64

In [6]:
# Hold out 30% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore the resulting split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

## Scale the Features

In [7]:
# Scaler instance shared by the training and testing transforms below.
scaler = StandardScaler()

In [8]:
# Fit the scaler on the training features only, then transform them.
X_train_scaled = scaler.fit_transform(X_train)

In [9]:
# Transform (do not re-fit) the test features, so the scaling parameters come
# solely from the training set and nothing leaks from the held-out data.
X_test_scaled = scaler.transform(X_test)

## Create and Evaluate Models

### Logistic Regression Model

In [10]:
# Build and train the logistic regression classifier on the scaled training
# features; `fit` returns the estimator itself, so it can be chained.
log_model = LogisticRegression(random_state=1).fit(X_train_scaled, y_train)
# Leave the fitted estimator as the cell's displayed value.
log_model

In [11]:
# Predict labels for the scaled, held-out test features.
log_preds = log_model.predict(X_test_scaled)

In [12]:
# Score the logistic regression predictions against the true test labels.
log_accuracy = accuracy_score(y_true=y_test, y_pred=log_preds)
print(
    f"Logistic Regression Accuracy: {log_accuracy}"
)

Logistic Regression Accuracy: 0.9261404779145547


### Random Forest Model

In [13]:
# Build and train the random forest on the same scaled training features used
# for the logistic regression, so the two scores are directly comparable.
# `fit` returns the estimator itself, so it can be chained.
rf_model = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
# Leave the fitted estimator as the cell's displayed value.
rf_model

In [14]:
# Predict labels for the scaled, held-out test features.
rf_preds = rf_model.predict(X_test_scaled)

In [15]:
# Score the random forest predictions against the true test labels.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_preds)
print(
    f"Random Forest Accuracy: {rf_accuracy}"
)

Random Forest Accuracy: 0.9594496741491673


## Compare Models

### Which model performed better?
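The Random Forest model performed better: it reached a test accuracy of about 0.959, compared with about 0.926 for Logistic Regression.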

### Did the results align with your expectations?
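Yes. A random forest can capture non-linear relationships and interactions between features that a linear model such as logistic regression cannot, so it was expected to achieve the higher accuracy.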