In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Download the Spambase dataset (UCI repository id 94).
spambase = fetch_ucirepo(id=94)

# Feature table and target table (pandas DataFrames, per the original note).
# NOTE: X and y are reassigned from the local CSV copy in a later cell.
X, y = spambase.data.features, spambase.data.targets

# Dataset-level metadata.
print(spambase.metadata)

# Per-variable description table.
print(spambase.variables)


{'uci_id': 94, 'name': 'Spambase', 'repository_url': 'https://archive.ics.uci.edu/dataset/94/spambase', 'data_url': 'https://archive.ics.uci.edu/static/public/94/data.csv', 'abstract': 'Classifying Email as Spam or Non-Spam', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 4601, 'num_features': 57, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1999, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C53G6X', 'creators': ['Mark Hopkins', 'Erik Reeber', 'George Forman', 'Jaap Suermondt'], 'intro_paper': None, 'additional_info': {'summary': 'The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...\n\nThe classification task for this dataset is to determine whether a given email is spam or not.\n\t\nOur collecti

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import re
import string


In [None]:
# Load the raw Spambase table; header=None because the file has no header row,
# so the columns are labelled with integer positions.
data = pd.read_csv('/content/spambase.data', header=None)

# Read the companion text files inside context managers so the file handles are
# closed deterministically instead of being left open until garbage collection.
with open('/content/spambase.DOCUMENTATION') as doc_file:
    documentation = doc_file.read()
with open('/content/spambase.names') as names_file:
    names = names_file.read()


In [None]:
# Every column except the last is a feature; the last column is the class label.
n_features = data.shape[1] - 1
X, y = data.iloc[:, :n_features], data.iloc[:, n_features]

In [None]:
# Preview the first five rows of the raw table as plain text.
print(data.head(n=5))

     0     1     2    3     4     5     6     7     8     9   ...    48  \
0  0.00  0.64  0.64  0.0  0.32  0.00  0.00  0.00  0.00  0.00  ...  0.00   
1  0.21  0.28  0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94  ...  0.00   
2  0.06  0.00  0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25  ...  0.01   
3  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   
4  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   

      49   50     51     52     53     54   55    56  57  
0  0.000  0.0  0.778  0.000  0.000  3.756   61   278   1  
1  0.132  0.0  0.372  0.180  0.048  5.114  101  1028   1  
2  0.143  0.0  0.276  0.184  0.010  9.821  485  2259   1  
3  0.137  0.0  0.137  0.000  0.000  3.537   40   191   1  
4  0.135  0.0  0.135  0.000  0.000  3.537   40   191   1  

[5 rows x 58 columns]


In [None]:
# Build one whitespace-separated string per row from the feature columns.
# The strings are derived from `data` rather than from `X` itself so that the cell
# is idempotent: a self-referential `X = X.apply(..., axis=1)` raises on a second
# run, because by then X is already a Series of strings.
# NOTE(review): these values are numeric features, not words. Feeding their string
# form to a text vectorizer discards their magnitude; consider training on the
# numeric columns directly.
X = data.iloc[:, :-1].apply(lambda row: ' '.join(row.astype(str)), axis=1)
print(X.head())

0    0.0 0.64 0.64 0.0 0.32 0.0 0.0 0.0 0.0 0.0 0.0...
1    0.21 0.28 0.5 0.0 0.14 0.28 0.21 0.07 0.0 0.94...
2    0.06 0.0 0.71 0.0 1.23 0.19 0.19 0.12 0.64 0.2...
3    0.0 0.0 0.0 0.0 0.63 0.0 0.31 0.63 0.31 0.63 0...
4    0.0 0.0 0.0 0.0 0.63 0.0 0.31 0.63 0.31 0.63 0...
dtype: object


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Learn the vocabulary and IDF weights from the training rows only, then project
# both splits with the same fitted vectorizer so no test information leaks in.
# NOTE(review): the inputs are space-joined numbers; with the default tokenizer
# settings they are presumably split at the decimal point — confirm that the
# resulting vocabulary is what is intended.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit(X_train).transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Logistic regression baseline with default hyperparameters.
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

In [None]:
# Multinomial naive Bayes baseline with default hyperparameters.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

In [None]:
# Support vector classifier baseline with default hyperparameters.
svm_model = SVC().fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)


In [None]:
def evaluate_model(y_test, y_pred, model_name):
    """Print a short classification report for one model.

    Prints, in order: a header with the model name, accuracy, precision,
    recall, F1 score and the confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
        model_name: Name shown in the report header.
    """
    print(f"Evaluating {model_name}")
    # Each scalar metric is computed and printed in turn, in this order.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_test, y_pred)}")
    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix: \n{matrix}\n")

In [None]:
# Report the three baseline models in the same order they were trained.
baseline_predictions = [
    ("Logistic Regression", y_pred_lr),
    ("Naive Bayes", y_pred_nb),
    ("Support Vector Machine", y_pred_svm),
]
for baseline_name, baseline_pred in baseline_predictions:
    evaluate_model(y_test, baseline_pred, baseline_name)

Evaluating Logistic Regression
Accuracy: 0.6796959826275787
Precision: 0.6954732510288066
Recall: 0.43333333333333335
F1 Score: 0.5339652448657188
Confusion Matrix: 
[[457  74]
 [221 169]]

Evaluating Naive Bayes
Accuracy: 0.6243213897937026
Precision: 0.75
Recall: 0.16923076923076924
F1 Score: 0.2761506276150627
Confusion Matrix: 
[[509  22]
 [324  66]]

Evaluating Support Vector Machine
Accuracy: 0.6959826275787188
Precision: 0.7594339622641509
Recall: 0.4128205128205128
F1 Score: 0.5348837209302325
Confusion Matrix: 
[[480  51]
 [229 161]]



In [None]:
# Search space for an RBF-kernel SVM: 4 values of C x 4 values of gamma = 16 candidates.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf'],
}
# refit=True retrains the best candidate on the whole training set, so `grid`
# itself can be used for prediction afterwards; verbose=2 logs every fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=2)
grid.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.2s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   1.3s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   1.3s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   1.1s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.9s
[CV] END ......................C=0.1, gamma=0.01

In [None]:
# Show the winning hyperparameters, then score the refitted best estimator on
# the held-out test split.
print(f"Best Parameters for SVM: {grid.best_params_}")
grid_predictions = grid.predict(X_test_tfidf)

evaluate_model(y_test=y_test, y_pred=grid_predictions, model_name="Tuned SVM")

Best Parameters for SVM: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Evaluating Tuned SVM
Accuracy: 0.7285559174809989
Precision: 0.7317880794701986
Recall: 0.5666666666666667
F1 Score: 0.638728323699422
Confusion Matrix: 
[[450  81]
 [169 221]]



In [None]:
def predict_email(text):
    """Classify a single piece of text as "Spam" or "Ham" with the tuned SVM.

    The text is transformed with the already-fitted module-level ``vectorizer``
    and classified by ``grid`` (the fitted grid search).

    NOTE(review): in this notebook the vectorizer is fitted on space-joined
    numeric feature strings, so free-form email text is unlikely to share any
    token with its vocabulary. When that happens the model is scoring an
    all-zero vector and the answer carries no information about the text.

    Args:
        text: The string to classify.

    Returns:
        "Spam" if the predicted class is 1, otherwise "Ham".

    Warns:
        UserWarning: If the transformed text has no non-zero features.
    """
    import warnings

    text_tfidf = vectorizer.transform([text])
    if text_tfidf.nnz == 0:
        # Make the degenerate case visible instead of silently returning a label.
        warnings.warn(
            "Input shares no tokens with the vectorizer's vocabulary; "
            "the model is classifying an all-zero feature vector.",
            UserWarning,
            stacklevel=2,
        )
    # predict() returns one label per input row; take the single element
    # explicitly rather than relying on the truthiness of a 1-element array.
    prediction = grid.predict(text_tfidf)[0]
    return "Spam" if prediction == 1 else "Ham"

In [None]:
# Try the classifier on a hand-written example.
email_text = "Free money!!!"
predicted_label = predict_email(email_text)
print(f"The email is classified as: {predicted_label}")

The email is classified as: Ham


In [None]:
import joblib  # needed by joblib.dump below; it was used without being imported
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# NOTE(review): 'data.csv' and 'target_column_name' look like template
# placeholders — replace them with the real file path and label column.
# This cell also rebinds data, X, y and the train/test splits used by earlier cells.
data = pd.read_csv('data.csv')
print(data.head())

# Separate the label column from the feature columns.
X = data.drop('target_column_name', axis=1)
y = data['target_column_name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fixed seed so the forest is reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Accuracy is computed and reported once (it was previously duplicated).
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Persist the fitted model to disk.
joblib.dump(rf_model, 'random_forest_model.pkl')