In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Path of the spam dataset CSV, resolved relative to the working directory.
file_path = 'spam.csv'

# Parse the CSV, decoding the file as Latin-1.
data = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')

# Print the first five rows as a quick check of the parsed columns.
print(data.head(n=5))

   make  address   all   3d   our  over  remove  internet  order  mail  ...  \
0  0.00     0.00  0.29  0.0  0.00  0.00    0.00      0.00   0.00  0.00  ...   
1  0.46     0.00  0.00  0.0  0.00  0.00    0.00      0.00   0.00  0.00  ...   
2  0.00     0.00  0.00  0.0  0.00  0.00    0.00      0.00   0.00  0.00  ...   
3  0.33     0.44  0.37  0.0  0.14  0.11    0.00      0.07   0.97  1.16  ...   
4  0.00     2.08  0.00  0.0  3.12  0.00    1.04      0.00   0.00  0.00  ...   

   semicol  paren  bracket   bang  dollar  pound  cap_avg  cap_long  \
0    0.000  0.178      0.0  0.044   0.000   0.00    1.666        10   
1    0.000  0.125      0.0  0.000   0.000   0.00    1.510        10   
2    0.000  0.000      0.0  0.000   0.000   0.00    1.718        11   
3    0.006  0.159      0.0  0.069   0.221   0.11    3.426        72   
4    0.000  0.000      0.0  0.263   0.000   0.00    1.428         4   

   cap_total  Class  
0        180    ham  
1         74    ham  
2         55    ham  
3        8

In [None]:
# Replace the string labels in 'Class' with integer codes, overwriting the column.
label_encoder = LabelEncoder()
data['Class'] = label_encoder.fit_transform(data['Class'])

# Target is the encoded 'Class' column; features are every remaining column.
y = data['Class']
X = data.drop(columns=['Class'])

# Fixed, unshuffled partition by row position: the first 1000 rows train,
# all rows after them are held out for testing.
X_train_initial, y_train_initial = X.iloc[:1000], y.iloc[:1000]
X_test_initial, y_test_initial = X.iloc[1000:], y.iloc[1000:]

In [None]:
# Base estimator: a depth-1 decision tree with a fixed seed.
base_learner = DecisionTreeClassifier(random_state=42, max_depth=1)

# AdaBoost wrapper around that tree, configured with n_estimators=50 and the same seed.
adaboost_clf = AdaBoostClassifier(
    estimator=base_learner,
    random_state=42,
    n_estimators=50,
)

In [None]:
# Train on the initial training rows, then predict the held-out rows.
# fit() hands back the estimator, so the two calls can be chained.
y_pred_initial = adaboost_clf.fit(X_train_initial, y_train_initial).predict(X_test_initial)

# Print each evaluation, in order, under its heading; every metric function
# takes (true labels, predicted labels).
for heading, metric in (
    ("Initial Split (1000/3601) Accuracy:", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
):
    print(heading, metric(y_test_initial, y_pred_initial))

Initial Split (1000/3601) Accuracy: 0.9197445154123854

Confusion Matrix:
 [[2049  133]
 [ 156 1263]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93      2182
           1       0.90      0.89      0.90      1419

    accuracy                           0.92      3601
   macro avg       0.92      0.91      0.92      3601
weighted avg       0.92      0.92      0.92      3601



In [None]:
# Train/test sweep: for each training fraction, refit the AdaBoost model on a
# seeded random split and record its test-set metrics, then print a summary.

# Fraction of the data used for training in each experiment.
splits = [0.5, 0.6, 0.7, 0.8]
# Maps training fraction -> dict with 'Accuracy', 'Classification Report'
# (as a nested dict) and 'Confusion Matrix'.
results = {}

for split in splits:
    # Hold out the complement of the training fraction; the fixed seed makes
    # each partition reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-split, random_state=42)

    # Refit the shared classifier on this split's training portion.
    adaboost_clf.fit(X_train, y_train)

    y_pred = adaboost_clf.predict(X_test)

    results[split] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred, output_dict=True),
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }

for split, metrics in results.items():
    # Use round() rather than int(): (1 - 0.8) * 100 evaluates to
    # 19.999999999999996 in binary floating point, which int() truncates to 19
    # and would print "Split 80-19%". Deriving the test share as the
    # complement guarantees the two percentages always sum to 100.
    train_pct = round(split * 100)
    test_pct = 100 - train_pct
    print(f"\nSplit {train_pct}-{test_pct}%:")
    print("Accuracy:", metrics['Accuracy'])
    print("\nConfusion Matrix:\n", metrics['Confusion Matrix'])
    print("\nClassification Report:")
    for label, report in metrics['Classification Report'].items():
        # The report dict also holds a scalar 'accuracy' entry; only the
        # dict-valued entries (per-class rows and the averages) carry
        # precision/recall/f1, so everything else is skipped.
        if isinstance(report, dict):
            print(f"  Class {label}:")
            print(f"    Precision: {report['precision']:.2f}")
            print(f"    Recall: {report['recall']:.2f}")
            print(f"    F1-score: {report['f1-score']:.2f}")


Split 50-50%:
Accuracy: 0.9317687961755758

Confusion Matrix:
 [[1308   93]
 [  64  836]]

Classification Report:
  Class 0:
    Precision: 0.95
    Recall: 0.93
    F1-score: 0.94
  Class 1:
    Precision: 0.90
    Recall: 0.93
    F1-score: 0.91
  Class macro avg:
    Precision: 0.93
    Recall: 0.93
    F1-score: 0.93
  Class weighted avg:
    Precision: 0.93
    Recall: 0.93
    F1-score: 0.93

Split 60-40%:
Accuracy: 0.9359043997827268

Confusion Matrix:
 [[1060   74]
 [  44  663]]

Classification Report:
  Class 0:
    Precision: 0.96
    Recall: 0.93
    F1-score: 0.95
  Class 1:
    Precision: 0.90
    Recall: 0.94
    F1-score: 0.92
  Class macro avg:
    Precision: 0.93
    Recall: 0.94
    F1-score: 0.93
  Class weighted avg:
    Precision: 0.94
    Recall: 0.94
    F1-score: 0.94

Split 70-30%:
Accuracy: 0.9283128167994207

Confusion Matrix:
 [[776  61]
 [ 38 506]]

Classification Report:
  Class 0:
    Precision: 0.95
    Recall: 0.93
    F1-score: 0.94
  Class 1:
    Pre