<a href="https://colab.research.google.com/github/Sankeetha-Elancheliyan/CM2604-ML-CW/blob/main/DT_Further_Enhancement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import xgboost as xgb

# Load the dataset
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data', header=None)

# Split the data into features and target
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Scale the features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform PCA on the scaled data
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=0)

# Set the hyperparameter grid for grid search
param_grid = {
    'max_depth': [2, 4, 6],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200]
}

# Perform grid search with cross-validation
xgb_model = xgb.XGBClassifier(random_state=0)
grid_search = GridSearchCV(xgb_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best XGBoost classifier from grid search
xgb_model = grid_search.best_estimator_

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics and confusion matrix
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.91
Precision: 0.89
Recall: 0.88
F1-score: 0.88
Confusion Matrix:
[[762  60]
 [ 69 490]]


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data', header=None)

# Split the data into features and target
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Define the base classifier
base_clf = DecisionTreeClassifier(max_depth=1)

# Define the AdaBoost classifier
ada_clf = AdaBoostClassifier(base_estimator=base_clf, n_estimators=200, learning_rate=0.1, random_state=0)

# Train the AdaBoost classifier
ada_clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ada_clf.predict(X_test)

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics and confusion matrix
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)




Accuracy: 0.94
Precision: 0.96
Recall: 0.89
F1-score: 0.92
Confusion Matrix:
[[801  21]
 [ 64 495]]
