# StandardScaler only


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Remove duplicates
data.drop_duplicates(inplace=True)

# Split into features (X) and labels (y)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Preprocess the data (standardize the features)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Train the KNN model
kValue = 5
knn = KNeighborsClassifier(n_neighbors=kValue)
knn.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = knn.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))


Accuracy: 0.8836104513064132
Precision: 0.8942598187311178
Recall: 0.8245125348189415
F1 score: 0.8579710144927535


# Remove duplications, StandardScaler 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Remove duplicates
data.drop_duplicates(inplace=True)

# Split into features (X) and labels (y)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler

# Preprocess the data (standardize the features)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the KNN model
kValue = 5
knn = KNeighborsClassifier(n_neighbors=kValue)
knn.fit(X_train, y_train)

# Evaluate the model
y_pred = knn.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

Accuracy: 0.8836104513064132
Precision: 0.8942598187311178
Recall: 0.8245125348189415
F1 score: 0.8579710144927535


# PCA, StandardScaler

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Split into features (X) and labels (y)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Preprocess the data (standardize the features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Apply PCA to reduce dimensionality
pca = PCA(n_components=50)  # choose the number of principal components to keep
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Step 5: Train the KNN model
kValue = 5
knn = KNeighborsClassifier(n_neighbors=kValue)
knn.fit(X_train_pca, y_train)

# Step 6: Evaluate the model
y_pred = knn.predict(X_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))


Accuracy: 0.8957654723127035
Precision: 0.8972972972972973
Recall: 0.8512820512820513
F1 score: 0.8736842105263158


# Robust Scaler Only

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler

# Load the dataset into a pandas DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Instantiate a DecisionTreeClassifier object with the desired hyperparameters
out = DecisionTreeClassifier(max_depth=5)

# Train the decision tree classifier using the training set
out.fit(X_train, y_train)

# Use the trained model to make predictions on the testing set
y_pred = out.predict(X_test)

# Evaluate the performance of the model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))


Accuracy: 0.9196234612599565
Precision: 0.9172932330827067
Recall: 0.8792792792792793
F1-score: 0.8978840846366145


# Robust Scaler, Remove duplication

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler

# Load the dataset into a pandas DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Remove duplicates
data.drop_duplicates(inplace=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Instantiate a DecisionTreeClassifier object with the desired hyperparameters
out = DecisionTreeClassifier(max_depth=5)

# Train the decision tree classifier using the training set
out.fit(X_train, y_train)

# Use the trained model to make predictions on the testing set
y_pred = out.predict(X_test)

# Evaluate the performance of the model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))


Accuracy: 0.9136975455265242
Precision: 0.9232456140350878
Recall: 0.8505050505050505
F1-score: 0.8853838065194533


# RobustScaler, Remove Duplications, PCA

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# Load the dataset into a pandas DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.3)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Instantiate a PCA object with the desired number of components
pca = PCA(n_components=10)

# Fit the PCA object on the training set
X_train = pca.fit_transform(X_train)

# Transform the testing set using the fitted PCA object
X_test = pca.transform(X_test)

# Instantiate a DecisionTreeClassifier object with the desired hyperparameters
out = DecisionTreeClassifier(max_depth=5)

# Train the decision tree classifier using the training set
out.fit(X_train, y_train)

# Use the trained model to make predictions on the testing set
y_pred = out.predict(X_test)

# Evaluate the performance of the model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))


Accuracy: 0.8740043446777698
Precision: 0.8080985915492958
Recall: 0.8759541984732825
F1-score: 0.8406593406593406


In [7]:
import pandas as pd
from scipy.stats import shapiro

# Load the dataset into a pandas DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Perform the Shapiro-Wilk test on each feature
for i in range(data.shape[1]):
    stat, p = shapiro(data.iloc[:, i])
    if p > 0.05:
        print(f"Feature {i} is normally distributed (p={p:.3f})")
    else:
        print(f"Feature {i} is not normally distributed (p={p:.3f})")

Feature 0 is not normally distributed (p=0.000)
Feature 1 is not normally distributed (p=0.000)
Feature 2 is not normally distributed (p=0.000)
Feature 3 is not normally distributed (p=0.000)
Feature 4 is not normally distributed (p=0.000)
Feature 5 is not normally distributed (p=0.000)
Feature 6 is not normally distributed (p=0.000)
Feature 7 is not normally distributed (p=0.000)
Feature 8 is not normally distributed (p=0.000)
Feature 9 is not normally distributed (p=0.000)
Feature 10 is not normally distributed (p=0.000)
Feature 11 is not normally distributed (p=0.000)
Feature 12 is not normally distributed (p=0.000)
Feature 13 is not normally distributed (p=0.000)
Feature 14 is not normally distributed (p=0.000)
Feature 15 is not normally distributed (p=0.000)
Feature 16 is not normally distributed (p=0.000)
Feature 17 is not normally distributed (p=0.000)
Feature 18 is not normally distributed (p=0.000)
Feature 19 is not normally distributed (p=0.000)
Feature 20 is not normally dis