In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

## Data Prepare

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
dataset_path = '/content/drive/MyDrive/Colab Notebooks/'

In [4]:
# Load and combine CSV files
w1 = pd.read_csv(dataset_path + 'w1.csv')
w2 = pd.read_csv(dataset_path + 'w2.csv')
w3 = pd.read_csv(dataset_path + 'w3.csv')
w4 = pd.read_csv(dataset_path + 'w4.csv')

# Combine all files
combined_data = pd.concat([w1, w2, w3, w4], ignore_index=True)
combined_data.to_csv('combined_data.csv', index=False)

In [5]:
# Shuffle the data
all_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)
all_data.to_csv('all_data.csv', index=False)

## MODEL TRAINING (Basic SVM)

In [6]:
# Separate features and target
X = all_data.iloc[:, :-1]
y = all_data.iloc[:, -1]
print(f"Data shape: {all_data.shape}")
print(f"Features shape: {X.shape}, Target shape: {y.shape}")

results = {}

# 2a) Train-Test Split (70-30)
print("Running train-test split...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

# Scale the features
print("Scaling features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Feature scaling completed")

print("Training basic SVM (linear kernel)...")
clf_basic = svm.SVC(random_state=1)
clf_basic.fit(X_train_scaled, y_train)
print("Basic SVM training completed")

print("Making predictions...")
y_pred_basic = clf_basic.predict(X_test_scaled)
accuracy_basic_split = accuracy_score(y_test, y_pred_basic)
print(f"Train-test accuracy: {accuracy_basic_split:.4f}")

results['Original features'] = {'train_test': accuracy_basic_split}

Data shape: (11629, 157)
Features shape: (11629, 156), Target shape: (11629,)
Running train-test split...
Train set: (8140, 156), Test set: (3489, 156)
Scaling features...
Feature scaling completed
Training basic SVM (linear kernel)...
Basic SVM training completed
Making predictions...
Train-test accuracy: 0.9152


In [8]:
# 2b) 10-fold Cross-validation
print("Running 10-fold cross-validation...")
X_scaled = StandardScaler().fit_transform(X)
clf_basic = svm.SVC(random_state=1)
print("Starting cross-validation (this may take a moment)...")
scores_basic_cv = cross_val_score(clf_basic, X_scaled, y, cv=10)
accuracy_basic_cv = scores_basic_cv.mean()
print(f"Cross-validation completed. Average accuracy: {accuracy_basic_cv:.4f}")

results['Original features']['cross_val'] = accuracy_basic_cv

Running 10-fold cross-validation...
Starting cross-validation (this may take a moment)...
Cross-validation completed. Average accuracy: 0.9138


## HYPERPARAMETER TUNING

In [9]:
# Define parameter grid for RBF kernel
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf']
}
print(f"Parameter grid: {len(param_grid['C']) * len(param_grid['gamma'])} combinations to test")

# Grid search with cross-validation
print("Starting GridSearchCV (this will take several minutes)...")
grid_search = GridSearchCV(svm.SVC(random_state=1), param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_scaled, y_train)
print("GridSearchCV completed!")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Train with optimal hyperparameters
print("Training with optimal parameters...")
best_clf = grid_search.best_estimator_

Parameter grid: 24 combinations to test
Starting GridSearchCV (this will take several minutes)...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
GridSearchCV completed!
Best parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Best CV score: 0.9125
Training with optimal parameters...


In [10]:
# 3a) Train-Test with tuned hyperparameters
y_pred_tuned = best_clf.predict(X_test_scaled)
accuracy_tuned_split = accuracy_score(y_test, y_pred_tuned)
print(f"Tuned model train-test accuracy: {accuracy_tuned_split:.4f}")
results['With hyper-parameter tuning'] = {'train_test': accuracy_tuned_split}

Tuned model train-test accuracy: 0.9175


In [11]:
# 3b) Cross-validation with tuned hyperparameters
print("Running cross-validation with tuned parameters...")
clf_tuned_cv = svm.SVC(**grid_search.best_params_, random_state=1)
scores_tuned_cv = cross_val_score(clf_tuned_cv, X_scaled, y, cv=10, verbose=1)
accuracy_tuned_cv = scores_tuned_cv.mean()
print(f"Tuned model cross-validation accuracy: {accuracy_tuned_cv:.4f}")
results['With hyper-parameter tuning']['cross_val'] = accuracy_tuned_cv


Running cross-validation with tuned parameters...
Tuned model cross-validation accuracy: 0.9158


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   59.3s finished


## FEATURE SELECTION

In [12]:
print("Selecting top 100 features...")
selector = SelectKBest(score_func=f_classif, k=100)
X_selected = selector.fit_transform(X_scaled, y)
print(f"Features reduced from {X.shape[1]} to {X_selected.shape[1]}")

# Split the selected features
print("Splitting selected features...")
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(X_selected, y, test_size=0.3, random_state=1)

Selecting top 100 features...
Features reduced from 156 to 100
Splitting selected features...


In [13]:
# 4a) Train-Test with feature selection
print("Training SVM with selected features...")
clf_selected = svm.SVC(**grid_search.best_params_, random_state=1)
clf_selected.fit(X_train_sel, y_train_sel)
y_pred_selected = clf_selected.predict(X_test_sel)
accuracy_selected_split = accuracy_score(y_test_sel, y_pred_selected)
print(f"Feature selection train-test accuracy: {accuracy_selected_split:.4f}")
results['With feature selection and hyper parameter tuning'] = {'train_test': accuracy_selected_split}

Training SVM with selected features...
Feature selection train-test accuracy: 0.9089


In [14]:
# 4b) Cross-validation with feature selection
print("Running cross-validation with selected features...")
scores_selected_cv = cross_val_score(clf_selected, X_selected, y, cv=10)
accuracy_selected_cv = scores_selected_cv.mean()
print(f"Feature selection cross-validation accuracy: {accuracy_selected_cv:.4f}")
results['With feature selection and hyper parameter tuning']['cross_val'] = accuracy_selected_cv

Running cross-validation with selected features...
Feature selection cross-validation accuracy: 0.9082


## DIMENSIONALITY REDUCTION (PCA)

In [15]:
# Apply PCA to get 10 principal components
print("Applying PCA to reduce to 10 components...")
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)
print(f"PCA completed. Explained variance: {pca.explained_variance_ratio_.sum():.4f}")

# Split the PCA features
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=1)


Applying PCA to reduce to 10 components...
PCA completed. Explained variance: 0.7205


In [16]:
# 5a) Train-Test with PCA
print("Training SVM with PCA features...")
clf_pca = svm.SVC(**grid_search.best_params_, random_state=1)
clf_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = clf_pca.predict(X_test_pca)
accuracy_pca_split = accuracy_score(y_test_pca, y_pred_pca)
print(f"PCA train-test accuracy: {accuracy_pca_split:.4f}")
results['With PCA and hyper parameter tuning'] = {'train_test': accuracy_pca_split}

Training SVM with PCA features...
PCA train-test accuracy: 0.9003


In [17]:
# 5b) Cross-validation with PCA
print("Running cross-validation with PCA features...")
scores_pca_cv = cross_val_score(clf_pca, X_pca, y, cv=10)
accuracy_pca_cv = scores_pca_cv.mean()
print(f"PCA cross-validation accuracy: {accuracy_pca_cv:.4f}")
results['With PCA and hyper parameter tuning']['cross_val'] = accuracy_pca_cv

Running cross-validation with PCA features...
PCA cross-validation accuracy: 0.9030


## SUMMARY TABLE (SVM Models)

In [18]:
print("ACTIVITY 6: SVM MODELS SUMMARY TABLE")
print(f"{'SVM Model':<40} {'Train-test split':<20} {'Cross-validation':<20}")
print("-" * 80)
for model_name, scores in results.items():
    train_test_acc = scores['train_test'] * 100
    cross_val_acc = scores['cross_val'] * 100
    print(f"{model_name:<40} {train_test_acc:>18.2f}% {cross_val_acc:>18.2f}%")


ACTIVITY 6: SVM MODELS SUMMARY TABLE
SVM Model                                Train-test split     Cross-validation    
--------------------------------------------------------------------------------
Original features                                     91.52%              91.38%
With hyper-parameter tuning                           91.75%              91.58%
With feature selection and hyper parameter tuning              90.89%              90.82%
With PCA and hyper parameter tuning                   90.03%              90.30%


## THER CLASSIFIERS

In [19]:
other_results = {}

# SGD Classifier
print("Training SGD Classifier...")
sgd_clf = SGDClassifier(random_state=1, max_iter=1000)
sgd_clf.fit(X_train_scaled, y_train)
y_pred_sgd = sgd_clf.predict(X_test_scaled)
sgd_train_test = accuracy_score(y_test, y_pred_sgd)
print("SGD cross-validation...")
sgd_cv_scores = cross_val_score(sgd_clf, X_scaled, y, cv=10)
sgd_cv = sgd_cv_scores.mean()
other_results['SGD'] = {'train_test': sgd_train_test, 'cross_val': sgd_cv}
print(f"SGD completed - Train-Test: {sgd_train_test:.4f}, CV: {sgd_cv:.4f}")


Training SGD Classifier...
SGD cross-validation...
SGD completed - Train-Test: 0.9000, CV: 0.8940


In [20]:
# Random Forest Classifier
print("Training Random Forest Classifier...")
rf_clf = RandomForestClassifier(n_estimators=100, random_state=1)
rf_clf.fit(X_train_scaled, y_train)
y_pred_rf = rf_clf.predict(X_test_scaled)
rf_train_test = accuracy_score(y_test, y_pred_rf)
print("Random Forest cross-validation...")
rf_cv_scores = cross_val_score(rf_clf, X_scaled, y, cv=10)
rf_cv = rf_cv_scores.mean()
other_results['RandomForest'] = {'train_test': rf_train_test, 'cross_val': rf_cv}
print(f"Random Forest completed - Train-Test: {rf_train_test:.4f}, CV: {rf_cv:.4f}")

Training Random Forest Classifier...
Random Forest cross-validation...
Random Forest completed - Train-Test: 0.9249, CV: 0.9257


In [21]:
# MLP Classifier
print("Training MLP Classifier...")
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=1)
mlp_clf.fit(X_train_scaled, y_train)
y_pred_mlp = mlp_clf.predict(X_test_scaled)
mlp_train_test = accuracy_score(y_test, y_pred_mlp)
print("MLP cross-validation...")
mlp_cv_scores = cross_val_score(mlp_clf, X_scaled, y, cv=10)
mlp_cv = mlp_cv_scores.mean()
other_results['MLP'] = {'train_test': mlp_train_test, 'cross_val': mlp_cv}
print(f"MLP completed - Train-Test: {mlp_train_test:.4f}, CV: {mlp_cv:.4f}")

Training MLP Classifier...
MLP cross-validation...
MLP completed - Train-Test: 0.8897, CV: 0.8957


In [22]:
# Add tuned SVM results for comparison
other_results['SVM'] = {'train_test': accuracy_tuned_split, 'cross_val': accuracy_tuned_cv}

# Final Summary Table
print("FINAL COMPARISON: ALL CLASSIFIERS")
print(f"{'Model':<15} {'Train-test split':<20} {'Cross-validation':<20}")
print("-" * 55)
for model_name, scores in other_results.items():
    train_test_acc = scores['train_test'] * 100
    cross_val_acc = scores['cross_val'] * 100
    print(f"{model_name:<15} {train_test_acc:>18.2f}% {cross_val_acc:>18.2f}%")

print("ALL ACTIVITIES COMPLETED!")

FINAL COMPARISON: ALL CLASSIFIERS
Model           Train-test split     Cross-validation    
-------------------------------------------------------
SGD                          90.00%              89.40%
RandomForest                 92.49%              92.57%
MLP                          88.97%              89.57%
SVM                          91.75%              91.58%
ALL ACTIVITIES COMPLETED!
