In [1]:
# Import all needed libraries
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import f1_score
from scipy.stats import uniform, randint

import warnings

# Silence all warnings for the rest of the session so cell outputs stay compact.
# NOTE: this is a blanket filter, so library diagnostics are hidden as well.
warnings.simplefilter("ignore")

In [2]:
# Mount Google Drive into the Colab filesystem (prompts for authorization);
# files on the drive then become readable under the mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:
# Load the train / validation / test splits from Drive, one dataframe per CSV.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/CS4622 - Machine Learning/ML Project/Layer 10/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/CS4622 - Machine Learning/ML Project/Layer 10/valid.csv',
        '/content/drive/MyDrive/Colab Notebooks/CS4622 - Machine Learning/ML Project/Layer 10/test.csv',
    )
)

In [4]:
# Sanity check: size of the training set as (rows, columns).
train_df.shape

(28520, 772)

In [5]:
# Preview the first rows to eyeball the feature_* and label_* columns.
train_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768,label_1,label_2,label_3,label_4
0,-0.027083,0.072947,-0.093659,0.053418,-0.085516,-0.10261,-0.021217,0.016162,-0.184269,0.110335,...,-0.183643,0.091299,-0.037097,0.042607,-0.034361,-0.013748,45,,1,6
1,0.070195,0.228641,-0.13286,-0.077761,-0.054993,-0.210365,0.127747,-0.132385,-0.161366,0.172764,...,-0.123668,0.029626,-0.027345,0.055223,-0.179725,0.136841,45,,1,6
2,0.164312,0.052808,-0.05851,0.104724,-0.025886,-0.101427,-0.047177,0.091298,-0.094569,0.088062,...,0.07541,0.070125,0.043022,0.012972,-0.02892,0.096725,45,,1,6
3,0.02973,0.113737,0.061113,-0.099329,-0.1116,-0.245942,0.08652,0.071996,0.028319,0.20791,...,-0.062511,-0.226912,-0.046011,0.011282,-0.095167,0.039979,45,,1,6
4,0.031364,0.142409,-0.160743,-0.076594,-0.062412,-0.264732,0.079197,0.02606,-0.217023,0.084656,...,-0.193882,0.107297,-0.042355,0.046763,-0.192469,0.006463,45,,1,6


## **Data Preprocessing**

In [6]:
# Count nulls per column, then keep only the columns that actually have some.
null_counts = train_df.isnull().sum()
missing_counts = null_counts[null_counts > 0]
missing_columns = missing_counts.index

# One "<column> : <count>" line per affected column.
print('Missing Columns and Number of Counts')
for column, n_missing in missing_counts.items():
    print(column, n_missing, sep=' : ')

Missing Columns and Number of Counts
label_2 : 480


In [7]:
# Keep independent copies of the three raw splits
# (these copies are not referenced again by the cells shown in this section).
train_data, valid_data, test_data = (
    raw_split.copy() for raw_split in (train_df, valid_df, test_df)
)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_df.describe()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768,label_1,label_2,label_3,label_4
count,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,...,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28040.0,28520.0,28520.0
mean,0.035587,0.078413,-0.040854,0.079167,-0.033728,-0.129528,-0.004608,-0.016071,-0.080182,0.047877,...,-0.121131,0.024537,-0.023062,0.053033,-0.06011,-0.031766,30.498843,27.975107,0.799299,5.997125
std,0.066663,0.05186,0.056119,0.061433,0.055724,0.078193,0.055673,0.067243,0.070111,0.061358,...,0.082048,0.065089,0.063021,0.081957,0.061217,0.074294,17.328389,5.735913,0.400532,2.375567
min,-0.278307,-0.108783,-0.374459,-0.23066,-0.382474,-0.464895,-0.218843,-0.385482,-0.38647,-0.396842,...,-0.400943,-0.261302,-0.271254,-0.383916,-0.350115,-0.296535,1.0,22.0,0.0,0.0
25%,-0.003948,0.044925,-0.074197,0.040072,-0.06926,-0.182223,-0.041694,-0.057925,-0.123943,0.008245,...,-0.172992,-0.015233,-0.067517,-0.006157,-0.099727,-0.08162,15.0,25.0,1.0,6.0
50%,0.03288,0.07656,-0.039102,0.079309,-0.031002,-0.124762,-0.006371,-0.015115,-0.077953,0.046913,...,-0.129561,0.023859,-0.02733,0.050707,-0.056456,-0.031862,30.0,27.0,1.0,6.0
75%,0.068754,0.10938,-0.005096,0.11844,0.004372,-0.072187,0.03063,0.025157,-0.036984,0.086407,...,-0.082763,0.062431,0.016195,0.110795,-0.019083,0.017753,46.0,30.0,1.0,6.0
max,0.537892,0.365866,0.329586,0.397721,0.214701,0.215949,0.461929,0.373033,0.433161,0.333654,...,0.345534,0.487504,0.596317,0.34857,0.334053,0.395296,60.0,61.0,1.0,13.0


Only a small number of missing values **(480)**, all of them in `label_2`, are found in the dataset compared to the total number of rows **(28520)**. Therefore, the rows with a missing `label_2` value are **removed** when preparing the data for `label_2`, since their **impact is negligible**.

In [9]:
# Per-label containers: each dictionary is keyed by the label name and holds the
# scaled feature matrix (x_*) or the target column (y_*) prepared for that label.
x_train = {}
x_valid = {}
x_test = {}

y_train = {}
y_valid = {}
y_test = {}  # never populated in this cell; kept so the name stays defined

# Column groups, built once instead of being repeated on every transform call.
label_columns = ['label_1', 'label_2', 'label_3', 'label_4']
feature_columns = [f'feature_{i}' for i in range(1, 769)]

# Build one scaled train / valid / test set per target label
for target_label in label_columns:

  # label_2 has missing values in the training data, so rows without a label_2
  # value are dropped for that target only; the other targets keep every row.
  if target_label == "label_2":
    train = train_df[train_df['label_2'].notna()]
    valid = valid_df[valid_df['label_2'].notna()]

  else:
    train = train_df
    valid = valid_df

  test = test_df

  # Outlier-tolerant scaling. The scaler is fitted on the training rows only and
  # then reused unchanged for the validation and test rows (no information leak).
  scaler = RobustScaler()

  x_train[target_label] = pd.DataFrame(scaler.fit_transform(train.drop(label_columns, axis=1)), columns=feature_columns)
  # The rebuilt feature frame gets a fresh 0..n-1 index, so the target is
  # re-indexed the same way. Without this, the filtered label_2 target would
  # keep its original (gappy) index and no longer line up row-for-row with
  # the features in any index-based pandas operation.
  y_train[target_label] = train[target_label].reset_index(drop=True)

  x_valid[target_label] = pd.DataFrame(scaler.transform(valid.drop(label_columns, axis=1)), columns=feature_columns)
  y_valid[target_label] = valid[target_label].reset_index(drop=True)

  # The test split carries an ID column instead of label columns.
  x_test[target_label] = pd.DataFrame(scaler.transform(test.drop(["ID"], axis=1)), columns=feature_columns)

# **Label 04**

In [10]:
# Working copies of the prepared splits for the label handled in this section,
# so later cells cannot modify the per-label dictionaries.
current_label = 'label_4'

x_train_df, y_train_df = x_train[current_label].copy(), y_train[current_label].copy()
x_valid_df, y_valid_df = x_valid[current_label].copy(), y_valid[current_label].copy()
x_test_df = x_test[current_label].copy()

## **K-Fold Cross-Validation Approach**
This technique is used to assess machine learning model performance by dividing the dataset into K subsets, training the model K times (each time holding out a different subset for evaluation), and averaging the results. It helps reduce the impact of data-splitting variability, maximizes data utilization, and is useful for model selection.

In [11]:
# Baseline: 5-fold cross-validated accuracy of an SVC with default parameters
# on the scaled training features.
cv_settings = {'cv': 5, 'scoring': 'accuracy'}
scores = cross_val_score(SVC(), x_train_df, y_train_df, **cv_settings)

mean_accuracy, std_accuracy = scores.mean(), scores.std()

# Report the per-fold scores followed by their mean and spread
print('Support Vector Machines')
print('\n')
print("Cross-validation scores: ", scores)
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_accuracy:.2f}")

Support Vector Machines


Cross-validation scores:  [0.92934783 0.92969846 0.9437237  0.95371669 0.93443198]
Mean Accuracy: 0.94
Standard Deviation: 0.01


## **Feature Selection - Select K Best Approach**
This is a common technique for feature selection and visualization in machine learning, helping to identify and focus on the most important patterns and relationships within the data.

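This technique is applied to obtain the top **450** features based on the **F-statistic** (ANOVA) score. This step reduces the dimensionality of the input data. The validation accuracy obtained after performing this technique **(0.948)** is **slightly higher** than the cross-validation accuracy obtained before feature selection **(0.94)**. Note that the two scores come from different setups (a linear-kernel SVC scored on the validation set versus a default SVC scored by 5-fold cross-validation), so the comparison is only indicative. **This suggests that feature selection is useful for building an optimized model**.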

In [12]:
# Rank features by their ANOVA F-score against the target and keep the top 450.
selector = SelectKBest(score_func=f_classif, k=450)
selector.fit(x_train_df, y_train_df)
x_train_1_1 = selector.transform(x_train_df)

# Confirm the reduced dimensionality
print('Shape after SelectKBest Approach : ', x_train_1_1.shape)

# Train a linear-kernel SVC on the selected features and predict the validation
# set, which is reduced with the same (already fitted) selector.
classifier = SVC(kernel='linear').fit(x_train_1_1, y_train_df)
x_valid_selected = selector.transform(x_valid_df)
y_predict = classifier.predict(x_valid_selected)

# Validation metrics; precision and recall are weighted by class support
valid_accuracy = metrics.accuracy_score(y_valid_df, y_predict)
valid_precision = metrics.precision_score(y_valid_df, y_predict, average='weighted')
valid_recall = metrics.recall_score(y_valid_df, y_predict, average='weighted')

print("Accuracy Score: ", valid_accuracy)
print("Precision Score: ", valid_precision)
print("Recall Score: ", valid_recall)

Shape after SelectKBest Approach :  (28520, 450)
Accuracy Score:  0.948
Precision Score:  0.9484329493067523
Recall Score:  0.948


## **Feature Selection - Principal Component Analysis (PCA)**
PCA is a common technique for dimensionality reduction and visualization in machine learning. Rather than selecting a subset of the original features, it projects them onto a smaller set of components that capture most of the variance in the data.

This technique is applied here to reduce the input dimensionality while retaining **97.5%** of the variance, with the aim of improving the model's accuracy score. The number of principal components retained here is **388**.

In [13]:
# Fit PCA on the training features only. A fractional n_components (0.975) is
# passed together with the full SVD solver.
pca = PCA(n_components=0.975, svd_solver='full').fit(x_train_df)

# Project every split onto the components learned from the training data
x_train_df_pca, x_valid_df_pca, x_test_df_pca = (
    pd.DataFrame(pca.transform(split_features))
    for split_features in (x_train_df, x_valid_df, x_test_df)
)

# Number of columns here = number of components retained
print('Shape after PCA: ',x_train_df_pca.shape)

Shape after PCA:  (28520, 388)


# **Build Classifier Models**
Both a Support Vector Classifier model and a Random Forest Classifier model are built and evaluated on the validation accuracy score in order to choose the best model; the Support Vector Classifier is additionally optimized through hyperparameter tuning.

## **Support Vector Machine (SVM)**
Support Vector Machines (SVM) are effective for both linear and non-linear classification tasks. Support Vector Classification (SVC) is a variant of SVM used for classification tasks, where the goal is to separate data points into different classes using a hyperplane while maximizing the margin between the classes.

An SVC model is built on the dataframe obtained after the PCA step. The validation accuracy score obtained here **(0.9413)** is **slightly higher** than the cross-validation accuracy score **(0.94)** obtained on the initial dataframe before dimensionality reduction.

In [15]:
# Linear-kernel SVC (C=1) trained on the PCA-projected training data
classifier = SVC(kernel='linear', C=1).fit(x_train_df_pca, y_train_df)

# Score the model on the PCA-projected validation data
y_valid_pred = classifier.predict(x_valid_df_pca)
valid_accuracy = metrics.accuracy_score(y_valid_df, y_valid_pred)
print("Accuracy Score: ", valid_accuracy)

Accuracy Score:  0.9413333333333334


## **Hyperparameter Tuning**
This approach is used to optimize a machine learning model's performance by finding the best set of hyperparameters. These hyperparameters control the behavior of the model and fine-tuning them can significantly impact the model's accuracy and generalization to new data.

In [None]:
# Search space sampled during tuning.
# - C must be strictly positive for SVC, so 0 is not a usable candidate.
# - class_weight accepts the Python object None (no re-weighting) or 'balanced';
#   the string 'none' is not a valid value.
# Candidates holding an invalid value fail to fit and are scored as NaN, which
# silently wastes search iterations, so only valid values are listed here.
param_dist = {
    'C': [100, 10, 1, 0.1, 0.01],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [1, 2, 3, 4],
    'class_weight': [None, 'balanced'],
}

svm = SVC()

# Randomized search: 10 sampled parameter combinations, each scored by
# 5-fold cross-validated accuracy; random_state fixes which ones are sampled.
random_search = RandomizedSearchCV(
    svm, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42, scoring='accuracy'
)
random_search.fit(x_train_df_pca, y_train_df)

# Keep both the winning parameter set and the estimator refitted with it.
best_params = random_search.best_params_
best_model = random_search.best_estimator_
print("Best Parameters: ", best_params)

best parameters: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 4, 'class_weight': 'balanced', 'C': 100}


## **Support Vector Classifier** model with **Best Hyper Parameters**
This is done to build a classification model that is fine-tuned for the specific problem. Using the tuned hyperparameters has improved the accuracy score **(0.984)** **significantly** compared to the accuracy score **(0.9413)** obtained before hyperparameter tuning.

In [16]:
# SVC configured with the hyperparameters reported by the randomized search
tuned_svc = SVC(kernel='rbf', C=100, gamma='scale', degree=4, class_weight='balanced')
classifier = tuned_svc.fit(x_train_df_pca, y_train_df)

# Validation accuracy of the tuned model
y_valid_pred = classifier.predict(x_valid_df_pca)
valid_accuracy = metrics.accuracy_score(y_valid_df, y_valid_pred)
print("Accuracy Score: ", valid_accuracy)

# Predictions on the PCA-projected test set, kept for the submission file
Prediction_SVC_PCA = classifier.predict(x_test_df_pca)

Accuracy Score:  0.984


## **Random Forest Classifier Model**
This is a classifier model that combines multiple decision trees to make predictions. It is known for its high accuracy and robustness against overfitting.

Here, the **Random Forest Classifier** model is built on the initial dataframe before feature selection. The accuracy score obtained **(0.7666)** is **much lower** than the value obtained from **Support Vector Classifier** model **(0.94)**. So, **Random Forest Classifier model is not the best model for this label**.

In [17]:
# Random Forest (100 trees, fixed seed) trained on the scaled features without
# any dimensionality reduction
forest = RandomForestClassifier(n_estimators=100, random_state=42)
classifier = forest.fit(x_train_df, y_train_df)

# Validation accuracy
y_valid_pred = classifier.predict(x_valid_df)
valid_accuracy = metrics.accuracy_score(y_valid_df, y_valid_pred)
print("Accuracy Score: ", valid_accuracy)

# Test-set predictions from this model
Prediction_RF = classifier.predict(x_test_df)

Accuracy Score:  0.7666666666666667


Here, the **Random Forest Classifier** model is built on the dataframe after feature selection through PCA approach. The accuracy score obtained **(0.736)** is **much lower** compared to the value obtained from **Support Vector Classifier** model after feature selection **(0.9413)**. So, **Random Forest Classifier model is not the best model for this label**.

In [18]:
# Random Forest (100 trees, fixed seed) trained on the PCA-projected features
forest = RandomForestClassifier(n_estimators=100, random_state=42)
classifier = forest.fit(x_train_df_pca, y_train_df)

# Validation accuracy
y_valid_pred = classifier.predict(x_valid_df_pca)
valid_accuracy = metrics.accuracy_score(y_valid_df, y_valid_pred)
print("Accuracy Score: ", valid_accuracy)

# Test-set predictions from this model
Prediction_RF_PCA = classifier.predict(x_test_df_pca)

Accuracy Score:  0.736


## **Output CSV File Generation**
The initial accuracy score is **(0.94)**. The best model chosen is the **Support Vector Classifier model with the best hyperparameters, trained on the data obtained after the PCA step**, because the accuracy score obtained here **(0.984)** is **higher** than the scores obtained from the other models. Therefore, the predictions made by this model are used to create the output CSV file for submission.

In [None]:
# Read-modify-write of the submission file: load the existing solutions CSV,
# set its label_4 column to the tuned SVC predictions, and overwrite the file.
solutions_path = '/content/drive/MyDrive/Colab Notebooks/CS4622 - Machine Learning/ML Project/Layer 10/solutions.csv'
output_df = pd.read_csv(solutions_path).assign(label_4=Prediction_SVC_PCA)
output_df.to_csv(solutions_path, index=False)