In [None]:
# Import all needed libraries
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import uniform, randint

from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC

warnings.filterwarnings("ignore")

In [None]:
# Authorize access to your Google Drive.
# The mount point /content/drive is the root of every CSV path used in this notebook,
# so this cell has to run before any file is read or written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read CSV files into dataframes.
# The three files sit in the same Drive folder, so the folder is spelled out once
# and only the file name varies per dataset.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/CS4622 - Machine Learning/ML Project/Layer 9'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
valid_df = pd.read_csv(f'{DATA_DIR}/valid.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Dimensions of the training set as (rows, columns)
train_df.shape

(28520, 772)

In [None]:
# Preview the first rows of the training set (feature columns followed by the four label columns)
train_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768,label_1,label_2,label_3,label_4
0,0.019301,0.059756,0.081375,0.057481,-0.06844,-0.165913,0.035643,-0.091138,0.021688,0.057158,...,-0.035576,0.127319,0.098128,-0.058787,0.100971,-0.047754,45,,1,6
1,0.049741,0.09003,0.035118,-0.013676,-0.194317,-0.101763,0.085875,-0.081317,0.112418,0.120523,...,0.020538,0.058968,0.029803,0.111324,0.036727,0.031927,45,,1,6
2,0.019212,0.087779,0.093907,-0.033738,-0.141409,-0.062881,-0.071402,-0.006599,0.020372,-0.027777,...,0.119645,-0.040861,0.000548,-0.061003,-0.04245,0.06334,45,,1,6
3,0.070283,0.04904,0.042126,0.122637,-0.056964,-0.1137,0.108454,0.051336,0.08661,0.141578,...,-0.124494,-0.169225,-0.046391,0.148787,0.014616,-0.140644,45,,1,6
4,0.028864,0.165634,0.016302,0.036117,-0.028871,-0.147748,0.05318,0.025071,-0.0042,-0.022183,...,-0.124862,0.044907,0.084005,-0.03845,0.084371,-0.072146,45,,1,6


## **Data Preprocessing**

In [None]:
# Build the null mask once, then report every column that contains at least one
# missing value together with how many values it is missing.
null_mask = train_df.isnull()
missing_columns = train_df.columns[null_mask.any()]
missing_counts = null_mask[missing_columns].sum()

print('Missing Columns and Number of Counts')
for column, count in missing_counts.items():
    print(f'{column} : {count}')

Missing Columns and Number of Counts
label_2 : 480


In [None]:
# Take independent copies of the three loaded dataframes
train_data, valid_data, test_data = (
    frame.copy() for frame in (train_df, valid_df, test_df)
)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the training set
train_df.describe()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768,label_1,label_2,label_3,label_4
count,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,...,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28520.0,28040.0,28520.0,28520.0
mean,-0.015964,0.075453,0.045892,0.04623,-0.068548,-0.090882,0.034405,-0.038091,0.049623,0.036286,...,-0.058015,0.017027,-0.00297,0.053544,0.033636,0.003864,30.498843,27.975107,0.799299,5.997125
std,0.048122,0.047099,0.059531,0.056354,0.062569,0.054895,0.065229,0.058487,0.051186,0.060626,...,0.075779,0.05599,0.055413,0.082085,0.070793,0.059617,17.328389,5.735913,0.400532,2.375567
min,-0.300831,-0.113586,-0.183329,-0.21218,-0.385634,-0.331156,-0.320988,-0.519168,-0.171353,-0.404408,...,-0.343703,-0.401326,-0.355695,-0.390809,-0.470045,-0.242932,1.0,22.0,0.0,0.0
25%,-0.048373,0.046609,0.004613,0.007529,-0.109321,-0.128968,0.001598,-0.073965,0.015015,-0.001981,...,-0.105128,-0.01495,-0.039802,0.00327,-0.006049,-0.038369,15.0,25.0,1.0,6.0
50%,-0.017313,0.073738,0.049874,0.044959,-0.068874,-0.089224,0.039623,-0.034788,0.050111,0.036198,...,-0.069593,0.021222,-0.003279,0.057083,0.036988,0.004224,30.0,27.0,1.0,6.0
75%,0.015003,0.101526,0.087481,0.082385,-0.026185,-0.05201,0.076055,0.002081,0.085094,0.07409,...,-0.028467,0.053432,0.033408,0.109684,0.078949,0.044731,46.0,30.0,1.0,6.0
max,0.31681,0.519338,0.344814,0.443703,0.169458,0.298595,0.309478,0.246881,0.260366,0.270373,...,0.388045,0.215233,0.20544,0.332756,0.269439,0.385349,60.0,61.0,1.0,13.0


Only a small number of rows **(480)** have a missing value, compared to the total number of rows **(28520)**, and all of them are missing `label_2`. Therefore, these rows are **removed** when preparing the data for `label_2`, since their **impact is negligible**.

In [None]:
# Per-label containers, keyed by label name: scaled feature frames (x_*) and targets (y_*).
x_train = {}
x_valid = {}
x_test = {}

y_train = {}
y_valid = {}
y_test = {}

# Loop-invariant pieces, built once instead of on every iteration.
label_columns = ['label_1','label_2','label_3','label_4']
feature_names = [f'feature_{i}' for i in range(1,769)]

# Create dictionaries for each of the labels
for target_label in label_columns:

  # Select only rows with non-null values
  if target_label == "label_2":
    # Dropping rows leaves gaps in the index. The scaled feature frames below are rebuilt
    # from plain arrays and therefore get a fresh 0..n-1 index, so the filtered frames are
    # re-indexed the same way to keep features and targets aligned row by row.
    train = train_df[train_df['label_2'].notna()].reset_index(drop=True)
    valid = valid_df[valid_df['label_2'].notna()].reset_index(drop=True)

  else:
    train = train_df
    valid = valid_df

  test = test_df

  # A data preprocessing technique to scale and transform dataset features while minimizing sensitivity to outliers.
  # The scaler is fitted on the training features only and then reused for validation and test.
  scaler = RobustScaler()

  x_train[target_label] = pd.DataFrame(scaler.fit_transform(train.drop(label_columns, axis=1)), columns=feature_names)
  y_train[target_label] = train[target_label]

  x_valid[target_label] = pd.DataFrame(scaler.transform(valid.drop(label_columns, axis=1)), columns=feature_names)
  y_valid[target_label] = valid[target_label]

  # The test set carries an "ID" column instead of labels.
  x_test[target_label] = pd.DataFrame(scaler.transform(test.drop(["ID"],axis=1)), columns=feature_names)

# **Label 03**

In [None]:
# Working copies of the label_3 splits, so later cells never touch the per-label dictionaries
x_train_df, x_valid_df, x_test_df = (
    splits['label_3'].copy() for splits in (x_train, x_valid, x_test)
)
y_train_df, y_valid_df = (
    targets['label_3'].copy() for targets in (y_train, y_valid)
)

This technique calculates the **mutual information** between the features and the target variable, selects features with a mutual information score >= **0.00002**, and creates new DataFrames with only the **selected features**.

In [None]:
# Calculate mutual information between features and the target variable.
# label_3 is a discrete class label, so the classification estimator is used; the
# regression variant treats the target as a continuous quantity.
# The estimate involves randomness, so the seed is fixed to keep the selected
# feature set the same from one run to the next.
mi_scores = mutual_info_classif(x_train_df, y_train_df, random_state=42)

# Create a DataFrame to display the results
mi_df = pd.DataFrame({'Feature': x_train_df.columns, 'Mutual Information': mi_scores})

# Filter features based on the threshold
selected_features = mi_df[mi_df['Mutual Information'] >= 0.00002]['Feature']

# Create new DataFrames with only the selected features.
# The validation and test frames are taken from the per-label dictionaries rather than
# from themselves, so re-running this cell never indexes an already-filtered frame.
df_features_filtered = x_train_df[selected_features]
x_valid_df = x_valid['label_3'][selected_features]
x_test_df = x_test['label_3'][selected_features]

# Print the selected features
print("Selected Features: ")
print(df_features_filtered.head())


Selected Features: 
   feature_1  feature_2  feature_3  feature_5  feature_6  feature_7  \
0   0.577737  -0.254591   0.380138   0.005221  -0.996516  -0.053450   
1   1.058046   0.296661  -0.178071  -1.508886  -0.162940   0.621191   
2   0.576327   0.255669   0.531372  -0.872486   0.342303  -1.491110   
3   1.382169  -0.449728  -0.093503   0.143257  -0.318051   0.924434   
4   0.728630   1.673346  -0.405133   0.481175  -0.760472   0.182079   

   feature_8  feature_9  feature_10  feature_11  ...  feature_756  \
0  -0.741010  -0.405585    0.275523    0.465842  ...     0.200812   
1  -0.611861   0.889096    1.108491   -1.973027  ...    -0.608459   
2   0.370679  -0.424363   -0.841000    0.336066  ...    -0.922409   
3   1.132521   0.520832    1.385272    1.596740  ...    -0.796414   
4   0.787132  -0.775000   -0.767461   -1.457344  ...     0.188266   

   feature_757  feature_758  feature_761  feature_762  feature_763  \
0     1.857339     0.669453     0.791728    -1.091351     0.443727  

## **K-Fold Cross-Validation Approach**
This technique is used to assess machine learning model performance by dividing the dataset into K subsets, training the model K times, and averaging the results. It helps reduce the impact of data splitting variability, maximizes data utilization, and is useful for model selection.

In [None]:
# Baseline: 5-fold cross-validated accuracy of a default SVC on the filtered features
scores = cross_val_score(
    estimator=SVC(),
    X=df_features_filtered,
    y=y_train_df,
    scoring='accuracy',
    cv=5,
)

# Summarise the five fold scores
mean_accuracy, std_accuracy = scores.mean(), scores.std()

# Report the per-fold scores followed by their mean and spread
print('Support Vector Machines')
print('\n')
print("Cross-validation scores: ", scores)
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_accuracy:.2f}")

Support Vector Machines


Cross-validation scores:  [0.98053997 0.99631837 0.98492286 0.97966339 0.99070827]
Mean Accuracy: 0.99
Standard Deviation: 0.01


## **Feature Selection - Principal Component Analysis (PCA)**
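PCA is a common technique for dimensionality reduction and visualization in machine learning. Rather than selecting a subset of the original features, it projects them onto a smaller set of uncorrelated components that capture the most important patterns and relationships within the data.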

This technique is applied here with the aim of improving the performance of the model by increasing its accuracy score. The number of principal components retained (enough to explain **97.5%** of the variance) is **403**.

In [None]:
# Fit PCA on the filtered training features; a fractional n_components keeps as many
# components as are needed to reach that share of explained variance
pca = PCA(svd_solver='full', n_components=0.975).fit(df_features_filtered)

# Project every split onto the components learned from the training data
x_train_df_pca, x_valid_df_pca, x_test_df_pca = (
    pd.DataFrame(pca.transform(split))
    for split in (df_features_filtered, x_valid_df, x_test_df)
)

# Report the dimensions of the projected training set
print('Shape after PCA: ',x_train_df_pca.shape)

Shape after PCA:  (28520, 403)


# **Build Classifier Models**
Both Support Vector Classifier model and Random Forest Classifier model are built, optimized and evaluated based on the accuracy score obtained to choose the best model

## **Support Vector Machine (SVM)**
Support Vector Machines (SVM) are effective for both linear and non-linear classification tasks. Support Vector Classification (SVC) is a variant of SVM used for classification tasks, where the goal is to separate data points into different classes using a hyperplane while maximizing the margin between the classes.

An **SVC** model is built on the dataframe obtained after the **PCA** approach. The validation accuracy obtained here **(0.996)** is **slightly higher** than the mean 5-fold cross-validation accuracy of the default SVC on the mutual-information-filtered features **(0.99)**.

In [None]:
# Linear-kernel SVC trained on the PCA-projected training features
classifier = SVC(C=1, kernel='linear').fit(x_train_df_pca, y_train_df)

# Score the model on the PCA-projected validation split
y_valid_pred = classifier.predict(x_valid_df_pca)
validation_accuracy = metrics.accuracy_score(y_valid_df, y_valid_pred)
print("Accuracy Score: ", validation_accuracy)

# Keep the test-set predictions of this model for the submission file
Prediction_SVC_PCA = classifier.predict(x_test_df_pca)

Accuracy Score:  0.996


An SVC model is built on the dataframe obtained after **filtering features** based on mutual information (without PCA). The validation accuracy obtained here **(0.996)** is **slightly higher** than the mean 5-fold cross-validation accuracy of the default SVC on the same features **(0.99)**.

In [None]:
# Perform Support Vector classification (SVC) model approach on the
# mutual-information-filtered features (no PCA).
# A fresh estimator is created here instead of re-fitting whatever `classifier`
# currently holds: that name is rebound to a RandomForestClassifier in a later cell,
# so reusing it would make this result depend on the order the cells were run in.
classifier = SVC(kernel='linear', C=1)
classifier.fit(df_features_filtered, y_train_df)
y_valid_pred = classifier.predict(x_valid_df)

# Print the obtained accuracy score after applying SVC approach
print("Accuracy Score: ",metrics.accuracy_score(y_valid_df, y_valid_pred))

Accuracy Score:  0.996


## **Random Forest Classifier Model**
This is a classifier model that combines multiple decision trees to make predictions. It is known for its high accuracy and robustness against overfitting.

Here, the **Random Forest Classifier** model is built on the dataframe after **filtering features** based on mutual information. The accuracy score obtained **(0.964)** is **much lower** than the value obtained from **Support Vector Classifier** model **(0.996)**.

In [None]:
# Random Forest (100 trees, fixed seed) trained on the mutual-information-filtered features
classifier = RandomForestClassifier(random_state=42, n_estimators=100)
classifier.fit(df_features_filtered, y_train_df)

# Score the model on the filtered validation split
y_valid_pred = classifier.predict(x_valid_df)
validation_accuracy = metrics.accuracy_score(y_valid_df, y_valid_pred)
print("Accuracy Score: ", validation_accuracy)

# Keep the test-set predictions of this model
Prediction_RF = classifier.predict(x_test_df)

Accuracy Score:  0.964


## **Hyperparameter tuning** is **not performed** for this label since the **validation accuracy** is already **very high (0.996)**.

## **Output CSV File Generation**
The baseline cross-validation accuracy is **0.99**. The model chosen is the **Support Vector Classifier with the PCA approach**: its validation accuracy **(0.996)** matches that of the SVC trained on the mutual-information-filtered features and is **higher** than that of the Random Forest Classifier **(0.964)**. Therefore, the predictions made by this model are used to create the output CSV file for submission.

In [None]:
# Load the existing submission file, set its label_3 column to the SVC + PCA
# predictions, and write the result back to the same path
solutions_path = '/content/drive/MyDrive/Colab Notebooks/CS4622 - Machine Learning/ML Project/Layer 9/solutions.csv'
output_df = pd.read_csv(solutions_path).assign(label_3=Prediction_SVC_PCA)
output_df.to_csv(solutions_path, index=False)