Define label names and feature names

In [1]:
import pandas as pd
import numpy as np

# Target column names: label_1 ... label_4.
LABELS = [f'label_{idx}' for idx in range(1, 5)]
# Input column names: feature_1 ... feature_768.
FEATURES = ['feature_' + str(idx) for idx in range(1, 769)]


Read training, validation and test data

In [None]:
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# All three CSV splits live in the same project folder on Drive.
DATA_DIR = "/content/drive/MyDrive/ML_Project"
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
valid_df = pd.read_csv(f"{DATA_DIR}/valid.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Initialize dictionaries to store data

In [None]:
# Per-split containers keyed by label name: features (x) and targets / predictions (y).
train_x, valid_x, test_x = {}, {}, {}
train_y, valid_y, test_y = {}, {}, {}

Prepare and preprocess the data. For each label, rows whose value for that label is missing are dropped from the training and validation dataframes, and the features are then standardized.


In [None]:
from sklearn.preprocessing import StandardScaler

# Build one standardized train/valid/test feature set per label.
for LBL in LABELS:
    # Keep only the rows in which this label is present; each split is
    # filtered with a mask computed from that same split.
    train_df_new = train_df[train_df[LBL].notna()]
    valid_df_new = valid_df[valid_df[LBL].notna()]

    # Fresh scaler per label, fitted on this label's training rows only and
    # then reused unchanged for the validation and test features.
    scaler = StandardScaler()
    train_x[LBL] = pd.DataFrame(
        scaler.fit_transform(train_df_new.drop(LABELS, axis=1)), columns=FEATURES
    )
    valid_x[LBL] = pd.DataFrame(
        scaler.transform(valid_df_new.drop(LABELS, axis=1)), columns=FEATURES
    )
    test_x[LBL] = pd.DataFrame(
        scaler.transform(test_df.drop(['ID'], axis=1)), columns=FEATURES
    )

    # The feature frames above get a fresh 0..n-1 index, so reset the target
    # index as well to keep x and y aligned row for row after filtering.
    train_y[LBL] = train_df_new[LBL].reset_index(drop=True)
    valid_y[LBL] = valid_df_new[LBL].reset_index(drop=True)


Imports

In [None]:
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report



# For Label 1

In [None]:
# Label handled by every cell in this section.
LBL = "label_1"

## Applying Feature Engineering techniques

### Using Principal Component Analysis(PCA)



Apply PCA on original features

In [None]:
# A fractional n_components with the full SVD solver keeps as many principal
# components as are needed to explain 99% of the training variance.
pca = PCA(svd_solver='full', n_components=0.99)
pca.fit(train_x[LBL])

In [None]:
# Project every split onto the components fitted on the training features.
train_x_trans = pd.DataFrame(pca.transform(train_x[LBL]))
valid_x_trans = pd.DataFrame(pca.transform(valid_x[LBL]))
test_x_trans = pd.DataFrame(pca.transform(test_x[LBL]))

# Report the reduced shapes in train / valid / test order.
for reduced in (train_x_trans, valid_x_trans, test_x_trans):
    print("Shape after feature reduction:", reduced.shape)

Shape after feature reduction: (28520, 572)
Shape after feature reduction: (750, 572)
Shape after feature reduction: (744, 572)


The number of features is reduced to 572.

## Model Selection

### Cross Validation

For Support Vector Machine(SVM) classifier

In [None]:
# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Candidate model: RBF-kernel SVM with otherwise default settings.
svm_classifier = SVC(kernel='rbf')
cv_scores = cross_val_score(
    estimator=svm_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")


Cross-validation scores: [0.93723703 0.93899018 0.93232819 0.93338008 0.93267882 0.93723703
 0.9312763  0.94109397 0.93443198 0.93267882]
Mean accuracy: 0.9351332398316972


For kNN classifier

In [None]:
# Candidate model: k-nearest neighbours with k = 5, scored with the `kf`
# splitter created in the SVM cross-validation cell.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(
    estimator=knn_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")

Cross-validation scores: [0.82573633 0.83660589 0.83204769 0.8411641  0.83520337 0.83695652
 0.82643759 0.83906031 0.83590463 0.82748948]
Mean accuracy: 0.8336605890603087


Since SVM has the highest mean accuracy, SVM classifier is selected as the classification model

## Hyperparameter tuning

### Using Grid search

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf','linear','poly']
}

# Create the SVM model
svm_model = SVC()

# Create Grid Search object
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy')

# Fit the model
grid_search.fit(train_x_trans, train_y[LBL])

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 10, 'kernel': 'rbf'}


C = 10 and kernel = 'rbf' are used for training the SVM model

## Train, Evaluation and Prediction

Train the SVM classifier


In [None]:
# Final model for this label: RBF-kernel SVM with C = 10, fitted on the
# PCA-reduced training features.
classifier = svm.SVC(C=10, kernel='rbf')
classifier.fit(train_x_trans, train_y[LBL])

Prediction and evaluation for valid set.

In [None]:
# Score the fitted classifier on the validation split.
y_predict_valid = classifier.predict(valid_x_trans)
valid_report = classification_report(y_true=valid_y[LBL], y_pred=y_predict_valid)
print(valid_report)

              precision    recall  f1-score   support

           1       0.92      0.92      0.92        13
           2       1.00      1.00      1.00         9
           3       1.00      1.00      1.00        12
           4       1.00      1.00      1.00        16
           5       1.00      0.94      0.97        18
           6       1.00      1.00      1.00         9
           7       0.89      0.94      0.91        17
           8       1.00      0.86      0.92        14
           9       1.00      0.91      0.95        11
          10       1.00      1.00      1.00         8
          11       0.95      1.00      0.97        19
          12       1.00      1.00      1.00         7
          13       1.00      0.91      0.95        11
          14       0.93      0.93      0.93        15
          15       1.00      0.88      0.94        17
          16       1.00      0.93      0.96        14
          17       1.00      1.00      1.00        14
          18       0.95    

Prediction for test dataset

In [None]:
# Predict the test rows and store the result under the current label.
test_predictions = classifier.predict(test_x_trans)
test_y[LBL] = test_predictions

# For Label 2

In [None]:
# Label handled by every cell in this section.
LBL = "label_2"

## Applying Feature Engineering techniques

### Using Principal Component Analysis(PCA)



Apply PCA on original features

In [None]:
# A fractional n_components with the full SVD solver keeps as many principal
# components as are needed to explain 99% of the training variance.
pca = PCA(svd_solver='full', n_components=0.99)
pca.fit(train_x[LBL])

In [None]:
# Project every split onto the components fitted on the training features.
train_x_trans = pd.DataFrame(pca.transform(train_x[LBL]))
valid_x_trans = pd.DataFrame(pca.transform(valid_x[LBL]))
test_x_trans = pd.DataFrame(pca.transform(test_x[LBL]))

# Report the reduced shapes in train / valid / test order.
for reduced in (train_x_trans, valid_x_trans, test_x_trans):
    print("Shape after feature reduction:", reduced.shape)

Shape after feature reduction: (28040, 571)
Shape after feature reduction: (270, 571)
Shape after feature reduction: (744, 571)


The number of features is reduced to 571.

## Model Selection

### Cross Validation

For Support Vector Machine(SVM) classifier

In [None]:
# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Candidate model: RBF-kernel SVM with otherwise default settings.
svm_classifier = SVC(kernel='rbf')
cv_scores = cross_val_score(
    estimator=svm_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")


Cross-validation scores: [0.89300999 0.88801712 0.88445078 0.89728959 0.88552068 0.89586305
 0.89265335 0.88266762 0.89443652 0.89158345]
Mean accuracy: 0.890549215406562


For kNN classifier

In [None]:
# Candidate model: k-nearest neighbours with k = 10, scored with the `kf`
# splitter created in the SVM cross-validation cell.
knn_classifier = KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(
    estimator=knn_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")

Cross-validation scores: [0.83523538 0.83666191 0.83523538 0.84343795 0.83844508 0.83024251
 0.83202568 0.81990014 0.83238231 0.83701854]
Mean accuracy: 0.834058487874465


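For kNN Regressor (its cross-validation score is R^2 rather than accuracy, so it is not directly comparable with the classifiers' scores)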

In [None]:
# Candidate model: k-nearest-neighbours regressor with k = 10.
knn_Regressor = KNeighborsRegressor(n_neighbors=10)

# Perform k-fold cross-validation.
# For a regressor the score is the coefficient of determination (R^2), not
# classification accuracy. It is requested explicitly here so the metric is
# obvious; the values are not directly comparable with the classifiers'
# accuracy scores in the neighbouring cells.
cv_scores = cross_val_score(knn_Regressor, train_x_trans, train_y[LBL], cv=kf, scoring='r2')

# Print the cross-validation scores
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean R^2: {np.mean(cv_scores)}")

Cross-validation scores: [0.77794691 0.76339131 0.70736884 0.7540827  0.76570915 0.75829413
 0.80219981 0.75171115 0.80157323 0.76069942]
Mean accuracy: 0.7642976670032424


For Random Forest classifier

In [None]:
# Shuffled 10-fold splitter with a fixed seed (same settings as the SVM cell).
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Candidate model: random forest of 100 trees with a fixed seed.
random_forest_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
cv_scores = cross_val_score(
    estimator=random_forest_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")

Cross-validation scores: [0.51141227 0.51355207 0.521398   0.53994294 0.53887304 0.51854494
 0.5171184  0.50891583 0.5353067  0.51426534]
Mean accuracy: 0.5219329529243937


Since SVM has the highest mean accuracy, SVM classifier is selected as the classification model

## Hyperparameter tuning

### Using Grid search

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'C': [1,10,100],
    'kernel': ['rbf','linear','poly']
}

# Create the SVM model
svm_model = SVC()

# Create Grid Search object
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy')

# Fit the model
grid_search.fit(train_x_trans, train_y[LBL])

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 10, 'kernel': 'rbf'}


C = 10 and kernel = 'rbf' are used for training the SVM model

## Train, Evaluation and Prediction

Train the SVM classifier


In [None]:
# Final model for this label: RBF-kernel SVM with C = 10, fitted on the
# PCA-reduced training features.
classifier = svm.SVC(C=10, kernel='rbf')
classifier.fit(train_x_trans, train_y[LBL])

Prediction and evaluation for valid set.

In [None]:
# Score the fitted classifier on the validation split.
y_predict_valid = classifier.predict(valid_x_trans)
valid_report = classification_report(y_true=valid_y[LBL], y_pred=y_predict_valid)
print(valid_report)

              precision    recall  f1-score   support

        22.0       0.88      1.00      0.93         7
        23.0       0.72      0.87      0.79        15
        24.0       0.93      1.00      0.96        13
        25.0       0.76      0.68      0.72        19
        26.0       0.91      0.98      0.94        43
        27.0       0.93      0.86      0.89        29
        28.0       0.92      0.88      0.90        25
        29.0       1.00      0.88      0.94        25
        30.0       0.95      0.97      0.96        37
        31.0       0.91      0.97      0.94        32
        35.0       1.00      0.91      0.95        11
        41.0       1.00      0.86      0.92        14

    accuracy                           0.91       270
   macro avg       0.91      0.90      0.90       270
weighted avg       0.91      0.91      0.91       270



Prediction for test dataset

In [None]:
# Predict the test rows and store the result under the current label.
test_predictions = classifier.predict(test_x_trans)
test_y[LBL] = test_predictions

# For Label 3

In [None]:
# Label handled by every cell in this section.
LBL = "label_3"

## Applying Feature Engineering techniques

### Using Principal Component Analysis(PCA)

Apply PCA on original features

In [None]:
# A fractional n_components with the full SVD solver keeps as many principal
# components as are needed to explain 85% of the training variance.
pca = PCA(svd_solver='full', n_components=0.85)
pca.fit(train_x[LBL])

In [None]:
# Project every split onto the components fitted on the training features.
train_x_trans = pd.DataFrame(pca.transform(train_x[LBL]))
valid_x_trans = pd.DataFrame(pca.transform(valid_x[LBL]))
test_x_trans = pd.DataFrame(pca.transform(test_x[LBL]))

# Report the reduced shapes in train / valid / test order.
for reduced in (train_x_trans, valid_x_trans, test_x_trans):
    print("Shape after feature reduction:", reduced.shape)

Shape after feature reduction: (28520, 120)
Shape after feature reduction: (750, 120)
Shape after feature reduction: (744, 120)


The number of features is reduced to 120.

## Model Selection

### Cross Validation

For Support Vector Machine(SVM) classifier

In [None]:
# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Candidate model: RBF-kernel SVM with otherwise default settings.
svm_classifier = SVC(kernel='rbf')
cv_scores = cross_val_score(
    estimator=svm_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")


Cross-validation scores: [0.99368864 0.9898317  0.99053296 0.9943899  0.9943899  0.98948107
 0.99158485 0.98807854 0.9898317  0.99298738]
Mean accuracy: 0.9914796633941092


For kNN classifier

In [None]:
# Candidate model: k-nearest neighbours with k = 5, scored with the `kf`
# splitter created in the SVM cross-validation cell.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(
    estimator=knn_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")

Cross-validation scores: [0.97791024 0.97615708 0.97335203 0.98141655 0.97545582 0.97791024
 0.97685835 0.97545582 0.97826087 0.97826087]
Mean accuracy: 0.9771037868162692


For Logistic Regression

In [None]:
# Candidate model: logistic regression with an iteration cap of 10000,
# scored with the `kf` splitter created in the SVM cross-validation cell.
logistic_regression_classifier = LogisticRegression(max_iter=10000)
cv_scores = cross_val_score(
    estimator=logistic_regression_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")


Cross-validation scores: [0.99158485 0.98842917 0.98702665 0.98913043 0.9898317  0.98702665
 0.98492286 0.98667602 0.98702665 0.98842917]
Mean accuracy: 0.9880084151472651


For XGBoost classifier

In [None]:
# Candidate model: XGBoost classifier with default hyperparameters,
# scored with the `kf` splitter created in the SVM cross-validation cell.
xgb_classifier = xgb.XGBClassifier()
cv_scores = cross_val_score(
    estimator=xgb_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")

Cross-validation scores: [0.98141655 0.97826087 0.98281907 0.98316971 0.98352034 0.98106592
 0.98246844 0.98246844 0.98176718 0.98281907]
Mean accuracy: 0.9819775596072932


Since SVM has the highest mean accuracy, SVM classifier is selected as the classification model

## Hyperparameter tuning

### Using Grid search

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'C' : [0.1, 1, 10],
    'kernel': ['rbf','linear','poly']
}

# Create the SVM model
svm_model = SVC()

# Create Grid Search object
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy')

# Fit the model
grid_search.fit(train_x_trans, train_y[LBL])

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 10, 'kernel': 'rbf'}


C = 10 and kernel = 'rbf' are used for training the SVM model

## Train, Evaluation and Prediction

Train the SVM classifier


In [None]:
# Final model for this label: RBF-kernel SVM with C = 10, fitted on the
# PCA-reduced training features.
classifier = svm.SVC(C=10, kernel='rbf')
classifier.fit(train_x_trans, train_y[LBL])

Prediction and evaluation for valid set.

In [None]:
# Score the fitted classifier on the validation split.
y_predict_valid = classifier.predict(valid_x_trans)
valid_report = classification_report(y_true=valid_y[LBL], y_pred=y_predict_valid)
print(valid_report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       142
           1       1.00      1.00      1.00       608

    accuracy                           1.00       750
   macro avg       0.99      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750



Prediction for test dataset

In [None]:
# Predict the test rows and store the result under the current label.
test_predictions = classifier.predict(test_x_trans)
test_y[LBL] = test_predictions

# For Label 4

In [None]:
# Label handled by every cell in this section.
LBL = "label_4"

## Applying Feature Engineering techniques

### Using Principal Component Analysis(PCA)


Apply PCA on original features

In [None]:
# A fractional n_components with the full SVD solver keeps as many principal
# components as are needed to explain 99% of the training variance.
pca = PCA(svd_solver='full', n_components=0.99)
pca.fit(train_x[LBL])

In [None]:
# Project every split onto the components fitted on the training features.
train_x_trans = pd.DataFrame(pca.transform(train_x[LBL]))
valid_x_trans = pd.DataFrame(pca.transform(valid_x[LBL]))
test_x_trans = pd.DataFrame(pca.transform(test_x[LBL]))

# Report the reduced shapes in train / valid / test order.
for reduced in (train_x_trans, valid_x_trans, test_x_trans):
    print("Shape after feature reduction:", reduced.shape)

Shape after feature reduction: (28520, 572)
Shape after feature reduction: (750, 572)
Shape after feature reduction: (744, 572)


The number of features is reduced to 572.

## Model Selection

### Cross Validation

For Support Vector Machine(SVM) classifier

In [None]:
# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Candidate model: RBF-kernel SVM with otherwise default settings.
svm_classifier = SVC(kernel='rbf')
cv_scores = cross_val_score(
    estimator=svm_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")


Cross-validation scores: [0.94319776 0.94635344 0.94074334 0.95056101 0.95056101 0.94600281
 0.94600281 0.9470547  0.94915849 0.94249649]
Mean accuracy: 0.9462131837307155


For kNN classifier

In [None]:
# Candidate model: k-nearest neighbours with k = 5, scored with the `kf`
# splitter created in the SVM cross-validation cell.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(
    estimator=knn_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")

Cross-validation scores: [0.91690042 0.93197756 0.92356241 0.92952314 0.9302244  0.92671809
 0.92496494 0.92987377 0.92882188 0.9200561 ]
Mean accuracy: 0.9262622720897615


For Random Forest classifier

In [None]:
# Candidate model: random forest of 100 trees with a fixed seed, scored with
# the `kf` splitter created in the SVM cross-validation cell.
random_forest_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
cv_scores = cross_val_score(
    estimator=random_forest_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

# Per-fold scores followed by their mean.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")


Cross-validation scores: [0.69565217 0.707223   0.70021038 0.71388499 0.72265077 0.71423562
 0.71178121 0.71072931 0.71037868 0.69670407]
Mean accuracy: 0.7083450210378681


Since SVM has the highest mean accuracy, SVM classifier is selected as the classification model

## Hyperparameter tuning

### Using Grid search

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'C': [1, 10, 100],
    'kernel': ['rbf','linear','poly']
}

# Create the SVM model
svm_model = SVC()

# Create Grid Search object
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy')

# Fit the model
grid_search.fit(train_x_trans, train_y[LBL])

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 10, 'kernel': 'rbf'}


C = 10 and kernel = 'rbf' are used for training the SVM model

## Train, Evaluation and Prediction

Train the SVM classifier


In [None]:
# Final model for this label: RBF-kernel SVM with C = 10, fitted on the
# PCA-reduced training features.
classifier = svm.SVC(C=10, kernel='rbf')
classifier.fit(train_x_trans, train_y[LBL])

Prediction and evaluation for valid set.

In [None]:
# Score the fitted classifier on the validation split.
y_predict_valid = classifier.predict(valid_x_trans)
valid_report = classification_report(y_true=valid_y[LBL], y_pred=y_predict_valid)
print(valid_report)

              precision    recall  f1-score   support

           0       1.00      0.90      0.95        21
           1       1.00      0.91      0.95        11
           2       1.00      1.00      1.00        27
           3       1.00      1.00      1.00         8
           4       1.00      0.80      0.89        15
           5       1.00      0.82      0.90        11
           6       0.97      1.00      0.98       532
           7       1.00      0.94      0.97        32
           8       0.94      0.84      0.89        19
           9       1.00      0.82      0.90        17
          10       1.00      0.90      0.95        10
          11       1.00      0.91      0.95        11
          12       1.00      0.92      0.96        26
          13       1.00      1.00      1.00        10

    accuracy                           0.97       750
   macro avg       0.99      0.91      0.95       750
weighted avg       0.97      0.97      0.97       750



Prediction for test dataset

In [None]:
# Predict the test rows and store the result under the current label.
test_predictions = classifier.predict(test_x_trans)
test_y[LBL] = test_predictions

In [None]:
# One ID per test row, numbered from 1. The row count is taken from the test
# frame instead of a hard-coded 744, so the cell keeps working if the test
# file changes size.
# NOTE(review): this assumes the expected IDs are 1..N in test-file order, as
# the original hard-coded range did; use test_df['ID'] instead if the file's
# own IDs differ -- TODO confirm.
n_test_rows = len(test_df)
IDs = list(range(1, n_test_rows + 1))

# Build the frame in a single constructor call: ID first, then one prediction
# column per label in LABELS order. Unlike column-wise concatenation, this
# raises if any prediction array's length differs from the ID list rather
# than silently padding with NaN.
output_df = pd.DataFrame({
    'ID': IDs,
    **{label: test_y[label] for label in LABELS},
})


In [None]:
# Write the predictions to the project folder on Drive, omitting the index column.
output_path = '/content/drive/MyDrive/ML_Project/output.csv'
output_df.to_csv(output_path, index=False)