Define label names and feature names

In [None]:
import pandas as pd
import numpy as np

# Names of the four target columns.
LABELS = [
    'label_1',
    'label_2',
    'label_3',
    'label_4',
]
# Names of the 768 input feature columns: feature_1 ... feature_768.
FEATURES = ['feature_' + str(idx) for idx in range(1, 769)]


Read training, validation and test data

In [None]:
from google.colab import drive
# Mount Google Drive so the project CSV files can be read from Colab.
drive.mount('/content/drive')

# Load the three splits; the order of the paths matches the order of the
# target names on the left-hand side.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/ML_Project_11/train.csv",
        "/content/drive/MyDrive/ML_Project_11/valid.csv",
        "/content/drive/MyDrive/ML_Project_11/test.csv",
    )
)


Mounted at /content/drive


Initialize dictionaries to store data

In [None]:
# Per-label containers keyed by label name. The *_x dicts hold feature frames;
# train_y / valid_y hold the targets and test_y is filled with predictions.
train_x, valid_x, test_x = {}, {}, {}
train_y, valid_y, test_y = {}, {}, {}

Prepare and preprocess the data. For each label, rows whose value for that label is missing are dropped from the training and validation dataframes, and the features are standardized with a scaler fitted on the training rows.


In [None]:
from sklearn.preprocessing import StandardScaler

# One scaler object is reused for every label: `fit_transform` refits it on
# that label's training rows, and the validation/test transforms in the same
# iteration use that fit.
scaler = StandardScaler()
for LBL in LABELS:
  # Keep only the rows where the current label is present.
  train_df_new = train_df[train_df[LBL].notna()]
  valid_df_new = valid_df[valid_df[LBL].notna()]

  # Fit the scaler on the training features only, then apply the same
  # transformation to the validation and test features.
  train_x[LBL] = pd.DataFrame(scaler.fit_transform(train_df_new.drop(LABELS, axis=1)), columns = FEATURES)
  train_y[LBL] = train_df_new[LBL]
  valid_x[LBL] = pd.DataFrame(scaler.transform(valid_df_new.drop(LABELS, axis=1)), columns = FEATURES)
  valid_y[LBL] = valid_df_new[LBL]
  # Only the ID column is dropped from the test set before scaling.
  test_x[LBL] = pd.DataFrame(scaler.transform(test_df.drop(['ID'], axis=1)), columns=FEATURES)


Imports

In [None]:
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# For Label 1

In [None]:
# Label processed by the cells of this section; used as the key into the
# per-label dictionaries.
LBL = 'label_1'

## Applying Feature Engineering techniques

### Using Principal Component Analysis(PCA)

Apply PCA on original features

In [None]:
# Keep as many principal components as are needed to explain 99% of the
# variance; the projection is fitted on this label's training features only.
pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(train_x[LBL])

In [None]:
# Project every split onto the principal components fitted above.
train_x_trans, valid_x_trans, test_x_trans = (
    pd.DataFrame(pca.transform(split[LBL]))
    for split in (train_x, valid_x, test_x)
)

# Report the reduced shapes in train / validation / test order.
for reduced in (train_x_trans, valid_x_trans, test_x_trans):
  print("Shape after feature reduction:", reduced.shape)

Shape after feature reduction: (28520, 386)
Shape after feature reduction: (750, 386)
Shape after feature reduction: (744, 386)


## Model Selection

### Cross Validation

For Support Vector Machine(SVM) classifier

In [None]:
# 10-fold shuffled split with a fixed seed; `kf` is reused by the kNN
# cross-validation cell for this label.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# RBF-kernel SVM scored on the PCA-reduced training features.
svm_classifier = SVC(kernel='rbf')
cv_scores = cross_val_score(
    estimator=svm_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")


Cross-validation scores: [0.89446003 0.90147265 0.9042777  0.89901823 0.90182328 0.90252454
 0.89165498 0.89481066 0.89726508 0.8930575 ]
Mean accuracy: 0.8980364656381488


For kNN classifier

In [None]:
# 5-nearest-neighbour classifier scored on the folds defined by `kf`.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(
    estimator=knn_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")

Cross-validation scores: [0.79593268 0.79172511 0.79067321 0.79978962 0.79523142 0.79698457
 0.78541374 0.77664797 0.79908836 0.79172511]
Mean accuracy: 0.7923211781206172


Since SVM has the highest mean accuracy, SVM classifier is selected as the classification model

## Hyperparameter tuning

### Using Grid search

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'C' : [0.1, 1, 10],
    'kernel': ['rbf','linear','poly']
}

# Create the SVM model
svm_model = SVC()

# Create Grid Search object
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy')

# Fit the model
grid_search.fit(train_x_trans, train_y[LBL])

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 10, 'kernel': 'rbf'}


C = 10 and kernel = 'rbf' are used for training the SVM model

## Train, Evaluation and Prediction

Train the SVM classifier


In [None]:
# Fit the final RBF-kernel SVM (C=10) on the PCA-reduced training features.
classifier = svm.SVC(kernel='rbf',C=10)
classifier.fit(train_x_trans, train_y[LBL])

Prediction and evaluation for valid set.

In [None]:
# Predict on the validation split and print the per-class
# precision / recall / F1 report.
y_predict_valid = classifier.predict(valid_x_trans)
print(classification_report(y_true=valid_y[LBL], y_pred=y_predict_valid))

              precision    recall  f1-score   support

           1       0.93      1.00      0.96        13
           2       1.00      0.89      0.94         9
           3       0.86      1.00      0.92        12
           4       0.93      0.88      0.90        16
           5       0.94      0.83      0.88        18
           6       1.00      0.89      0.94         9
           7       0.89      0.94      0.91        17
           8       0.92      0.86      0.89        14
           9       1.00      0.91      0.95        11
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00        19
          12       1.00      1.00      1.00         7
          13       0.91      0.91      0.91        11
          14       1.00      0.93      0.97        15
          15       0.94      0.88      0.91        17
          16       1.00      1.00      1.00        14
          17       1.00      0.93      0.96        14
          18       1.00    

Prediction for test dataset

In [None]:
# Store the test-set predictions for this label; they are collected into the
# output file at the end of the notebook.
test_y[LBL] = classifier.predict(test_x_trans)

# For Label 2

In [None]:
# Label processed by the cells of this section; used as the key into the
# per-label dictionaries.
LBL = 'label_2'

## Applying Feature Engineering techniques

### Using Principal Component Analysis(PCA)



Apply PCA on original features

In [None]:
# Keep as many principal components as are needed to explain 99% of the
# variance; the projection is fitted on this label's training features only.
pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(train_x[LBL])

In [None]:
# Project every split onto the principal components fitted above.
train_x_trans, valid_x_trans, test_x_trans = (
    pd.DataFrame(pca.transform(split[LBL]))
    for split in (train_x, valid_x, test_x)
)

# Report the reduced shapes in train / validation / test order.
for reduced in (train_x_trans, valid_x_trans, test_x_trans):
  print("Shape after feature reduction:", reduced.shape)

Shape after feature reduction: (28040, 385)
Shape after feature reduction: (270, 385)
Shape after feature reduction: (744, 385)


## Model Selection

### Cross Validation

For SVM classifier

In [None]:
# 10-fold shuffled split with a fixed seed; `kf` is reused by the following
# cross-validation cells for this label.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# RBF-kernel SVM scored on the PCA-reduced training features.
svm_classifier = SVC(kernel='rbf')
cv_scores = cross_val_score(
    estimator=svm_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")


Cross-validation scores: [0.82667618 0.80563481 0.80563481 0.82275321 0.81134094 0.81669044
 0.79957204 0.79671897 0.81811698 0.81241084]
Mean accuracy: 0.8115549215406563


For kNN classifier

In [None]:
# 10-nearest-neighbour classifier scored on the folds defined by `kf`.
knn_classifier = KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(
    estimator=knn_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")

Cross-validation scores: [0.80527817 0.80099857 0.79315264 0.80563481 0.79350927 0.79992867
 0.7853067  0.79136947 0.79386591 0.80278174]
Mean accuracy: 0.7971825962910128


For kNN Regressor

In [None]:
# 10-nearest-neighbour regressor, treating the label as a numeric target.
knn_Regressor = KNeighborsRegressor(n_neighbors=10)

# NOTE: no `scoring` argument is given, so cross_val_score uses the
# estimator's own `score` method. For a regressor that is the R^2 coefficient
# of determination, not classification accuracy, so these numbers are not
# directly comparable with the accuracies printed for the classifiers.
cv_scores = cross_val_score(knn_Regressor, train_x_trans, train_y[LBL], cv=kf)

# Print the cross-validation scores
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean R^2 score: {np.mean(cv_scores)}")

Cross-validation scores: [0.76794367 0.76566956 0.6868348  0.73104268 0.79017328 0.74575592
 0.75141086 0.72771798 0.73703345 0.7025901 ]
Mean accuracy: 0.740617230505225


For Random Forest classifier

In [None]:
# Re-create the 10-fold shuffled split with the same fixed seed.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Random forest of 100 trees with a fixed seed, scored on those folds.
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(
    estimator=random_forest_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")

Cross-validation scores: [0.59522111 0.58059914 0.60877318 0.60805991 0.60128388 0.60271041
 0.57774608 0.59058488 0.58987161 0.58059914]
Mean accuracy: 0.5935449358059914


Since SVM has the highest mean accuracy, SVM classifier is selected as the classification model

## Hyperparameter tuning

### Using Grid search

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'C': [100,1000,10000],
    'kernel': ['rbf','linear']
}

# Create the SVM model
svm_model = SVC()

# Create Grid Search object
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy')

# Fit the model
grid_search.fit(train_x_trans, train_y[LBL])

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

C = 1000 and kernel = 'rbf' are used for training the SVM model

## Train, Evaluation and Prediction

Train the SVM classifier


In [None]:
# NOTE(review): these assignments replace the PCA-reduced frames with the
# standardised features from train_x / valid_x / test_x (no PCA applied), so
# the cells below train and predict on a different feature set than the one
# used for the cross-validation and grid search above. Confirm this is
# intentional.
train_x_trans = train_x[LBL]
valid_x_trans = valid_x[LBL]
test_x_trans = test_x[LBL]

In [None]:
# Fit an RBF-kernel SVM with C=1000 on this label's training data.
classifier = svm.SVC(kernel='rbf',C =1000)
classifier.fit(train_x_trans, train_y[LBL])

In [None]:
# Comparison model only: same kernel with C=10. It is kept under its own name
# so that running the notebook top to bottom does not replace `classifier`,
# the C=1000 model the text above selects, before the test-set prediction.
# To inspect this baseline, call baseline_classifier.predict(valid_x_trans).
baseline_classifier = svm.SVC(kernel='rbf', C=10)
baseline_classifier.fit(train_x_trans, train_y[LBL])

Prediction and evaluation for valid set.

In [None]:
# Predict on the validation split and print the per-class
# precision / recall / F1 report.
y_predict_valid = classifier.predict(valid_x_trans)
print(classification_report(y_true=valid_y[LBL], y_pred=y_predict_valid))

              precision    recall  f1-score   support

        22.0       1.00      0.86      0.92         7
        23.0       0.71      0.80      0.75        15
        24.0       0.93      1.00      0.96        13
        25.0       0.90      0.95      0.92        19
        26.0       0.89      0.93      0.91        43
        27.0       0.97      0.97      0.97        29
        28.0       0.92      0.88      0.90        25
        29.0       1.00      1.00      1.00        25
        30.0       1.00      0.95      0.97        37
        31.0       0.91      0.94      0.92        32
        35.0       1.00      0.91      0.95        11
        41.0       1.00      0.86      0.92        14

    accuracy                           0.93       270
   macro avg       0.93      0.92      0.93       270
weighted avg       0.93      0.93      0.93       270



In [None]:
# Same evaluation code as the previous validation cell: it reports on
# whichever model `classifier` refers to when this cell is run.
y_predict_valid = classifier.predict(valid_x_trans)
print(classification_report(valid_y[LBL], y_predict_valid))

              precision    recall  f1-score   support

        22.0       1.00      0.86      0.92         7
        23.0       0.62      0.87      0.72        15
        24.0       0.87      1.00      0.93        13
        25.0       0.94      0.79      0.86        19
        26.0       0.89      0.95      0.92        43
        27.0       0.93      0.93      0.93        29
        28.0       0.96      0.88      0.92        25
        29.0       1.00      0.96      0.98        25
        30.0       1.00      0.95      0.97        37
        31.0       0.88      0.94      0.91        32
        35.0       1.00      0.91      0.95        11
        41.0       1.00      0.79      0.88        14

    accuracy                           0.91       270
   macro avg       0.92      0.90      0.91       270
weighted avg       0.93      0.91      0.92       270



Prediction for test dataset

In [None]:
# Store the test-set predictions for this label; they are collected into the
# output file at the end of the notebook.
test_y[LBL] = classifier.predict(test_x_trans)

In [None]:
# Repeats the test-set prediction with the current `classifier`, overwriting
# the value already stored in test_y[LBL].
test_y[LBL] = classifier.predict(test_x_trans)

# For Label 3

In [None]:
# Label processed by the cells of this section; used as the key into the
# per-label dictionaries.
LBL = 'label_3'

## Applying Feature Engineering techniques

### Using Principal Component Analysis(PCA)



Apply PCA on original features

In [None]:
# Keep as many principal components as are needed to explain 98% of the
# variance; the projection is fitted on this label's training features only.
pca = PCA(n_components=0.98, svd_solver='full')
pca.fit(train_x[LBL])

In [None]:
# Project every split onto the principal components fitted above.
train_x_trans, valid_x_trans, test_x_trans = (
    pd.DataFrame(pca.transform(split[LBL]))
    for split in (train_x, valid_x, test_x)
)

# Report the reduced shapes in train / validation / test order.
for reduced in (train_x_trans, valid_x_trans, test_x_trans):
  print("Shape after feature reduction:", reduced.shape)

Shape after feature reduction: (28520, 282)
Shape after feature reduction: (750, 282)
Shape after feature reduction: (744, 282)


## Model Selection

### Cross Validation

For SVM classifier

In [None]:
# 10-fold shuffled split with a fixed seed; `kf` is reused by the following
# cross-validation cells for this label.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# RBF-kernel SVM scored on the PCA-reduced training features.
svm_classifier = SVC(kernel='rbf')
cv_scores = cross_val_score(
    estimator=svm_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")


Cross-validation scores: [0.9898317  0.99123422 0.99018233 0.99298738 0.99053296 0.98842917
 0.99263675 0.98772791 0.99053296 0.99123422]
Mean accuracy: 0.9905329593267883


For kNN classifier

In [None]:
# 5-nearest-neighbour classifier scored on the folds defined by `kf`.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(
    estimator=knn_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")

Cross-validation scores: [0.98036466 0.98246844 0.97370266 0.97650771 0.98281907 0.97685835
 0.97931276 0.97159888 0.9730014  0.97545582]
Mean accuracy: 0.9772089761570827


For XGBoost classifier

In [None]:
# XGBoost classifier with its default settings, scored on the folds
# defined by `kf`.
xgb_classifier = xgb.XGBClassifier()
cv_scores = cross_val_score(
    estimator=xgb_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")

Cross-validation scores: [0.98667602 0.98597475 0.98597475 0.98316971 0.98913043 0.98352034
 0.98702665 0.98562412 0.98352034 0.98281907]
Mean accuracy: 0.9853436185133239


Since SVM has the highest mean accuracy, SVM classifier is selected as the classification model

## Hyperparameter tuning

### Using Grid search

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'C' : [1, 10],
    'kernel': ['rbf','linear','poly']
}

# Create the SVM model
svm_model = SVC()

# Create Grid Search object
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy')

# Fit the model
grid_search.fit(train_x_trans, train_y[LBL])

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 10, 'kernel': 'rbf'}


C = 10 and kernel = 'rbf' are used for training the SVM model

## Train, Evaluation and Prediction

Train the SVM classifier


In [None]:
# Fit the final RBF-kernel SVM (C=10) on the PCA-reduced training features.
classifier = svm.SVC(kernel='rbf', C = 10)
classifier.fit(train_x_trans, train_y[LBL])

Prediction and evaluation for valid set.

In [None]:
# Predict on the validation split and print the per-class
# precision / recall / F1 report.
y_predict_valid = classifier.predict(valid_x_trans)
print(classification_report(y_true=valid_y[LBL], y_pred=y_predict_valid))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       142
           1       1.00      1.00      1.00       608

    accuracy                           1.00       750
   macro avg       0.99      0.99      0.99       750
weighted avg       1.00      1.00      1.00       750



Prediction for test dataset

In [None]:
# Store the test-set predictions for this label; they are collected into the
# output file at the end of the notebook.
test_y[LBL] = classifier.predict(test_x_trans)

# For Label 4

In [None]:
# Label processed by the cells of this section; used as the key into the
# per-label dictionaries.
LBL = 'label_4'

## Applying Feature Engineering techniques

### Using Principal Component Analysis(PCA)



Apply PCA on original features

In [None]:
# Keep as many principal components as are needed to explain 99% of the
# variance; the projection is fitted on this label's training features only.
pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(train_x[LBL])

In [None]:
# Project every split onto the principal components fitted above.
train_x_trans, valid_x_trans, test_x_trans = (
    pd.DataFrame(pca.transform(split[LBL]))
    for split in (train_x, valid_x, test_x)
)

# Report the reduced shapes in train / validation / test order.
for reduced in (train_x_trans, valid_x_trans, test_x_trans):
  print("Shape after feature reduction:", reduced.shape)

Shape after feature reduction: (28520, 386)
Shape after feature reduction: (750, 386)
Shape after feature reduction: (744, 386)


## Model Selection

### Cross Validation

For SVM classifier

In [None]:
# 10-fold shuffled split with a fixed seed; `kf` is reused by the following
# cross-validation cells for this label.
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# RBF-kernel SVM scored on the PCA-reduced training features.
svm_classifier = SVC(kernel='rbf')
cv_scores = cross_val_score(
    estimator=svm_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")


Cross-validation scores: [0.89726508 0.8997195  0.88920056 0.90112202 0.90813464 0.89761571
 0.90007013 0.89936886 0.90112202 0.88990182]
Mean accuracy: 0.898352033660589


For kNN classifier

In [None]:
# 5-nearest-neighbour classifier scored on the folds defined by `kf`.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(
    estimator=knn_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")

Cross-validation scores: [0.90287518 0.90462833 0.90147265 0.914446   0.91409537 0.91654979
 0.90462833 0.90077139 0.91409537 0.90077139]
Mean accuracy: 0.9074333800841515


For Random Forest classifier

In [None]:
# Random forest of 100 trees with a fixed seed, scored on the folds
# defined by `kf`.
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(
    estimator=random_forest_classifier,
    X=train_x_trans,
    y=train_y[LBL],
    cv=kf,
)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")


Cross-validation scores: [0.69775596 0.70967742 0.70336606 0.7173913  0.72615708 0.72019635
 0.71774194 0.71248247 0.71563815 0.69915849]
Mean accuracy: 0.7119565217391305


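Note: for this label kNN has the highest mean cross-validation accuracy (0.907, versus 0.898 for the SVM with default parameters). The SVM classifier is nevertheless selected as the classification model and tuned below.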

## Hyperparameter tuning

### Using Grid search

In [None]:
# Define the parameter grid for grid search
param_grid = {
    'C': [1, 10, 100],
    'kernel': ['rbf','linear','poly']
}

# Create the SVM model
svm_model = SVC()

# Create Grid Search object
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy')

# Fit the model
grid_search.fit(train_x_trans, train_y[LBL])

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 10, 'kernel': 'rbf'}


C = 10 and kernel = 'rbf' are used for training the SVM model

## Train, Evaluation and Prediction

Train the SVM classifier


In [None]:
# Fit the final RBF-kernel SVM (C=10) on the PCA-reduced training features.
classifier = svm.SVC(kernel='rbf', C = 10)
classifier.fit(train_x_trans, train_y[LBL])

Prediction and evaluation for valid set.

In [None]:
# Predict on the validation split and print the per-class
# precision / recall / F1 report.
y_predict_valid = classifier.predict(valid_x_trans)
print(classification_report(y_true=valid_y[LBL], y_pred=y_predict_valid))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       1.00      0.91      0.95        11
           2       1.00      0.93      0.96        27
           3       1.00      1.00      1.00         8
           4       0.90      0.60      0.72        15
           5       0.91      0.91      0.91        11
           6       0.95      1.00      0.97       532
           7       1.00      0.88      0.93        32
           8       1.00      0.84      0.91        19
           9       1.00      0.82      0.90        17
          10       1.00      0.90      0.95        10
          11       1.00      0.91      0.95        11
          12       1.00      0.81      0.89        26
          13       1.00      0.80      0.89        10

    accuracy                           0.96       750
   macro avg       0.98      0.88      0.92       750
weighted avg       0.96      0.96      0.96       750



Prediction for test dataset

In [None]:
# Store the test-set predictions for this label; they are collected into the
# output file at the end of the notebook.
test_y[LBL] = classifier.predict(test_x_trans)

In [None]:
# Build the submission table: the IDs taken from the test set, followed by one
# column of predictions per label in LABELS order. Reading the IDs from
# `test_df` avoids hard-coding the number of test rows, and building the frame
# in one step raises an error on a length mismatch instead of padding with NaN.
IDs = test_df['ID'].tolist()
output_df = pd.DataFrame({
    'ID': IDs,
    **{label: test_y[label] for label in LABELS},
})


In [None]:
# Write the predictions to Google Drive without the DataFrame index column.
output_df.to_csv('/content/drive/MyDrive/ML_Project_11/output.csv', index=False)