# Loading datasets 

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np
from sklearn.svm import SVC

# Load datasets
print("[info] : Loading datasets..")
train_data = pd.read_csv("train.csv")
test_data=pd.read_csv("test.csv")
test_data.dropna(inplace=True)
validation_data=pd.read_csv("valid.csv")
print("[info] : Datasets loading complete.")

[info] : Loading datasets..
[info] : Datasets loading complete.


# Label 1

## Data preprocessing

In [2]:
print("[info] : Preprocessing data..")
# Prepare data for label 1
label_1_train_data=train_data.drop(columns=["label_2","label_3","label_4"])
label_1_valid_data=validation_data.drop(columns=["label_2","label_3","label_4"])

# Drop rows with missing values
label_1_train_data.dropna(inplace=True)
label_1_valid_data.dropna(inplace=True)

# Separate features and labels
label_1_train_x=label_1_train_data.drop(columns=["label_1"])
label_1_train_y=label_1_train_data["label_1"]
label_1_valid_x=label_1_valid_data.drop(columns=["label_1"])
label_1_valid_y=label_1_valid_data["label_1"]
print("[info] : Preprocessing complete.")

[info] : Preprocessing data..
[info] : Preprocessing complete.


## Apply PCA

In [3]:
print("[info] : Reducing dimensions using PCA..")
no_of_components=300
pca=PCA(n_components=no_of_components)
transformed_label_1_train_x = pca.fit_transform(label_1_train_x)
transformed_label_1_valid_x = pca.fit_transform(label_1_valid_x)
print("[info] : Dimensions reduction complete.")

[info] : Reducing dimensions using PCA..
[info] : Dimensions reduction complete.


## Train the classifier

In [4]:
# Choose Random Forest as the classifier
# classifier = RandomForestClassifier()

# Choose SVM as the classifier
classifier = SVC()
param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.01]
}
grid_search = GridSearchCV(classifier, param_grid, cv=5, n_jobs=-1,verbose=5)

print("[info] : Tuning hyper parameters..")
# Tune hyper parameters
grid_search.fit(transformed_label_1_train_x, label_1_train_y)
print("[info] : Hyper parameters tuning complete.")

# Get the best hyperparameters from the grid search
best_clf = grid_search.best_estimator_

print("[info] : Cross validating trained model..")
# Use Cross-Validation to Validate the Results
cv_scores = cross_val_score(best_clf, transformed_label_1_valid_x, label_1_valid_y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))
print("Cross validation complete")

[info] : Tuning hyper parameters..
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[info] : Hyper parameters tuning complete.
[info] : Cross validating trained model..
Cross-Validation Scores: [0.44       0.45333333 0.52       0.49333333 0.54666667]
Mean CV Score: 0.49066666666666664
Cross validation complete


## Predictions

In [30]:
# Load Test Set and Predict Values
print("Predicting for test data for label 1..")
label_1_test_data=test_data.drop(columns=['ID'])
transformed_label_1_test_x= pca.transform(label_1_test_data)
label_1_y_pred = list(best_clf.predict(transformed_label_1_test_x))
print("Prediction complete.")
print(len(label_1_y_pred))

Predicting for test data for label 1..
Prediction complete.
744


# Label 2

## Data preprocessing

In [6]:
print("[info] : Preprocessing data..")
# Prepare data for label 2
label_2_train_data=train_data.drop(columns=["label_1","label_3","label_4"])
label_2_valid_data=validation_data.drop(columns=["label_1","label_3","label_4"])

# Drop rows with missing values
label_2_train_data.dropna(inplace=True)
label_2_valid_data.dropna(inplace=True)

# Separate features and labels
label_2_train_x=label_2_train_data.drop(columns=["label_2"])
label_2_train_y=label_2_train_data["label_2"]
label_2_valid_x=label_2_valid_data.drop(columns=["label_2"])
label_2_valid_y=label_2_valid_data["label_2"]
print("[info] : Preprocessing complete.")

[info] : Preprocessing data..
[info] : Preprocessing complete.


## Apply PCA

In [7]:
print("[info] : Reducing dimensions using PCA..")
no_of_components=300
pca=PCA(n_components=no_of_components)
transformed_label_2_train_x = pca.fit_transform(label_2_train_x)
transformed_label_2_valid_x = pca.fit_transform(label_2_valid_x)
print("[info] : Dimensions reduction complete.")

[info] : Reducing dimensions using PCA..
[info] : Dimensions reduction complete.


## Train the classifier

In [8]:
# Choose SVM as the classifier
classifier = SVC()
param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.01]
}
grid_search = GridSearchCV(classifier, param_grid, cv=5, n_jobs=-1,verbose=5)

print("[info] : Tuning hyper parameters..")
# Tune hyper parameters
grid_search.fit(transformed_label_2_train_x, label_2_train_y)
print("[info] : Hyper parameters tuning complete.")

# Get the best hyperparameters from the grid search
best_clf = grid_search.best_estimator_

print("[info] : Cross validating trained model..")
# Use Cross-Validation to Validate the Results
cv_scores = cross_val_score(best_clf, transformed_label_2_valid_x, label_2_valid_y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))
print("Cross validation complete")

[info] : Tuning hyper parameters..
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[info] : Hyper parameters tuning complete.
[info] : Cross validating trained model..
Cross-Validation Scores: [0.26351351 0.29931973 0.28571429 0.31972789 0.16326531]
Mean CV Score: 0.26630814487957344
Cross validation complete


## Predictions

In [31]:
# Load Test Set and Predict Values
print("Predicting for test data for label 2..")
label_2_test_data=test_data.drop(columns=['ID'])
transformed_label_2_test_x= pca.transform(label_2_test_data)
label_2_y_pred = list(best_clf.predict(transformed_label_2_test_x).astype(int))
print("Prediction complete.")
print(len(label_2_y_pred))

Predicting for test data for label 2..
Prediction complete.
744


# Label 3

## Data preprocessing

In [10]:
print("[info] : Preprocessing data..")
# Prepare data for label 3
label_3_train_data=train_data.drop(columns=["label_1","label_2","label_4"])
label_3_valid_data=validation_data.drop(columns=["label_1","label_2","label_4"])

# Drop rows with missing values
label_3_train_data.dropna(inplace=True)
label_3_valid_data.dropna(inplace=True)

# Separate features and labels
label_3_train_x=label_3_train_data.drop(columns=["label_3"])
label_3_train_y=label_3_train_data["label_3"]
label_3_valid_x=label_3_valid_data.drop(columns=["label_3"])
label_3_valid_y=label_3_valid_data["label_3"]
print("[info] : Preprocessing complete.")

[info] : Preprocessing data..
[info] : Preprocessing complete.


## Apply PCA

In [11]:
print("[info] : Reducing dimensions using PCA..")
no_of_components=300
pca=PCA(n_components=no_of_components)
transformed_label_3_train_x = pca.fit_transform(label_3_train_x)
transformed_label_3_valid_x = pca.fit_transform(label_3_valid_x)
print("[info] : Dimensions reduction complete.")

[info] : Reducing dimensions using PCA..
[info] : Dimensions reduction complete.


## Train the classifier

In [12]:
# Choose SVM as the classifier
classifier = SVC()
param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.01]
}
grid_search = GridSearchCV(classifier, param_grid, cv=5, n_jobs=-1,verbose=5)

print("[info] : Tuning hyper parameters..")
# Tune hyper parameters
grid_search.fit(transformed_label_3_train_x, label_3_train_y)
print("[info] : Hyper parameters tuning complete.")

# Get the best hyperparameters from the grid search
best_clf = grid_search.best_estimator_

print("[info] : Cross validating trained model..")
# Use Cross-Validation to Validate the Results
cv_scores = cross_val_score(best_clf, transformed_label_3_valid_x, label_3_valid_y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))
print("Cross validation complete")

[info] : Tuning hyper parameters..
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[info] : Hyper parameters tuning complete.
[info] : Cross validating trained model..
Cross-Validation Scores: [0.97333333 0.98666667 0.97333333 0.97333333 0.98666667]
Mean CV Score: 0.9786666666666667
Cross validation complete


## Predictions

In [32]:
# Load Test Set and Predict Values
print("Predicting for test data for label 3..")
label_3_test_data=test_data.drop(columns=['ID'])
transformed_label_3_test_x= pca.transform(label_3_test_data)
label_3_y_pred = list(best_clf.predict(transformed_label_3_test_x))
print("Prediction complete.")
print(len(label_3_y_pred))

Predicting for test data for label 3..
Prediction complete.
744


# Label 4

## Data preprocessing

In [14]:
print("[info] : Preprocessing data..")
# Prepare data for label 4
label_4_train_data=train_data.drop(columns=["label_1","label_2","label_3"])
label_4_valid_data=validation_data.drop(columns=["label_1","label_2","label_3"])

# Drop rows with missing values
label_4_train_data.dropna(inplace=True)
label_4_valid_data.dropna(inplace=True)

# Separate features and labels
label_4_train_x=label_4_train_data.drop(columns=["label_4"])
label_4_train_y=label_4_train_data["label_4"]
label_4_valid_x=label_4_valid_data.drop(columns=["label_4"])
label_4_valid_y=label_4_valid_data["label_4"]
print("[info] : Preprocessing complete.")

[info] : Preprocessing data..
[info] : Preprocessing complete.


## Apply PCA

In [15]:
print("[info] : Reducing dimensions using PCA..")
no_of_components=300
pca=PCA(n_components=no_of_components)
transformed_label_4_train_x = pca.fit_transform(label_4_train_x)
transformed_label_4_valid_x = pca.fit_transform(label_4_valid_x)
print("[info] : Dimensions reduction complete.")

[info] : Reducing dimensions using PCA..
[info] : Dimensions reduction complete.


## Train the classifier

In [16]:
# Choose SVM as the classifier
classifier = SVC()
param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.01]
}
grid_search = GridSearchCV(classifier, param_grid, cv=5, n_jobs=-1,verbose=5)

print("[info] : Tuning hyper parameters..")
# Tune hyper parameters
grid_search.fit(transformed_label_4_train_x, label_4_train_y)
print("[info] : Hyper parameters tuning complete.")

# Get the best hyperparameters from the grid search
best_clf = grid_search.best_estimator_

print("[info] : Cross validating trained model..")
# Use Cross-Validation to Validate the Results
cv_scores = cross_val_score(best_clf, transformed_label_4_valid_x, label_4_valid_y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))
print("Cross validation complete")

[info] : Tuning hyper parameters..
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[info] : Hyper parameters tuning complete.
[info] : Cross validating trained model..
Cross-Validation Scores: [0.74       0.75333333 0.69333333 0.78666667 0.7       ]
Mean CV Score: 0.7346666666666666
Cross validation complete


## Predictions

In [33]:
# Load Test Set and Predict Values
print("Predicting for test data for label 4..")
label_4_test_data=test_data.drop(columns=['ID'])
transformed_label_4_test_x= pca.transform(label_4_test_data)
label_4_y_pred = list(best_clf.predict(transformed_label_4_test_x))
print("Prediction complete.")
print(len(label_4_y_pred))

Predicting for test data for label 4..
Prediction complete.
744


In [35]:
# Create output dataframe
ids=list(range(1,len(label_4_y_pred)+1))
data={"ID":ids,"label_1":label_1_y_pred,"label_2":label_2_y_pred,"label_3":label_3_y_pred,"label_4":label_4_y_pred}
output_dataframe = pd.DataFrame(data)
output_dataframe

Unnamed: 0,ID,label_1,label_2,label_3,label_4
0,1,6,6,6,6
1,2,6,6,6,6
2,3,6,6,6,6
3,4,6,6,6,6
4,5,6,6,6,6
...,...,...,...,...,...
739,740,4,4,4,4
740,741,6,6,6,6
741,742,6,6,6,6
742,743,6,6,6,6
