Define label name and feature names

In [None]:
import pandas as pd
import numpy as np

# Target column modelled in this notebook.
# NOTE(review): this was annotated as "Speaker age", but the evaluation cells call
# precision_score / recall_score with their default binary averaging, which only
# accepts a two-class target -- confirm what label_3 actually encodes.
LBL = "label_3"

# Every label column present in the CSVs; all of them are dropped to obtain the
# feature matrix.
LABELS = ['label_{}'.format(n) for n in range(1, 5)]

# Feature column names: feature_1 through feature_256.
FEATURES = ['feature_{}'.format(n) for n in range(1, 257)]


Read training, validation, and test data

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset CSVs are readable from the Colab filesystem.
drive.mount('/content/drive')

# Locations of the three dataset splits on the mounted drive.
train_csv_path = "/content/drive/MyDrive/ML_lab1/train.csv"
valid_csv_path = "/content/drive/MyDrive/ML_lab1/valid.csv"
test_csv_path = "/content/drive/MyDrive/ML_lab1/test.csv"

# Load each split into its own DataFrame.
train_df = pd.read_csv(train_csv_path)
valid_df = pd.read_csv(valid_csv_path)
test_df = pd.read_csv(test_csv_path)


Mounted at /content/drive


Initialize dictionaries to store data

In [None]:
# Per-split containers, indexed by label name in later cells:
# *_x hold feature matrices, *_y hold target columns.
# test_y is created here but is not filled anywhere in the visible notebook.
train_x, valid_x, test_x = {}, {}, {}
train_y, valid_y, test_y = {}, {}, {}

Prepare and preprocess the data

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only; validation and test reuse
# the statistics learned there.
scaler = StandardScaler()

# Column names are assigned positionally from FEATURES, so this assumes the
# non-label columns of every CSV appear in feature_1..feature_256 order
# -- TODO confirm against the data files.
train_scaled = scaler.fit_transform(train_df.drop(columns=LABELS))
train_x[LBL] = pd.DataFrame(train_scaled, columns=FEATURES)
train_y[LBL] = train_df[LBL]

valid_scaled = scaler.transform(valid_df.drop(columns=LABELS))
valid_x[LBL] = pd.DataFrame(valid_scaled, columns=FEATURES)
valid_y[LBL] = valid_df[LBL]

# Only features are prepared for the test split; no test target is stored.
test_scaled = scaler.transform(test_df.drop(columns=LABELS))
test_x[LBL] = pd.DataFrame(test_scaled, columns=FEATURES)

# For Label 3

Train SVC classifier

In [None]:
from sklearn import svm

# Baseline model: linear-kernel support vector classifier fitted on all of the
# scaled features for the current label.
classifier = svm.SVC(kernel='linear')
classifier.fit(X=train_x[LBL], y=train_y[LBL])

Evaluate performance of SVC classifier on the validation dataset

In [None]:
from sklearn import metrics

# Predictions of the baseline classifier on the scaled validation features.
y_predict_valid = classifier.predict(valid_x[LBL])

print("SVC Classifier Evaluation on Validation Set:")
# Each scorer is called as scorer(y_true, y_pred) with its default settings.
for metric_label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, scorer(valid_y[LBL], y_predict_valid))


SVC Classifier Evaluation on Validation Set:
Accuracy: 0.9986666666666667
Precision: 1.0
Recall: 0.9983552631578947


Predict label 3 for the test dataset using the SVC classifier

In [None]:
# Test-set predictions from the classifier trained on all scaled features.
# They are captured here because `classifier` is rebound to a new model later on.
baseline_test_features = test_x[LBL]
y_predict_test_before = classifier.predict(baseline_test_features)


## Applying Feature Engineering techniques

### Using SelectKBest

Apply SelectKBest on original features

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Number of top-scoring features to keep.
n_features_to_keep = 20

# Score each scaled feature against the target with f_classif and keep the best
# ones; the selector is fitted on the training split only.
selector = SelectKBest(score_func=f_classif, k=n_features_to_keep)
x_selected = selector.fit_transform(X=train_x[LBL], y=train_y[LBL])

In [None]:
# Wrap the reduced training matrix in a DataFrame.
train_x_trans = pd.DataFrame(x_selected)

# Push validation and test through the already-fitted selector so every split
# keeps the same subset of columns.
valid_selected = selector.transform(valid_x[LBL])
valid_x_trans = pd.DataFrame(valid_selected)
test_selected = selector.transform(test_x[LBL])
test_x_trans = pd.DataFrame(test_selected)

print("Shape after feature reduction:", train_x_trans.shape)

Shape after feature reduction: (28520, 20)


Train the SVC classifier on SelectKBest-selected features


In [None]:
# Fit a fresh linear SVC on the reduced (SelectKBest) training features.
# NOTE: this rebinds `classifier`; the model trained on the full feature set is
# no longer reachable under that name once this cell has run.
classifier = svm.SVC(kernel='linear')
classifier.fit(X=train_x_trans, y=train_y[LBL])

Evaluate performance of SVC classifier on SelectKBest-selected features using validation dataset

In [None]:
# Validation predictions from the classifier trained on the reduced features.
# The "_pca" suffix is a misnomer: these features come from SelectKBest, not PCA.
# The name is kept because the evaluation cell reads it.
reduced_valid_features = valid_x_trans
y_predict_valid_pca = classifier.predict(reduced_valid_features)

In [None]:
print("SVC Classifier Evaluation on SelectKBest-selected features (Validation Set):")
# Same three scores as the baseline evaluation, computed on the predictions made
# from the reduced feature set.
for metric_label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, scorer(valid_y[LBL], y_predict_valid_pca))

SVC Classifier Evaluation on SelectKBest-selected features (Validation Set):
Accuracy: 0.984
Precision: 0.9917491749174917
Recall: 0.9884868421052632


Predict label 3 for the test dataset after feature reduction

In [None]:
# Predict on the reduced *test* features (test_x_trans), not the validation ones,
# so these predictions line up row-for-row with y_predict_test_before and with
# the test feature values written to the output file.
y_predict_test_after = classifier.predict(test_x_trans)

In [None]:
# Number of features that survived selection (width of the reduced test matrix).
n_selected = test_x_trans.shape[1]

# One row per test sample: both sets of predictions, plus the selected-feature
# count repeated on every row.
output_df = pd.DataFrame({
    'Predicted labels before feature engineering': y_predict_test_before,
    'Predicted labels after feature engineering': y_predict_test_after,
    'No of new features': n_selected,
})

# Attach the selected feature values as new_feature_1 .. new_feature_<n_selected>.
for i in range(n_selected):
    column_name = f'new_feature_{i+1}'
    output_df[column_name] = test_x_trans.iloc[:, i]

# Names for the remaining slots up to new_feature_256; these columns get no values.
new_columns = [f'new_feature_{i+1}' for i in range(n_selected, 256)]

# Concatenating a row-less frame that only carries those column names pads the
# output with empty columns.
padding_df = pd.DataFrame(columns=new_columns)
output_df = pd.concat([output_df, padding_df], axis=1)


In [None]:
# Write the results to Google Drive; index=False leaves the row index out of the file.
output_csv_path = '/content/drive/MyDrive/Colab Notebooks/files/190334K_label_3.csv'
output_df.to_csv(output_csv_path, index=False)