Mounting drive

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# and written through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

Importing datasets

In [None]:
# Read the three splits from the mounted Drive folder in a single pass.
split_paths = (
    '/content/drive/MyDrive/lab1/train.csv',
    '/content/drive/MyDrive/lab1/valid.csv',
    '/content/drive/MyDrive/lab1/test.csv',
)
train, valid, test = map(pd.read_csv, split_paths)

# Preview the first rows of the training split.
train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,label_1,label_2,label_3,label_4
0,-1.928826,-0.29154,1.105909,2.070652,0.657838,-0.67094,-0.968238,0.049916,1.111149,-1.834592,...,-0.691538,-1.152522,0.441157,-1.494434,-0.344974,-0.395641,45,,1,6
1,-1.709277,-1.008998,0.956611,4.313823,-0.669455,1.928765,0.16255,0.152173,2.356505,-2.898549,...,-0.668235,-0.532695,0.151163,-1.876885,1.651534,-2.70649,45,,1,6
2,-1.906183,-0.357562,-0.682627,4.651838,-0.88494,1.159512,1.354481,-1.432248,2.160144,-3.175735,...,-0.026728,0.440408,2.745906,-0.680754,-0.643588,-2.587203,45,,1,6
3,0.697872,-2.218567,-0.572214,1.192062,0.329554,0.741364,0.962354,0.993512,2.896854,-1.60038,...,0.709032,0.812106,1.055266,-0.915258,-0.924856,-1.318964,45,,1,6
4,-1.976895,-0.496308,0.368102,3.739787,-0.132058,1.23498,-0.501003,0.881554,1.630941,-3.853953,...,-0.5997,-1.024829,1.869753,-2.49857,3.268253,-1.356606,45,,1,6


Preprocessing Data

In [None]:
# Drop training rows whose target label is missing: a row without label_4
# cannot be used to fit a label_4 classifier.
columns_to_check = ['label_4']
train = train.dropna(subset=columns_to_check, how='any')

# Impute missing *feature* values only. Label columns are deliberately left
# untouched, because replacing a missing class label with a column mean would
# invent a label that does not exist.
feature_columns = [col for col in train.columns if str(col).startswith('feature_')]

# The fill values come from the training split alone so that no statistics
# leak from the validation/test splits into preprocessing.
feature_means = train[feature_columns].mean()

# fillna returns a new frame, so the result has to be bound back to the
# train/valid/test names that the following cells read from.
train = train.fillna(feature_means)
valid = valid.fillna(feature_means)
test = test.fillna(feature_means)

# Name -> frame lookup of the imputed splits.
datasets = {'train': train, 'valid': valid, 'test': test}

train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,label_1,label_2,label_3,label_4
0,-1.928826,-0.29154,1.105909,2.070652,0.657838,-0.67094,-0.968238,0.049916,1.111149,-1.834592,...,-0.691538,-1.152522,0.441157,-1.494434,-0.344974,-0.395641,45,,1,6
1,-1.709277,-1.008998,0.956611,4.313823,-0.669455,1.928765,0.16255,0.152173,2.356505,-2.898549,...,-0.668235,-0.532695,0.151163,-1.876885,1.651534,-2.70649,45,,1,6
2,-1.906183,-0.357562,-0.682627,4.651838,-0.88494,1.159512,1.354481,-1.432248,2.160144,-3.175735,...,-0.026728,0.440408,2.745906,-0.680754,-0.643588,-2.587203,45,,1,6
3,0.697872,-2.218567,-0.572214,1.192062,0.329554,0.741364,0.962354,0.993512,2.896854,-1.60038,...,0.709032,0.812106,1.055266,-0.915258,-0.924856,-1.318964,45,,1,6
4,-1.976895,-0.496308,0.368102,3.739787,-0.132058,1.23498,-0.501003,0.881554,1.630941,-3.853953,...,-0.5997,-1.024829,1.869753,-2.49857,3.268253,-1.356606,45,,1,6


Separate labels and features

In [None]:
def separate_features_labels(data, target='label_4',
                             label_columns=('label_1', 'label_2', 'label_3', 'label_4')):
    """Split a dataset into its feature columns and one target-label column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns together with the label columns.
    target : str, default 'label_4'
        Label column returned as the prediction target.
    label_columns : sequence of str, default ('label_1', ..., 'label_4')
        Columns removed from ``data`` to obtain the features.

    Returns
    -------
    features : pandas.DataFrame
        ``data`` without any of ``label_columns``.
    labels : pandas.DataFrame
        Single-column frame containing ``target``.

    Raises
    ------
    KeyError
        If ``target`` or any of ``label_columns`` is not a column of ``data``.
    """
    # drop returns a new frame; the caller's data is not modified.
    features = data.drop(columns=list(label_columns))
    # Double brackets keep the target as a one-column DataFrame.
    labels = data[[target]]
    return features, labels

# Apply the same feature/label split to every dataset in one pass.
(train_X, train_y), (valid_X, valid_y), (test_X, test_y) = (
    separate_features_labels(split) for split in (train, valid, test)
)

# Preview the training features.
train_X.head()


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_247,feature_248,feature_249,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256
0,-1.928826,-0.29154,1.105909,2.070652,0.657838,-0.67094,-0.968238,0.049916,1.111149,-1.834592,...,2.029255,1.007398,-0.317517,-0.148447,-0.691538,-1.152522,0.441157,-1.494434,-0.344974,-0.395641
1,-1.709277,-1.008998,0.956611,4.313823,-0.669455,1.928765,0.16255,0.152173,2.356505,-2.898549,...,1.260712,-1.55912,-1.369495,-1.253272,-0.668235,-0.532695,0.151163,-1.876885,1.651534,-2.70649
2,-1.906183,-0.357562,-0.682627,4.651838,-0.88494,1.159512,1.354481,-1.432248,2.160144,-3.175735,...,0.223475,-0.136551,-2.210191,-1.46333,-0.026728,0.440408,2.745906,-0.680754,-0.643588,-2.587203
3,0.697872,-2.218567,-0.572214,1.192062,0.329554,0.741364,0.962354,0.993512,2.896854,-1.60038,...,0.970327,-0.718073,-2.546477,-2.172275,0.709032,0.812106,1.055266,-0.915258,-0.924856,-1.318964
4,-1.976895,-0.496308,0.368102,3.739787,-0.132058,1.23498,-0.501003,0.881554,1.630941,-3.853953,...,1.291582,-0.739541,-1.097638,-0.574205,-0.5997,-1.024829,1.869753,-2.49857,3.268253,-1.356606


# 1. Label 4 without feature engineering

Duplicate the labels and features

In [None]:
# Work on independent copies so the baseline experiment leaves the original
# feature and label frames untouched.
train_X_copy, valid_X_copy, test_X_copy = (
    frame.copy() for frame in (train_X, valid_X, test_X)
)
train_y_copy, valid_y_copy, test_y_copy = (
    frame.copy() for frame in (train_y, valid_y, test_y)
)

Standardization

In [None]:
# Fit the scaler on the training split only, then apply that same scaling
# to all three splits.
scaler = StandardScaler()
scaler.fit(train_X_copy)
train_X_copy, valid_X_copy, test_X_copy = [
    scaler.transform(split) for split in (train_X_copy, valid_X_copy, test_X_copy)
]

In [None]:
# Baseline classifier: KNN with default hyper-parameters on the standardized features.
knn_model = KNeighborsClassifier()
# train_y_copy is a single-column frame; flatten it to shape (n_samples,) so
# scikit-learn receives the 1-D target it expects and does not have to
# convert a column vector itself (which it reports with a warning).
knn_model.fit(train_X_copy, np.ravel(train_y_copy))

  return self._fit(X, y)


Evaluation

In [None]:
# Labelled splits the baseline model is scored on, keyed by the name that
# appears in the printed report.
datasets = {
    'train': (train_X_copy, train_y_copy),
    'valid': (valid_X_copy, valid_y_copy),
}

def evaluate_and_print_metrics(y_true, y_pred, prefix, model_name="KNN"):
    """Print accuracy, weighted precision and weighted recall for one split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    prefix : str
        Name of the split, inserted into the report header.
    model_name : str, default "KNN"
        Name of the model, inserted into the report header.

    Returns
    -------
    dict
        The unrounded scores under the keys 'accuracy', 'precision' and
        'recall', so callers can reuse them instead of parsing the printout.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=1 fixes the value used when a precision term would
    # otherwise divide by zero.
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_true, y_pred, average='weighted')

    print(f"Metrics for {model_name} on {prefix} dataset:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print("\n")

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall}

# Score the baseline model on every labelled split.
for data_name in datasets:
    X, y = datasets[data_name]
    y_pred = knn_model.predict(X)
    evaluate_and_print_metrics(y, y_pred, data_name)

# Baseline predictions for the test split, kept for the final CSV export.
y_pred_before_test = knn_model.predict(test_X_copy)

Metrics for KNN on train dataset:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00


Metrics for KNN on valid dataset:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99




# 2. Label 4 with feature engineering

Identifying highly correlated features

In [None]:
# Pairwise correlation between all feature columns of the training split.
correlation_matrix = train_X.corr()

# Two features count as highly correlated above this absolute correlation.
correlation_threshold = 0.5

# Boolean matrix marking every pair whose absolute correlation exceeds the threshold.
exceeds_threshold = correlation_matrix.abs().to_numpy() > correlation_threshold

# Keep only the strict upper triangle (k=1 drops the diagonal and everything
# below it) so each pair is considered once and no feature is matched with itself.
mask = np.triu(exceeds_threshold, k=1)

# A column is flagged when it is highly correlated with at least one column
# that appears before it.
flagged_columns = mask.any(axis=0)
highly_correlated = set(correlation_matrix.columns[flagged_columns])

print(len(highly_correlated))

0


Remove highly correlated features

In [None]:
# Remove the flagged columns from every split so all three keep identical feature sets.
train_X, valid_X, test_X = (
    split.drop(columns=highly_correlated) for split in (train_X, valid_X, test_X)
)

Standardization of features

In [None]:
# Fit the scaler on the training features only, then reuse it for every split.
scaler = StandardScaler().fit(train_X)

train_features_standardized = scaler.transform(train_X)
valid_features_standardized = scaler.transform(valid_X)
test_features_standardized = scaler.transform(test_X)

Extracting features

In [None]:
# Fraction of the total variance the retained principal components must explain.
variance_threshold = 0.95

# A fractional n_components makes PCA choose the number of components itself;
# this mode is used together with the 'full' SVD solver.
pca_transformer = PCA(svd_solver='full', n_components=variance_threshold)

# Fit the projection on the training split and project it in the same call.
pca_train_features_transformed = pca_transformer.fit_transform(train_features_standardized)

# Project the remaining splits with the components fitted on the training split.
pca_valid_features_transformed, pca_test_features_transformed = (
    pca_transformer.transform(split)
    for split in (valid_features_standardized, test_features_standardized)
)

Final evaluation

In [None]:
# Fresh KNN classifier (default hyper-parameters) for the PCA-reduced features.
knn_model = KNeighborsClassifier()

# Train the KNN model on the training data.
# train_y is a single-column frame; flatten it to shape (n_samples,) so
# scikit-learn receives the 1-D target it expects and does not have to
# convert a column vector itself (which it reports with a warning).
knn_model.fit(pca_train_features_transformed, np.ravel(train_y))

# Predict on the train data
y_pred_train = knn_model.predict(pca_train_features_transformed)

# Calculate metrics for classification evaluation on train data.
# 'weighted' averages the per-class scores by class support.
accuracy_train = accuracy_score(train_y, y_pred_train)
precision_train = precision_score(train_y, y_pred_train, average='weighted', zero_division=1)
recall_train = recall_score(train_y, y_pred_train, average='weighted')

print("Metrics for KNN on train data:")
print(f"Accuracy: {accuracy_train:.2f}")
print(f"Precision: {precision_train:.2f}")
print(f"Recall: {recall_train:.2f}")
print("\n")

# Predict on the validation data
y_pred_valid = knn_model.predict(pca_valid_features_transformed)

# Calculate metrics for classification evaluation on validation data
accuracy_valid = accuracy_score(valid_y, y_pred_valid)
precision_valid = precision_score(valid_y, y_pred_valid, average='weighted', zero_division=1)
recall_valid = recall_score(valid_y, y_pred_valid, average='weighted')

print("Metrics for KNN on validation data:")
print(f"Accuracy: {accuracy_valid:.2f}")
print(f"Precision: {precision_valid:.2f}")
print(f"Recall: {recall_valid:.2f}")
print("\n")

# Predict on the test data; these predictions are only stored, not scored here.
y_pred_test = knn_model.predict(pca_test_features_transformed)

  return self._fit(X, y)


Metrics for KNN on train data:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99


Metrics for KNN on validation data:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99




CSV Generation

In [None]:

# Number of PCA components kept, i.e. the width of the reduced test matrix.
feature_count = pca_test_features_transformed.shape[1]

# Column names new_feature_1 ... new_feature_<feature_count>.
header_row = ['new_feature_' + str(position) for position in range(1, feature_count + 1)]

df = pd.DataFrame(pca_test_features_transformed, columns=header_row)

# Columns placed in front of the new features, in output order.
leading_columns = (
    ('Predicted labels before feature engineering', y_pred_before_test),
    ('Predicted labels after feature engineering', y_pred_test),
    ('No of new features', np.repeat(feature_count, pca_test_features_transformed.shape[0])),
)
for position, (column_name, column_values) in enumerate(leading_columns):
    df.insert(loc=position, column=column_name, value=column_values)

# Write the result without the DataFrame index.
df.to_csv('/content/drive/MyDrive/lab1/output/190112E_label_4.csv', index=False)