Connecting drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the CSV
# paths used later in this notebook are all rooted under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

Importing datasets

In [None]:
# Read the three splits from the mounted Drive folder in one pass.
train, valid, test = map(pd.read_csv, (
    '/content/drive/MyDrive/lab1/train.csv',
    '/content/drive/MyDrive/lab1/valid.csv',
    '/content/drive/MyDrive/lab1/test.csv',
))

# Preview the first rows of the training split.
train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,label_1,label_2,label_3,label_4
0,-1.928826,-0.29154,1.105909,2.070652,0.657838,-0.67094,-0.968238,0.049916,1.111149,-1.834592,...,-0.691538,-1.152522,0.441157,-1.494434,-0.344974,-0.395641,45,,1,6
1,-1.709277,-1.008998,0.956611,4.313823,-0.669455,1.928765,0.16255,0.152173,2.356505,-2.898549,...,-0.668235,-0.532695,0.151163,-1.876885,1.651534,-2.70649,45,,1,6
2,-1.906183,-0.357562,-0.682627,4.651838,-0.88494,1.159512,1.354481,-1.432248,2.160144,-3.175735,...,-0.026728,0.440408,2.745906,-0.680754,-0.643588,-2.587203,45,,1,6
3,0.697872,-2.218567,-0.572214,1.192062,0.329554,0.741364,0.962354,0.993512,2.896854,-1.60038,...,0.709032,0.812106,1.055266,-0.915258,-0.924856,-1.318964,45,,1,6
4,-1.976895,-0.496308,0.368102,3.739787,-0.132058,1.23498,-0.501003,0.881554,1.630941,-3.853953,...,-0.5997,-1.024829,1.869753,-2.49857,3.268253,-1.356606,45,,1,6


Preprocessing Data

In [None]:
# Rows without a label_2 value cannot be used for fitting or scoring, so they
# are removed from the train and validation splits (test is left untouched).
columns_to_check = ['label_2']
train = train.dropna(subset=columns_to_check, how='any')
valid = valid.dropna(subset=columns_to_check, how='any')

# Impute any remaining missing values with the per-column mean of each split.
datasets = {'train': train, 'valid': valid, 'test': test}

for name, dataset in datasets.items():
    datasets[name] = dataset.fillna(dataset.mean())

# fillna returns new frames; rebind the names used by the rest of the notebook.
# Without this the imputed frames live only inside `datasets` and every later
# cell keeps working on the un-imputed data.
train, valid, test = datasets['train'], datasets['valid'], datasets['test']

valid.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,label_1,label_2,label_3,label_4
14,-0.172181,-2.506089,-0.087602,1.013303,0.83855,-0.172347,-0.543211,1.23544,1.376382,-0.972239,...,-1.006134,0.184453,0.415477,-1.716953,-0.827742,-2.045459,5,25.0,1,6
15,-0.681202,-1.663909,1.788502,0.857073,0.709822,0.8349,0.039072,1.219533,0.483133,-1.896562,...,1.675475,0.322208,1.401139,-1.54936,-1.246808,-1.945575,5,25.0,1,6
16,-1.634366,0.461024,0.61544,1.199882,0.084413,-1.177391,0.557994,0.975045,0.033425,-0.920055,...,-0.532355,0.251711,1.900047,-2.395971,-0.667466,-2.955309,5,25.0,1,6
17,-0.963445,-0.21536,-0.653633,0.456256,0.52319,0.178436,0.157641,0.747167,-0.100809,-2.305725,...,-0.296968,-0.438388,1.488559,-1.781888,0.081912,-2.422592,5,25.0,1,6
18,-0.583766,-1.942095,-0.68659,1.630802,0.92144,0.80916,-0.389961,1.604646,-0.304054,-1.762844,...,0.749547,-0.493671,2.532719,-1.705593,1.29116,-2.13586,5,25.0,1,6


Separate labels and features

In [None]:
def separate_features_labels(data, target='label_2',
                             label_columns=('label_1', 'label_2', 'label_3', 'label_4')):
    """Split a dataset into its feature columns and a single target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and every column named in
        ``label_columns``.
    target : str, default 'label_2'
        Name of the label column to return as the prediction target.
    label_columns : sequence of str
        All label columns; these are removed from the returned features.

    Returns
    -------
    features : pandas.DataFrame
        ``data`` without the label columns (the input frame is not modified).
    labels : pandas.DataFrame
        One-column frame holding ``target``. It is kept two-dimensional on
        purpose so existing callers see the same shape as before.

    Raises
    ------
    KeyError
        If ``target`` or any of ``label_columns`` is missing from ``data``.
    """
    features = data.drop(columns=list(label_columns))
    labels = data[[target]]
    return features, labels

# Apply the same feature/label split to every dataset in one pass.
(train_X, train_y), (valid_X, valid_y), (test_X, test_y) = (
    separate_features_labels(split) for split in (train, valid, test)
)

# Preview the training features.
train_X.head()


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_247,feature_248,feature_249,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256
480,-2.133907,-1.851185,-0.401913,0.57963,0.701037,-0.21269,-0.708243,3.016801,0.464313,-0.8253,...,0.33223,-1.325403,0.563739,-0.18027,-0.598476,2.794869,1.049926,-2.829166,0.908335,-1.295668
481,-0.404385,-0.836757,0.762689,1.637213,0.665621,0.625971,0.262635,1.408128,0.776417,-2.443906,...,-1.451546,-0.435936,-0.216273,-1.313571,1.234601,0.76425,1.84826,-1.138066,-0.143287,-1.997515
482,-1.642496,-1.278682,0.164281,1.371167,0.258172,-0.896479,-0.051402,0.623235,0.367427,-1.211149,...,0.652031,-0.044023,1.547585,-0.239684,0.269733,1.14364,1.827864,-2.056184,-0.070624,-2.222077
483,-0.191927,-2.451684,0.547922,-0.04168,0.804643,-0.324783,0.806728,1.507769,-0.657439,-0.784736,...,-0.799259,-0.702702,0.617131,-0.730333,1.085195,-1.203888,2.832388,-1.94399,0.208316,-2.421425
484,-0.510969,-0.781623,-1.478504,0.794873,0.354653,0.268014,-1.677308,1.627065,0.98587,-2.165971,...,-0.60751,-1.884749,1.849695,0.071822,-0.820887,1.300054,2.499395,-1.692642,0.563708,-0.975423


# 1. Label 2 without feature engineering

Duplicate the labels and features

In [None]:
# Independent copies for the baseline experiment, so the scaling done below
# never touches the frames reused by the feature-engineering section.
train_X_copy, valid_X_copy, test_X_copy = (
    frame.copy() for frame in (train_X, valid_X, test_X)
)

train_y_copy, valid_y_copy, test_y_copy = (
    frame.copy() for frame in (train_y, valid_y, test_y)
)

Standardization

In [None]:
# Scaling statistics are learned from the training split only and then
# reused unchanged for the validation and test splits.
scaler = StandardScaler()
train_X_copy = scaler.fit_transform(train_X_copy)
valid_X_copy, test_X_copy = (
    scaler.transform(split) for split in (valid_X_copy, test_X_copy)
)

In [None]:
# Baseline model: k-nearest-neighbours regression with library defaults,
# trained on all standardized features.
knn_model = KNeighborsRegressor()
knn_model.fit(X=train_X_copy, y=train_y_copy)

In [None]:
# Splits to score the baseline model on, keyed by the name used in the report.
datasets = {
    'train': (train_X_copy, train_y_copy),
    'valid': (valid_X_copy, valid_y_copy),
}

for data_name, (X, y) in datasets.items():
    y_pred = knn_model.predict(X)
    mse, r2s = mean_squared_error(y, y_pred), r2_score(y, y_pred)

    # One print call; the trailing "\n" item reproduces the blank lines
    # that separate the per-split reports.
    print(
        f"Metrics for KNeighborsRegressor on {data_name} data:",
        f"Mean Squared Error: {mse:.2f}",
        f"R-squared Score: {r2s:.2f}",
        "\n",
        sep="\n",
    )

# Baseline predictions for the test split, written to the output CSV later.
y_pred_before_test = knn_model.predict(test_X_copy)

Metrics for KNeighborsRegressor on train data:
Mean Squared Error: 0.38
R-squared Score: 0.99


Metrics for KNeighborsRegressor on valid data:
Mean Squared Error: 0.62
R-squared Score: 0.99




# 2. Label 2 with feature engineering

Identifying highly correlated features

In [None]:
# Pairwise correlation between every pair of training feature columns.
correlation_matrix = train_X.corr()

# Absolute correlations above this value count as "highly correlated".
correlation_threshold = 0.45

# Keep only the strictly upper triangle: k=1 clears the diagonal and
# everything below it, so each feature pair is considered exactly once.
mask = np.triu(correlation_matrix.abs().gt(correlation_threshold), k=1)

# A column is flagged when it is highly correlated with at least one column
# that appears before it in the frame, so the earlier column of a pair is kept.
highly_correlated = {
    column
    for column, flagged in zip(correlation_matrix.columns, mask.any(axis=0))
    if flagged
}

print(highly_correlated)

{'feature_63', 'feature_164', 'feature_198', 'feature_48', 'feature_131', 'feature_237', 'feature_73', 'feature_161', 'feature_103', 'feature_162', 'feature_19', 'feature_173', 'feature_163', 'feature_226', 'feature_144', 'feature_174', 'feature_139', 'feature_137', 'feature_179', 'feature_183', 'feature_225', 'feature_21', 'feature_152', 'feature_234', 'feature_43', 'feature_58', 'feature_109', 'feature_250', 'feature_227', 'feature_106', 'feature_202', 'feature_247', 'feature_253', 'feature_94', 'feature_101', 'feature_200', 'feature_230', 'feature_203', 'feature_122', 'feature_115', 'feature_38', 'feature_184', 'feature_213', 'feature_222', 'feature_236', 'feature_108', 'feature_120', 'feature_209', 'feature_130', 'feature_254', 'feature_175'}


Remove highly correlated features

In [None]:
# Remove the flagged columns from every split. errors='ignore' keeps this cell
# re-runnable: the frames are rebound in place, so without it a second run
# raises KeyError because the columns are already gone.
columns_to_drop = list(highly_correlated)
train_X = train_X.drop(columns=columns_to_drop, errors='ignore')
valid_X = valid_X.drop(columns=columns_to_drop, errors='ignore')
test_X = test_X.drop(columns=columns_to_drop, errors='ignore')

Standardization of features

In [None]:
# A fresh scaler for the reduced feature set; its statistics come from the
# training split only and are reused for the validation and test splits.
scaler = StandardScaler()

train_features_standardized = scaler.fit_transform(train_X)
valid_features_standardized, test_features_standardized = (
    scaler.transform(split) for split in (valid_X, test_X)
)

Extracting features

In [None]:
# Fraction of the total variance the retained principal components must explain.
variance_threshold = 0.95

# Passing a float as n_components (with the 'full' solver) lets PCA choose
# how many components are needed to reach that explained-variance fraction.
pca_transformer = PCA(n_components=variance_threshold, svd_solver='full')

# Components are learned on the training split only, then the same projection
# is applied to the validation and test splits.
pca_train_features_transformed = pca_transformer.fit_transform(train_features_standardized)
pca_valid_features_transformed, pca_test_features_transformed = (
    pca_transformer.transform(split)
    for split in (valid_features_standardized, test_features_standardized)
)

Model prediction

In [None]:
# Splits to score the feature-engineered model on, keyed by report name.
datasets = {
    'train': (pca_train_features_transformed, train_y),
    'valid': (pca_valid_features_transformed, valid_y),
}

# Same estimator as the baseline, now trained on the PCA components.
model = KNeighborsRegressor()
model.fit(pca_train_features_transformed, train_y)

for data_name, (X, y) in datasets.items():
    y_pred = model.predict(X)
    mse, r2s = mean_squared_error(y, y_pred), r2_score(y, y_pred)

    # One print call; the trailing "\n" item reproduces the blank lines
    # that separate the per-split reports.
    print(
        f"Metrics for KNeighborsRegressor on {data_name} data:",
        f"Mean Squared Error: {mse:.2f}",
        f"R-squared Score: {r2s:.2f}",
        "\n",
        sep="\n",
    )

# Predictions for the test split, written to the output CSV later.
y_pred_test = model.predict(pca_test_features_transformed)

Metrics for KNeighborsRegressor on train data:
Mean Squared Error: 0.50
R-squared Score: 0.98


Metrics for KNeighborsRegressor on valid data:
Mean Squared Error: 2.05
R-squared Score: 0.95




CSV Generation

In [None]:
# Column names for the PCA components: new_feature_1 ... new_feature_<k>.
feature_count = pca_test_features_transformed.shape[1]
header_row = [f'new_feature_{i}' for i in range(1, feature_count + 1)]

df = pd.DataFrame(pca_test_features_transformed, columns=header_row)

# The models were fitted on a one-column label frame, so predict() is expected
# to return an (n, 1) array; np.ravel flattens it into a plain 1-D column and
# is a no-op if the predictions are already 1-D.
df.insert(loc=0, column='Predicted labels before feature engineering', value=np.ravel(y_pred_before_test))
df.insert(loc=1, column='Predicted labels after feature engineering', value=np.ravel(y_pred_test))
# A scalar value is broadcast to every row.
df.insert(loc=2, column='No of new features', value=feature_count)

# Create the output folder first so the write does not fail on a fresh Drive.
output_path = '/content/drive/MyDrive/lab1/output/190112E_label_2.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)