In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.decomposition import PCA
#grid search
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score




In [3]:
# Read the training, validation and test splits from the working directory.
# The files are read in the order train -> valid -> test.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'valid.csv', 'test.csv')
)

In [4]:
# Split data into features (X) and target labels (y).
# The four label columns are dropped from the feature matrices; each label is
# kept as its own target Series.
LABEL_COLUMNS = ['label_1', 'label_2', 'label_3', 'label_4']

X_train = train_data.drop(LABEL_COLUMNS, axis=1)
y_label_1_train = train_data['label_1']
y_label_2_train = train_data['label_2']
y_label_3_train = train_data['label_3']
y_label_4_train = train_data['label_4']

X_valid = valid_data.drop(LABEL_COLUMNS, axis=1)
y_valid_label1 = valid_data['label_1']
y_valid_label2 = valid_data['label_2']
y_valid_label3 = valid_data['label_3']
y_valid_label4 = valid_data['label_4']

# The test file carries an 'ID' column instead of labels.
X_test = test_data.drop(['ID'], axis=1)

# Submission frame. It is built on test_data's own index instead of a
# hard-coded row count, so the 'ID' column (aligned on index) and the
# per-label prediction arrays assigned later always match the number of
# test rows, whatever the size of test.csv.
output = pd.DataFrame(index=test_data.index)
output['ID'] = test_data['ID']



In [5]:
# Tally how many training rows fall into each label_4 class
class_distribution = train_data.loc[:, 'label_4'].value_counts()
print(class_distribution)

label_4
6     19938
2      1449
0       955
12      954
7       938
13      482
1       481
11      480
10      480
3       479
5       478
9       472
4       469
8       465
Name: count, dtype: int64


# Apply feature engineering techniques

In [6]:
# Scaling: the scaler is fitted on the training features only and the same
# fitted scaler is then applied to the training, validation and test features.
sc = RobustScaler()
sc.fit(X_train)
X_train_scaled, X_valid_scaled, X_test_scaled = (
    sc.transform(features) for features in (X_train, X_valid, X_test)
)

# Dimensionality reduction: a fractional n_components asks PCA for the
# desired share of explained variance rather than a fixed component count.
desired_variance = 0.97  # Set the desired explained variance
pca = PCA(svd_solver='full', n_components=desired_variance)

# PCA is fitted on the scaled training features; validation and test
# features are only projected with the fitted components.
X_train_pca = pca.fit_transform(X_train_scaled)
X_valid_pca = pca.transform(X_valid_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Number of components PCA actually kept for the variance threshold
n_components = pca.n_components_

# Cross validation for label 1

In [8]:
# 5-fold cross-validation on the PCA features for label_1: linear-kernel SVC
cv_score = cross_val_score(
    estimator=SVC(kernel='linear', gamma='auto', random_state=42),
    X=X_train_pca,
    y=y_label_1_train,
    cv=5,
)

print(f"Cross-validation accuracy for svc label_1: {np.mean(cv_score):.2f}")

# Same folds setup, 5-nearest-neighbours classifier
cv_score = cross_val_score(
    estimator=KNN(n_neighbors=5),
    X=X_train_pca,
    y=y_label_1_train,
    cv=5,
)

print(f"Cross-validation accuracy for knn_label_1: {np.mean(cv_score):.2f}")

Cross-validation accuracy for svc label_1: 0.89


[WinError 2] The system cannot find the file specified
  File "c:\users\sumeela\appdata\local\programs\python\python39\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\users\sumeela\appdata\local\programs\python\python39\lib\subprocess.py", line 501, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\users\sumeela\appdata\local\programs\python\python39\lib\subprocess.py", line 947, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\users\sumeela\appdata\local\programs\python\python39\lib\subprocess.py", line 1416, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Cross-validation accuracy for knn_label_1: 0.78


# Grid search for label 1

In [7]:
# Grid search definition for the label_1 SVC.
# param_grid is the single source of truth for the searched values; the
# separate kernel/gamma lists that merely duplicated it were unused and
# have been removed.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['auto', 'scale']}

# NOTE: the search object is only constructed here, it is never fitted in
# this cell. The model below uses fixed hyper-parameters instead
# (presumably the result of an earlier run of this search - TODO confirm).
gs_ = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)

# Final label_1 classifier, trained on the PCA-reduced training features.
gs = SVC(C=10, gamma='scale', kernel='rbf')
gs = gs.fit(X_train_pca, y_label_1_train)

# Evaluate on the validation split and persist the predictions.
pred_label_1 = gs.predict(X_valid_pca)
validate1 = pd.DataFrame()
validate1['label1'] = pred_label_1
validate1.to_csv('validate1.csv', index=False)

# With average='micro' on a single-label multiclass target, precision,
# recall and F1 all coincide with accuracy, so the four numbers printed
# below are expected to be identical.
print('Validate accuracy: %.3f' % accuracy_score(y_true=y_valid_label1, y_pred=pred_label_1))
print('Validate precision: %.3f' % precision_score(y_true=y_valid_label1, y_pred=pred_label_1, average='micro'))
print('Validate recall: %.3f' % recall_score(y_true=y_valid_label1, y_pred=pred_label_1, average='micro'))
print('Validate f1: %.3f' % f1_score(y_true=y_valid_label1, y_pred=pred_label_1, average='micro'))

Validate accuracy: 0.939
Validate precision: 0.939
Validate recall: 0.939
Validate f1: 0.939


# Cross validation for label 2


In [7]:
# label_2 preprocessing: keep only the rows where label_2 is present, then
# build a scaling + PCA pipeline fitted on that subset.
train_data_label_2 = train_data[train_data['label_2'].notna()]
X_train_label_2 = train_data_label_2.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1)
y_label2_train = train_data_label_2['label_2']

# A dedicated scaler is used for label_2 so that the shared `sc` (fitted on
# the full training set for the other labels) is not silently refitted.
sc_label2 = RobustScaler()
X_train_label2_scaled = sc_label2.fit_transform(X_train_label_2)

valid_data_label_2 = valid_data[valid_data['label_2'].notna()]
X_valid_label_2 = valid_data_label_2.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1)
y_valid_label_2 = valid_data_label_2['label_2']

X_test_label_2 = test_data.drop(['ID'], axis=1)

# Validation and test features are transformed with the label_2 scaler only.
X_valid_label2_scaled = sc_label2.transform(X_valid_label_2)
X_test_label_2_scaled = sc_label2.transform(X_test_label_2)


# Calculate the variance threshold
desired_variance = 0.97  # Set the desired explained variance
pca_label2 = PCA(n_components=desired_variance, svd_solver='full')
X_train_label2_pca = pca_label2.fit_transform(X_train_label2_scaled)
X_valid_label2_pca = pca_label2.transform(X_valid_label2_scaled)
X_test_label2_pca = pca_label2.transform(X_test_label_2_scaled)


# Number of components kept by the label_2 PCA (read from pca_label2, not
# from the PCA object fitted for the other labels).
n_components_label2 = pca_label2.n_components_

print(X_train_label2_pca.shape)
print(y_label2_train.shape)

(28040, 217)
(28040,)


In [12]:
# 5-fold cross-validation on the label_2 PCA features: 5-nearest-neighbours
cv_score = cross_val_score(
    estimator=KNN(n_neighbors=5),
    X=X_train_label2_pca,
    y=y_label2_train,
    cv=5,
)
print(f"Cross-validation accuracy for knn_label_2: {np.mean(cv_score):.2f}")

# Same data, linear-kernel SVC
cv_score = cross_val_score(
    estimator=SVC(kernel='linear', gamma='auto', random_state=42),
    X=X_train_label2_pca,
    y=y_label2_train,
    cv=5,
)
print(f"Cross-validation accuracy for svc_label_2: {np.mean(cv_score):.2f}")


Cross-validation accuracy for knn_label_2: 0.49
Cross-validation accuracy for svc_label_2: 0.42


# Grid search for label 2

In [17]:
# KNN search space for label_2
param_grid = {
    'n_neighbors': np.arange(1, 25),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],  # Corresponds to Manhattan and Euclidean distances
}

# The search object is only defined here; it is not fitted in this cell.
gs_label_2_ = GridSearchCV(KNN(), param_grid, scoring='accuracy', n_jobs=-1, cv=5)

# label_2 classifier with fixed hyper-parameters, trained on the label_2
# PCA features (fit() returns the estimator itself, so no re-binding needed).
gs_label_2 = KNN(n_neighbors=5, weights='uniform', p=1)
gs_label_2.fit(X_train_label2_pca, y_label2_train)

# Predict the validation split and save the predictions to disk
pred_label_2 = gs_label_2.predict(X_valid_label2_pca)
validate2 = pd.DataFrame({'label2': pred_label_2})
validate2.to_csv('validate2.csv', index=False)

# Validation metrics (precision / recall / F1 are micro-averaged)
print('Validate accuracy: %.3f' % accuracy_score(y_valid_label_2, pred_label_2))
print('Validate precision: %.3f' % precision_score(y_valid_label_2, pred_label_2, average='micro'))
print('Validate recall: %.3f' % recall_score(y_valid_label_2, pred_label_2, average='micro'))
print('Validate f1: %.3f' % f1_score(y_valid_label_2, pred_label_2, average='micro'))

Validate accuracy: 0.859
Validate precision: 0.859
Validate recall: 0.859
Validate f1: 0.859


In [16]:
# SVC search space for label_2: every combination of C, kernel and gamma
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['auto', 'scale'],
}

# Exhaustive 5-fold grid search scored on accuracy, using all CPU cores.
# fit() returns the search object itself, so the name is bound once.
gs_label_2_svc = GridSearchCV(SVC(), param_grid, scoring='accuracy', n_jobs=-1, cv=5)
gs_label_2_svc.fit(X_train_label2_pca, y_label2_train)

# Score the search's prediction on the label_2 validation split
pred_label_2_svc = gs_label_2_svc.predict(X_valid_label2_pca)
print('Validate accuracy: %.3f' % accuracy_score(y_valid_label_2, pred_label_2_svc))
print('Validate precision: %.3f' % precision_score(y_valid_label_2, pred_label_2_svc, average='micro'))
print('Validate recall: %.3f' % recall_score(y_valid_label_2, pred_label_2_svc, average='micro'))
print('Validate f1: %.3f' % f1_score(y_valid_label_2, pred_label_2_svc, average='micro'))


KeyboardInterrupt: 

# Cross validation for label 3

In [17]:
# 5-fold cross-validation on the shared PCA features for label_3: KNN (k=5)
cv_score = cross_val_score(
    estimator=KNN(n_neighbors=5),
    X=X_train_pca,
    y=y_label_3_train,
    cv=5,
)
print(f"Cross-validation accuracy for knn_label_3: {np.mean(cv_score):.2f}")

# Same data, linear-kernel SVC
cv_score = cross_val_score(
    estimator=SVC(kernel='linear', gamma='auto', random_state=42),
    X=X_train_pca,
    y=y_label_3_train,
    cv=5,
)
print(f"Cross-validation accuracy for svc_label_3: {np.mean(cv_score):.2f}")


Cross-validation accuracy for knn_label_3: 0.92
Cross-validation accuracy for svc_label_3: 0.98


# Grid search for label 3

In [18]:
# SVC search space for label_3
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['auto', 'scale'],
}

# The search object is only defined here; it is not fitted in this cell.
gs_label_3_svc_ = GridSearchCV(SVC(), param_grid, scoring='accuracy', n_jobs=-1, cv=5)

# label_3 classifier with fixed hyper-parameters, trained on the shared PCA
# features (fit() returns the estimator itself, so no re-binding needed).
gs_label_3_svc = SVC(kernel='rbf', C=10, gamma='scale')
gs_label_3_svc.fit(X_train_pca, y_label_3_train)

# Predict the validation split and save the predictions to disk
pred_label_3_svc = gs_label_3_svc.predict(X_valid_pca)
validate3 = pd.DataFrame({'label3': pred_label_3_svc})
validate3.to_csv('validate3.csv', index=False)

# Validation metrics (precision / recall / F1 are micro-averaged)
print('Validate accuracy for label 3: %.3f' % accuracy_score(y_valid_label3, pred_label_3_svc))
print('Validate precision for label 3: %.3f' % precision_score(y_valid_label3, pred_label_3_svc, average='micro'))
print('Validate recall for label 3: %.3f' % recall_score(y_valid_label3, pred_label_3_svc, average='micro'))
print('Validate f1 for label 3: %.3f' % f1_score(y_valid_label3, pred_label_3_svc, average='micro'))


Validate accuracy for label 3: 0.997
Validate precision for label 3: 0.997
Validate recall for label 3: 0.997
Validate f1 for label 3: 0.997


# Cross validation for label 4

In [19]:
# 5-fold cross-validation on the shared PCA features for label_4: KNN (k=5)
cv_score = cross_val_score(
    estimator=KNN(n_neighbors=5),
    X=X_train_pca,
    y=y_label_4_train,
    cv=5,
)
print(f"Cross-validation accuracy for knn_label_4: {np.mean(cv_score):.2f}")

# Same data, linear-kernel SVC
cv_score = cross_val_score(
    estimator=SVC(kernel='linear', gamma='auto', random_state=42),
    X=X_train_pca,
    y=y_label_4_train,
    cv=5,
)
print(f"Cross-validation accuracy for svc_label_4: {np.mean(cv_score):.2f}")

Cross-validation accuracy for knn_label_4: 0.82
Cross-validation accuracy for svc_label_4: 0.82


# Grid search for label 4

In [19]:
# KNN search space for label_4
param_grid = {
    'n_neighbors': np.arange(1, 25),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# The search object is only defined here; it is not fitted in this cell.
gs_label_4_ = GridSearchCV(KNN(), param_grid, scoring='accuracy', n_jobs=-1, cv=5)

# label_4 classifier with fixed hyper-parameters, trained on the shared PCA
# features (fit() returns the estimator itself, so no re-binding needed).
gs_label_4 = KNN(n_neighbors=10, weights='distance', p=1)
gs_label_4.fit(X_train_pca, y_label_4_train)

# Predict the validation split and save the predictions to disk
pred_label_4 = gs_label_4.predict(X_valid_pca)
validate4 = pd.DataFrame({'label4': pred_label_4})
validate4.to_csv('validate4.csv', index=False)

# Validation metrics (precision / recall / F1 are micro-averaged)
print('Validate accuracy for label 4: %.3f' % accuracy_score(y_valid_label4, pred_label_4))
print('Validate precision for label 4: %.3f' % precision_score(y_valid_label4, pred_label_4, average='micro'))
print('Validate recall for label 4: %.3f' % recall_score(y_valid_label4, pred_label_4, average='micro'))
print('Validate f1 for label 4: %.3f' % f1_score(y_valid_label4, pred_label_4, average='micro'))


Validate accuracy for label 4: 0.923
Validate precision for label 4: 0.923
Validate recall for label 4: 0.923
Validate f1 for label 4: 0.923


In [23]:
# SVC search space for label_4: every combination of C, kernel and gamma
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['auto', 'scale'],
}

# Exhaustive 5-fold grid search scored on accuracy, using all CPU cores.
# fit() returns the search object itself, so the name is bound once.
gs_label_4_svc = GridSearchCV(SVC(), param_grid, scoring='accuracy', n_jobs=-1, cv=5)
gs_label_4_svc.fit(X_train_pca, y_label_4_train)

# Report the best cross-validated score and the parameters that achieved it
print(gs_label_4_svc.best_score_)
print(gs_label_4_svc.best_params_)

# Score the search's prediction on the validation split
pred_label_4_svc = gs_label_4_svc.predict(X_valid_pca)
print('Validate accuracy for label 4: %.3f' % accuracy_score(y_valid_label4, pred_label_4_svc))
print('Validate precision for label 4: %.3f' % precision_score(y_valid_label4, pred_label_4_svc, average='micro'))
print('Validate recall for label 4: %.3f' % recall_score(y_valid_label4, pred_label_4_svc, average='micro'))
print('Validate f1 for label 4: %.3f' % f1_score(y_valid_label4, pred_label_4_svc, average='micro'))


KeyboardInterrupt: 

In [15]:
# Summary statistics (count, mean, std, quartiles, min/max) of the test features
test_feature_summary = X_test.describe()
print(test_feature_summary)

        feature_1   feature_2   feature_3   feature_4   feature_5   feature_6  \
count  744.000000  744.000000  744.000000  744.000000  744.000000  744.000000   
mean     0.005398    0.046413   -0.003152    0.007886   -0.084819   -0.042217   
std      0.077064    0.047604    0.060536    0.081165    0.081220    0.056322   
min     -0.215194   -0.088241   -0.179731   -0.226406   -0.463127   -0.235149   
25%     -0.044833    0.013775   -0.044619   -0.036708   -0.132045   -0.077440   
50%     -0.000147    0.048187   -0.006010    0.013099   -0.083526   -0.041958   
75%      0.049941    0.080107    0.036146    0.059325   -0.032098   -0.003731   
max      0.231716    0.194577    0.232407    0.232575    0.214991    0.143599   

        feature_7   feature_8   feature_9  feature_10  ...  feature_759  \
count  744.000000  744.000000  744.000000  744.000000  ...   744.000000   
mean     0.115549   -0.060058   -0.014774   -0.020346  ...    -0.004586   
std      0.099040    0.067562    0.060351    

In [24]:

# Predict every label for the test set. label_2 uses the features produced
# by its own scaler/PCA; the other three labels share X_test_pca.
pred_test_label1 = gs.predict(X_test_pca)
pred_test_label2 = gs_label_2.predict(X_test_label2_pca)
pred_test_label3 = gs_label_3_svc.predict(X_test_pca)
pred_test_label4 = gs_label_4.predict(X_test_pca)

# Attach the predictions to the submission frame, in label order
for column_name, predictions in (
    ('label_1', pred_test_label1),
    ('label_2', pred_test_label2),
    ('label_3', pred_test_label3),
    ('label_4', pred_test_label4),
):
    output[column_name] = predictions

# Write the submission file without the index column
output.to_csv('label.csv', index=False)
