In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def read_dataset(file, test=False):
  """Load a comma-separated data file into a typed DataFrame.

  Every non-blank line is one sample made of 33 feature fields (X1..X33)
  followed, unless ``test`` is true, by one label field (Y). The literal
  field ``NA`` marks a missing value.

  Args:
    file: Path of the file to read.
    test: When True the file carries no label field and the returned frame
      has only the columns X1..X33.

  Returns:
    A DataFrame with columns X1..X33 (plus Y when ``test`` is False).
    X1..X14 keep the raw field text (None where the field was ``NA``);
    X15..X33 and Y are float32 (NaN where the field was ``NA``). A file
    without any data line yields an empty frame with the same columns.

  Raises:
    ValueError: If a line does not have the expected number of fields, or
      a field of a numeric column cannot be parsed as a float.
  """
  n_features = 33
  n_nominal = 14  # X1..X14 are kept as text, every later column is numeric

  cols = ['X' + str(i) for i in range(1, n_features + 1)]
  if not test:
    cols.append('Y')
  types = {name: 'float32' for name in cols[n_nominal:]}

  rows = []
  with open(file) as f:
    # Iterate lazily; there is no need to hold the whole file in memory.
    for line_no, line in enumerate(f, start=1):
      line = line.strip()
      if not line:
        continue

      fields = line.split(',')
      if len(fields) != len(cols):
        raise ValueError(
            '%s line %d: expected %d fields, found %d'
            % (file, line_no, len(cols), len(fields)))

      row = []
      for name, value in zip(cols, fields):
        if value == 'NA':
          row.append(None)
        elif name in types:
          # Only numeric columns are parsed, so a nominal value that happens
          # to look like a number keeps its text form in every row.
          try:
            row.append(float(value))
          except ValueError:
            raise ValueError(
                '%s line %d: cannot parse %r as a number for column %s'
                % (file, line_no, value, name)) from None
        else:
          row.append(value)
      rows.append(row)

  # Build the frame straight from the Python rows: stacking mixed-type rows
  # through NumPy would silently coerce numbers to text.
  return pd.DataFrame(rows, columns=cols).astype(types)

In [None]:
# Load the training data and list which columns contain missing values.
#
# Planned imputation for those columns (carried out by fill_missing_values):
# - string-valued columns: fill with the column's most frequent value
# - numerical columns: fill with zero, on the stated assumption that the data
#   is normalized so that zero is its mean
#   (NOTE(review): the normalization is not checked in this notebook - confirm)

dataset = read_dataset('train.csv')
dataset.isna().any()

X1      True
X2      True
X3      True
X4     False
X5     False
X6      True
X7      True
X8      True
X9     False
X10    False
X11    False
X12    False
X13     True
X14     True
X15     True
X16     True
X17     True
X18     True
X19     True
X20     True
X21     True
X22     True
X23     True
X24     True
X25     True
X26     True
X27     True
X28     True
X29     True
X30     True
X31     True
X32     True
X33     True
Y      False
dtype: bool

In [None]:
def find_mode(ar):
  """Return the most frequent value in ``ar``.

  Counting is done with ``np.unique`` rather than ``scipy.stats.mode`` so
  that text values are supported and the result does not depend on the
  installed SciPy version.

  Args:
    ar: Array-like of mutually comparable values (e.g. all strings or all
      numbers). Multi-dimensional input is treated as flattened.

  Returns:
    The value that occurs most often; on a tie, the smallest tied value.

  Raises:
    ValueError: If ``ar`` contains no values.
  """
  values, counts = np.unique(np.asarray(ar), return_counts=True)
  if values.size == 0:
    raise ValueError('find_mode() requires at least one value')
  # np.unique returns values sorted and argmax picks the first maximum, so
  # ties resolve to the smallest value.
  return values[np.argmax(counts)]

In [None]:
def fill_missing_values(dataset):
  """Fill the missing values of ``dataset`` in place and return it.

  Nominal columns (X1..X14) are filled with their most frequent observed
  value; numeric columns (X15..X33) are filled with 0. Columns that are not
  present in ``dataset`` are skipped, and a nominal column with no observed
  value at all is left untouched.

  Args:
    dataset: DataFrame using the X1..X33 column naming; it is modified in
      place.

  Returns:
    The same DataFrame object that was passed in.
  """
  values = {}

  for i in range(1, 15):
    col = 'X' + str(i)
    if col not in dataset.columns:
      continue
    # Series.mode() ignores missing entries, so a frequent NA can never be
    # chosen as the fill value. Its result is sorted, which makes ties
    # resolve to the smallest value.
    modes = dataset[col].mode()
    if not modes.empty:
      values[col] = modes.iloc[0]

  for i in range(15, 34):
    col = 'X' + str(i)
    if col in dataset.columns:
      # Zero is used on the assumption that these columns are normalized
      # (NOTE(review): not verified here - confirm against the data).
      values[col] = 0

  dataset.fillna(value=values, inplace=True)
  print(dataset.head(n=10))

  return dataset

In [None]:
# Impute the training frame; fill_missing_values modifies it in place, prints
# its first 10 rows and returns it.
dataset = fill_missing_values(dataset)

   X1  X2  X3  X4  X5  X6  X7  ...    X28    X29    X30    X31    X32    X33    Y
0  V1  V1  V1  V1  V1  V1  V1  ...  0.182  0.034 -0.172  0.401  0.393  0.216  1.0
1  V1  V1  V1  V1  V1  V1  V2  ...  1.098  0.034  1.160  0.401  0.037  0.216  1.0
2  V1  V1  V1  V1  V1  V1  V1  ...  1.098  0.034  0.716  0.401  0.724  0.216 -1.0
3  V1  V1  V1  V1  V1  V1  V1  ...  1.098  0.034  0.716  0.401  0.712  0.216 -1.0
4  V1  V1  V1  V1  V1  V1  V1  ...  0.182  0.034  0.716  0.401  0.393  0.216  1.0
5  V1  V1  V1  V1  V1  V1  V1  ...  0.731  0.034  0.716  0.401  0.724  0.216  1.0
6  V1  V1  V1  V1  V1  V1  V1  ... -0.735  0.034  1.160  0.401 -0.605  0.216  1.0
7  V1  V1  V1  V1  V1  V1  V1  ...  0.182  0.034  0.716  0.401  0.037  0.216 -1.0
8  V2  V1  V1  V2  V2  V1  V1  ...  0.182  0.034  1.160  0.401  0.037  0.216  1.0
9  V2  V1  V1  V2  V2  V1  V1  ...  0.182 -0.510 -0.394  0.401 -0.605  0.216 -1.0

[10 rows x 34 columns]


In [None]:
# Separate the feature columns from the label column; labels become int32.
X = dataset.drop(columns='Y')
Y = dataset.loc[:, 'Y'].astype('int32')

In [None]:
# One-hot encode the nominal columns X1..X14 and append the numeric columns
# X15..X33 unchanged.
nominal_cols = ['X' + str(i) for i in range(1, 15)]
numeric_cols = ['X' + str(i) for i in range(15, 34)]

# The encoder is fitted on the training data only and reused for other data.
# handle_unknown='ignore' makes a category that was not seen during fitting
# encode as an all-zero block instead of raising in transform().
encoder = OneHotEncoder(handle_unknown='ignore')

nominal_data = X[nominal_cols]

encoder.fit(nominal_data)

transformed_data = encoder.transform(nominal_data).toarray()

X_ready = np.hstack((transformed_data, X[numeric_cols].to_numpy()))

In [None]:
# Tune an SVM classifier: grid search over kernel and C, scored with
# 5-fold stratified cross-validation.
svc_param_grid = {'kernel': ['rbf', 'linear'], 'C': [0.1, 1, 10, 100]}
cv_folds = StratifiedKFold(n_splits=5)

gscv = GridSearchCV(estimator=SVC(), param_grid=svc_param_grid, cv=cv_folds)

gscv.fit(X_ready, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
# Mean cross-validated score of the best parameter combination
# (scoring was left at the estimator's default).
print(gscv.best_score_)

# Keep the estimator that the grid search refitted with the best parameters.
best_cls = gscv.best_estimator_

0.6975


In [None]:
# Run the test data through the same preprocessing steps as the training data:
# load, impute, one-hot encode with the already-fitted encoder, then append
# the numeric columns.
# NOTE(review): fill_missing_values derives its fill values from the frame it
# is given, so the test set is imputed with its own statistics rather than the
# training ones - confirm this is intended.
test_data = fill_missing_values(read_dataset('test.csv', test=True))

nominal_cols = ['X%d' % i for i in range(1, 15)]
numeric_cols = ['X%d' % i for i in range(15, 34)]

nominal_data = test_data[nominal_cols]
transformed_data = encoder.transform(nominal_data).toarray()

numeric_block = test_data[numeric_cols].to_numpy()
X_test_ready = np.hstack((transformed_data, numeric_block))

   X1  X2  X3  X4  X5  X6  X7  ...    X27    X28    X29    X30    X31    X32    X33
0  V2  V1  V1  V1  V2  V1  V1  ... -0.092  0.182  2.751 -0.394  0.401  0.724  0.216
1  V2  V1  V1  V1  V2  V1  V1  ... -0.092  1.098  1.393 -0.394  0.401  0.724  0.216
2  V2  V1  V1  V1  V2  V1  V1  ... -0.092  0.548 -0.782  0.050  0.401  0.018  0.216
3  V2  V1  V1  V1  V2  V1  V1  ... -0.092 -1.651 -1.325  0.050  0.401  0.018  0.216
4  V2  V1  V1  V2  V2  V1  V1  ... -0.092 -0.552  1.936 -0.394  0.401  0.018  0.216
5  V2  V1  V1  V2  V2  V1  V1  ... -0.092  0.182  0.034 -1.060  0.401  0.724  0.216
6  V1  V1  V1  V2  V2  V1  V1  ... -0.092  0.000  0.000  1.160  0.401  0.724  0.216
7  V2  V1  V1  V2  V2  V1  V1  ... -0.092  0.915  0.849 -0.394  0.401  0.724  0.216
8  V2  V1  V1  V2  V2  V1  V1  ... -0.092  0.915  0.849 -0.394  0.401 -0.605  0.216
9  V1  V1  V1  V1  V3  V1  V1  ... -0.092  0.182 -0.238  1.160  0.401  0.724  0.216

[10 rows x 33 columns]


In [None]:
# Predict the labels of the preprocessed test samples with the best estimator
# found by the grid search; the resulting array is the cell's output.
best_cls.predict(X_test_ready)

array([-1, -1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,
       -1,  1,  1,  1,  1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,
       -1,  1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
        1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1,
        1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,  1,
       -1,  1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1],
      dtype=int32)