<a href="https://colab.research.google.com/github/Sujit27/dummy/blob/main/train_and_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This notebook fits classifiers on an 80% training split of the data and evaluates them on the held-out 20% validation split ##

In [10]:
import numpy
import pandas as pd
import sklearn
import os

In [11]:
# Mount Google Drive at /content/drive so the CSV files stored there can be read
# by the evaluation cells further down.
# NOTE(review): those cells use the relative path 'drive/MyDrive/...', which
# presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import seaborn as sn
from numpy import genfromtxt

In [13]:
def evaluate(input_csv_file,label_csv_file,method='logistic_regression'):
  '''
  Fit a classifier on a train split of the CSV data and score it on the rest.

  The feature matrix and the label vector are read from comma-delimited CSV
  files and split 80/20 into train and test parts (random_state=0, so the
  split is the same on every run). The selected model is fit on the train
  part and its predictions on the test part are scored.

  Parameters
  ----------
  input_csv_file : path of the comma-delimited CSV holding the features (X).
  label_csv_file : path of the comma-delimited CSV holding the labels (y).
  method : 'logistic_regression' (default) or 'random_forest'.

  Returns
  -------
  (conf_mat, score) : confusion matrix for the test split and the F1-score
  computed by f1_score with its default arguments.

  Raises
  ------
  ValueError : if `method` is not one of the supported names.
  '''
  # Choose the model before touching the files so an unsupported method fails fast.
  if method == 'logistic_regression':
    # NOTE(review): the large max_iter is presumably there to let the solver
    # converge on unscaled features — confirm.
    model = LogisticRegression(max_iter=10000)
  elif method == 'random_forest':
    model = RandomForestClassifier(max_depth=2, random_state=0)
  else:
    # The old behaviour (print + return 0) made callers that unpack
    # `conf_mat, score = evaluate(...)` fail with an unrelated TypeError.
    raise ValueError(
      "method not defined: %r (expected 'logistic_regression' or 'random_forest')" % (method,)
    )

  X = genfromtxt(input_csv_file, delimiter=',')
  Y = genfromtxt(label_csv_file, delimiter=',')

  # Fixed seed keeps the train/test partition identical across runs and methods.
  X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=0)

  model.fit(X_train,y_train)
  y_pred=model.predict(X_test)

  conf_mat = confusion_matrix(y_test,y_pred)
  score = f1_score(y_test,y_pred)

  return conf_mat, score

### Evaluate models on raw and preprocessed training data ###

In [14]:
# Default method (logistic regression) on the features stored in x_raw.csv.
input_csv_file, label_csv_file = (
  'drive/MyDrive/arya.ai-assignment/x_raw.csv',
  'drive/MyDrive/arya.ai-assignment/y_raw.csv',
)
conf_mat, score = evaluate(input_csv_file, label_csv_file)
# Confusion matrix on the first line(s), F1-score on the last.
print(conf_mat, score, sep='\n')

[[469  25]
 [ 27 261]]
0.9094076655052266


In [15]:
# Default method (logistic regression) on the features stored in x_normalized.csv;
# labels are still read from y_raw.csv.
input_csv_file, label_csv_file = (
  'drive/MyDrive/arya.ai-assignment/x_normalized.csv',
  'drive/MyDrive/arya.ai-assignment/y_raw.csv',
)
conf_mat, score = evaluate(input_csv_file, label_csv_file)
# Confusion matrix on the first line(s), F1-score on the last.
print(conf_mat, score, sep='\n')

[[468  26]
 [ 51 237]]
0.8602540834845734


In [16]:
# Default method (logistic regression) on the features stored in x_standardized.csv;
# labels are still read from y_raw.csv.
input_csv_file, label_csv_file = (
  'drive/MyDrive/arya.ai-assignment/x_standardized.csv',
  'drive/MyDrive/arya.ai-assignment/y_raw.csv',
)
conf_mat, score = evaluate(input_csv_file, label_csv_file)
# Confusion matrix on the first line(s), F1-score on the last.
print(conf_mat, score, sep='\n')

[[470  24]
 [ 29 259]]
0.9071803852889666


In [17]:
# Default method (logistic regression) on the features stored in x_selected.csv;
# labels are still read from y_raw.csv.
input_csv_file, label_csv_file = (
  'drive/MyDrive/arya.ai-assignment/x_selected.csv',
  'drive/MyDrive/arya.ai-assignment/y_raw.csv',
)
conf_mat, score = evaluate(input_csv_file, label_csv_file)
# Confusion matrix on the first line(s), F1-score on the last.
print(conf_mat, score, sep='\n')

[[468  26]
 [ 38 250]]
0.8865248226950354


In [19]:
# Default method (logistic regression) on the features stored in x_pca.csv;
# labels are still read from y_raw.csv.
input_csv_file, label_csv_file = (
  'drive/MyDrive/arya.ai-assignment/x_pca.csv',
  'drive/MyDrive/arya.ai-assignment/y_raw.csv',
)
conf_mat, score = evaluate(input_csv_file, label_csv_file)
# Confusion matrix on the first line(s), F1-score on the last.
print(conf_mat, score, sep='\n')

[[449  45]
 [144 144]]
0.6037735849056604


In [18]:
# Random forest on the features stored in x_raw.csv, for comparison with the
# logistic-regression run on the same file.
input_csv_file, label_csv_file = (
  'drive/MyDrive/arya.ai-assignment/x_raw.csv',
  'drive/MyDrive/arya.ai-assignment/y_raw.csv',
)
conf_mat, score = evaluate(input_csv_file, label_csv_file, method='random_forest')
# Confusion matrix on the first line(s), F1-score on the last.
print(conf_mat, score, sep='\n')

[[478  16]
 [ 70 218]]
0.8352490421455939
