In [1]:
import pandas as pd
import warnings
# Silence every FutureWarning for the rest of the notebook. NOTE(review): this is global, so it may
# also hide deprecation notices from the libraries used below - confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Training split: remove the 'Unnamed: 0' column (presumably a saved row index - confirm),
# then discard rows with missing values.
training = pd.read_csv("data/train_set.csv")
training.drop(columns="Unnamed: 0", inplace=True)
training.dropna(inplace=True)

# Test split: discard rows with missing values first, then remove the same column.
test = pd.read_csv("data/test_set.csv")
test.dropna(inplace=True)
test.drop(columns="Unnamed: 0", inplace=True)

In [2]:
# One [column name, distinct values] pair per column, so every categorical level can be inspected.
vals = [[col, list(training[col].unique())] for col in training.columns]
print(vals)

[['buying', ['low', 'vhigh', 'med', 'high']], ['maint', ['med', 'vhigh', 'low', 'high']], ['doors', ['3', '2', '5more', '4']], ['persons', ['4', '2', 'more']], ['lug_boot', ['small', 'med', 'big']], ['safety', ['high', 'low', 'med']], ['rating', ['good', 'unacc', 'vgood', 'acc']]]


In [3]:
# Ordinal code for every categorical level; the two lists are aligned by position.
categoryLabels = ['low','small','unacc','med','acc','big','high','good','vgood','vhigh','more','5more']
categoryCodes = [0,0,0,1,1,2,2,2,3,3,5,5]
subs = [categoryLabels, categoryCodes]

# Apply the same encoding to both splits, training first.
for frame in (training, test):
    frame.replace(subs[0], subs[1], inplace=True)

# Levels that were already digit strings (e.g. '2', '3', '4') are untouched by the
# replacement, so every column is cast to a numeric dtype afterwards.
for col in training.columns:
    for frame in (training, test):
        frame[col] = pd.to_numeric(frame[col])

# Task 1
First search: coarse grid over the random-forest hyperparameters

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from math import sqrt
NFOLDS = 5          # number of cross-validation folds used by the grid searches
RANDOM_SEED = 0     # fixed seed so the random forest, and therefore the search, is repeatable
predictorCols = ['buying','maint','doors','persons','lug_boot','safety']
X = training[predictorCols]
y = training['rating']

# Coarse grid: squared tree counts (9, 36, 81, 144, 225), depths None/3/6/9/12/15,
# and every possible number of features considered per split.
nTrees = [n**2 for n in range(3,16,3)]
maxDepth = [None] + list(range(3,16,3))
maxFeaturesSplit = list(range(1,len(X.columns)+1))

searchParametersCoarse = {'n_estimators': nTrees,'max_depth':maxDepth,'max_features':maxFeaturesSplit}

# Keyword arguments reused by the grid searches in this notebook.
# 'cv' reads NFOLDS so the fold count is configured in exactly one place.
gridSearchKwargs = {'cv':NFOLDS,'verbose':2,'n_jobs':-1,'scoring':'accuracy','return_train_score':True}

# Seed the forest: without random_state each fit draws different bootstrap samples and
# feature subsets, so the selected "best" parameters could change between runs.
rfc = RandomForestClassifier(random_state=RANDOM_SEED)
model = GridSearchCV(rfc,searchParametersCoarse,**gridSearchKwargs)
model = model.fit(X,y)

# One row per parameter combination, including mean train/test accuracy.
accuracies = pd.DataFrame(model.cv_results_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:   56.7s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  1.4min finished


Generating finer parameters

In [5]:
bestVals = model.best_params_
bestDepth = bestVals['max_depth']
# The coarse tree counts are perfect squares, so work with their integer root.
bestNTrees = int(sqrt(bestVals['n_estimators']))

# Explore two steps either side of the coarse optimum.
offsets = range(-2, 3)

# An unlimited depth has no neighbours to explore; otherwise clamp depths at 1.
maxDepth = [None] if bestDepth is None else [max(bestDepth + step, 1) for step in offsets]
nTrees = [(bestNTrees + step)**2 for step in offsets]
searchParametersFine = dict(n_estimators=nTrees, max_depth=maxDepth, max_features=maxFeaturesSplit)

Second search: fine grid around the best coarse parameters

In [6]:
# Run the fine random-forest search over searchParametersFine.
model = GridSearchCV(rfc,searchParametersFine,**gridSearchKwargs)
model = model.fit(X,y)

# Stack the fine-search results under the results already held in `accuracies`.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments it yields the same frame.
accuracies = pd.concat([accuracies, pd.DataFrame(model.cv_results_)])
relevantColumns = ['param_max_depth','param_max_features','param_n_estimators','mean_test_score','mean_train_score']

# Keep only the parameter and score columns; a max_depth of None is recoded as 0
# (presumably so the column can be handled as numeric - confirm).
accuracies = accuracies[relevantColumns].replace([None],[0])

Fitting 5 folds for each of 150 candidates, totalling 750 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 713 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 727 out of 750 | elapsed:   13.6s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:   13.9s finished


# Task 2

In [7]:
from sklearn.svm import LinearSVC
import numpy as np
# Candidate hyperparameters: 15 log-spaced C values from 1e-2 to 1e3 and both penalties.
Cs = np.logspace(-2, 3, num=15)
penalties = ['l1','l2']
losses = ['hinge','squared_hinge']      # defined but not part of the grid below

linearSVC = LinearSVC(dual=False)
searchParametersCoarse = dict(C=Cs, penalty=penalties)

# GridSearchCV.fit returns the fitted search object, so construction and fitting can be chained.
model = GridSearchCV(linearSVC, searchParametersCoarse, **gridSearchKwargs).fit(X, y)
accuracies = pd.DataFrame(model.cv_results_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    8.0s finished


In [8]:
from math import log

bestVals = model.best_params_
# Base-10 exponent of the best C found so far.
bestC = log(bestVals['C'],10)

# Fifteen log-spaced candidates spanning half a decade either side of that exponent.
halfDecade = 0.5
Cs = np.logspace(bestC - halfDecade, bestC + halfDecade, num=15)

searchParametersFine = dict(C=Cs, penalty=penalties)

In [9]:
# Run the fine LinearSVC search over searchParametersFine.
model = GridSearchCV(linearSVC,searchParametersFine,**gridSearchKwargs)
model = model.fit(X,y)

# Stack the fine-search results under the results already held in `accuracies`.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments it yields the same frame.
accuracies = pd.concat([accuracies, pd.DataFrame(model.cv_results_)])

# Keep only the searched parameters and the mean train/test accuracy.
relevantColumns = ['param_C','param_penalty','mean_test_score','mean_train_score']
accuracies = accuracies[relevantColumns]

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    7.8s finished


In [10]:
# Display the parameter combination selected by the most recently fitted grid search.
model.best_params_

{'C': 0.22758459260747887, 'penalty': 'l2'}

# Task 3

In [21]:
from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on the training predictors; fit() returns the scaler itself,
# so creation and fitting are chained.
nnScaler = MinMaxScaler().fit(X)

In [43]:
import torch
import torch.nn as nn

# True when a CUDA device is usable; the dataset and network are moved to the GPU only in that case.
useCuda = torch.cuda.is_available()

nPredictors = 6        # input width of the first linear layer
nRatings = 4           # output width of the last linear layer (one logit per class)
nEpochs = 120          # intended number of passes over the training data
batchSize = 64         # mini-batch size handed to the DataLoader
learningRate = 1e-2    # learning rate handed to the SGD optimiser

class NNShallow(nn.Module):
    """Fully connected classifier with two equally sized hidden ReLU layers.

    Parameters
    ----------
    nInputs : int, optional
        Number of input features. When omitted, the notebook-level
        ``nPredictors`` is used.
    nHidden : int, optional
        Width of each of the two hidden layers (default 200).
    nOutputs : int, optional
        Number of output logits. When omitted, the notebook-level
        ``nRatings`` is used.

    Calling ``NNShallow()`` with no arguments builds the same architecture as
    before: ``nPredictors -> 200 -> 200 -> nRatings``.
    """

    def __init__(self, nInputs=None, nHidden=200, nOutputs=None):
        super().__init__()
        # Read the notebook globals only when a size is not supplied, so the
        # class can also be built (and tested) with explicit dimensions.
        if nInputs is None:
            nInputs = nPredictors
        if nOutputs is None:
            nOutputs = nRatings

        # Attribute names are kept so existing state_dict keys stay valid.
        self.layer1 = nn.Linear(nInputs, nHidden)
        self.layer2 = nn.Linear(nHidden, nHidden)
        self.layer3 = nn.Linear(nHidden, nOutputs)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw (un-normalised) output of the last linear layer for ``x``."""
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        out = self.relu(out)
        out = self.layer3(out)
        return out

    def reset(self):
        """Re-initialise the parameters of every ``nn.Linear`` sub-module in place."""
        def resetWeight(m):
            if isinstance(m, nn.Linear):
                m.reset_parameters()
        # Module.apply visits every sub-module (and the module itself).
        self.apply(resetWeight)

class NNDataset(torch.utils.data.Dataset):
    """Tensor-backed dataset of scaled predictors and integer class labels.

    ``X`` is rescaled with the notebook-level ``nnScaler`` and stored as a
    float tensor; ``y`` is stored as a long tensor. Both tensors are moved to
    the GPU when ``useCuda`` is true.
    """

    def __init__(self,X,y):
        labels = np.array(y, dtype=int)
        scaled = nnScaler.transform(X)

        features = torch.from_numpy(scaled).float()
        targets = torch.from_numpy(labels).long()
        if useCuda:
            features = features.cuda()
            targets = targets.cuda()

        self.X = features
        self.y = targets

    def __len__(self):
        """Number of samples, taken from the first dimension of the label tensor."""
        return self.y.size(0)

    def __getitem__(self,index):
        """Return the ``(predictors, label)`` pair stored at ``index``."""
        sample = self.X[index]
        label = self.y[index]
        return sample, label


In [44]:
# Loss, data and model used by the training loop.
criterion = nn.CrossEntropyLoss()
nnDataset = NNDataset(X,y)
network = NNShallow()
if useCuda:
    network.cuda()


def getOptLoad(lr,bSize):
    """Build a fresh SGD optimiser for ``network`` and a shuffling loader over ``nnDataset``.

    lr    -- learning rate given to the optimiser
    bSize -- mini-batch size given to the loader

    Returns the pair ``(optimiser, loader)``.
    """
    sgd = torch.optim.SGD(network.parameters(), lr=lr)
    batches = torch.utils.data.DataLoader(dataset=nnDataset, batch_size=bSize, shuffle=True)
    return sgd, batches


optimiser, loader = getOptLoad(learningRate,batchSize)

In [25]:
# Train `network` with mini-batch SGD for nEpochs passes over `loader`.
# The loop previously read `num_epochs`, which is not defined anywhere in the
# notebook (the constant is `nEpochs`), so it failed on a fresh kernel.
for epoch in range(nEpochs):
    for predictors, ratings in loader:

        optimiser.zero_grad()                             # Clear the gradients left over from the previous step
        outputs = network(predictors)                     # Forward pass: class scores for this batch of predictors
        loss = criterion(outputs, ratings)                # Cross-entropy between the scores and the true ratings
        loss.backward()                                   # Backward pass: compute gradients of the loss w.r.t. the parameters
        optimiser.step()                                  # Update the parameters using those gradients

    # Overwrite a single progress line every fifth epoch.
    if (epoch+1) % 5 == 0:
        print('\rEpoch [%d/%d]'%(epoch+1, nEpochs),end='')


Epoch [120/120]

In [45]:
# Re-initialise the parameters of every nn.Linear layer in the network, discarding the current weights.
network.reset()