# Machine Learning Report

In [25]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

import warnings

## Contents
- Loading the data
- Experimenting with the data
    - PCA
    - Regularisation
        - Using Preprocessing
        - Using MinMax Scaling
- Deep Learning Approach
- Classical Learning Approaches
    - Supervised Learning
        - K-Nearest Neighbours
        - Support Vector Machine
    - Unsupervised Learning
        - K-Means

## Loading the data

In [2]:
atlas_labels = pd.read_csv("ATLAS-labels.csv", header=None)
atlas_data = pd.read_csv("ATLAS-data.csv", header=None)

In [3]:
atlas_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,243.128,96.455,158.618,90.253,0.693,56.238,1.581,3.615,13.597,264.586,...,2.121,376.41,3.0,46.248,-0.958,-0.444,42.759,-1.651,-1.445,157.638
1,139.751,74.028,111.619,44.266,2.119,104.723,-0.498,2.533,9.437,168.638,...,0.447,252.698,2.0,36.798,0.269,-1.254,31.816,-1.849,0.664,68.614
2,116.217,73.136,59.239,101.85,0.625,85.539,2.979,1.922,28.049,202.636,...,1.816,271.485,2.0,82.453,2.066,-1.21,58.216,1.442,-2.288,140.669
3,64.544,0.099,28.945,341.418,2.39,358.936,-1.181,0.714,35.293,508.349,...,-2.409,470.806,3.0,331.152,-1.693,0.615,41.742,0.697,1.083,409.977
4,32.281,67.393,26.006,29.088,1.33,108.191,-0.362,0.846,48.389,152.087,...,-1.421,244.22,2.0,48.198,0.949,1.517,40.382,-0.382,-2.029,88.58


### Experimenting with the Data

#### Principal Component Analysis

In [23]:
scaler = StandardScaler()
scaler.fit(atlas_data)
atlas_processed = scaler.transform(atlas_data)

In [25]:
pca = PCA(.95)
pca.fit(atlas_processed)
atlas_processed = pca.transform(atlas_processed)
atlas_processed.shape

(10000, 17)

#### Reguralization

##### Using preprocessing

In [41]:
atlas_processed = preprocessing.scale(atlas_data)

##### Using MinMax Scaling

In [4]:
scaling = MinMaxScaler(feature_range=(-1,1)).fit(atlas_data)
atlas_processed = scaling.transform(atlas_data)

## Deep Learning Approach

## Classical Approaches

### Supervised

#### K-Nearest Neigbours

In [5]:
KNN_Data = atlas_processed
KNN_Labels = atlas_labels.to_numpy()

In [6]:
knn_train_values, knn_test_values, knn_train_labels, knn_test_labels = train_test_split(KNN_Data, KNN_Labels, test_size=0.2)

In [7]:
search_max_val = 10

In [8]:
warnings.simplefilter('ignore')
knn_model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree')
grid = {"n_neighbors":range(1,search_max_val)}
knn_model_cv = GridSearchCV(knn_model, grid, cv=10, iid=False, n_jobs=-1)
knn_model_cv.fit(KNN_Data, KNN_Labels)
knn_model_cv.best_params_

{'n_neighbors': 8}

In [9]:
knn_model_final = KNeighborsClassifier(n_neighbors=knn_model_cv.best_params_['n_neighbors'], algorithm='ball_tree')
knn_model_final.fit(knn_train_values, knn_train_labels)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                     weights='uniform')

In [10]:
knn_score = knn_model_final.score(knn_test_values, knn_test_labels)

In [11]:
print("Score for KNN is ",knn_score)

Score for KNN is  0.6815


#### Support Vector Machine - SVM

In [12]:
SVM_Data = atlas_processed
SVM_Labels = atlas_labels.to_numpy()

In [13]:
svm_train_values, svm_test_values, svm_train_labels, svm_test_labels = train_test_split(SVM_Data, SVM_Labels, test_size=0.2)

In [14]:
search_max_val = 10

In [15]:
svm_model_lin = SVC(kernel="linear")


In [16]:
grid = {"C":range(1,search_max_val)}

svm_model_cv = GridSearchCV(svm_model_lin, grid, cv=10, iid=False, n_jobs=-1)
svm_model_cv.fit(svm_train_values, svm_train_labels)
svm_model_cv.best_params_

{'C': 5}

In [17]:
svm_model_lin_final = SVC(C=svm_model_cv.best_params_['C'],kernel="linear")
svm_model_lin_final.fit(svm_train_values, svm_train_labels)
svm_lin_score = svm_model_lin_final.score(svm_test_values, svm_test_labels)
print("Score for Linear SVM is ",svm_lin_score)

Score for Linear SVM is  0.733


In [18]:
svm_model_rbf = SVC(kernel="rbf")

In [19]:
grid = {"C":range(1,search_max_val)}

svm_model_cv = GridSearchCV(svm_model_rbf, grid, cv=10, iid=False)
svm_model_cv.fit(svm_train_values, svm_train_labels)
svm_model_cv.best_params_

{'C': 9}

In [21]:
svm_model_rbf_final = SVC(C=svm_model_cv.best_params_['C'],kernel="linear")
svm_model_rbf_final.fit(svm_train_values, svm_train_labels)
svm_rbf_score = svm_model_rbf_final.score(svm_test_values, svm_test_labels)
print("Score for RBF SVM is ",svm_rbf_score)

Score for RBF SVM is  0.733


### Unsupervised

#### K-Means Clustering

In [22]:
KMEANS_Data = atlas_processed
KMEAN_Labels = atlas_labels.to_numpy()

In [23]:
numClusters = 2
iterations = 100

In [26]:
kmeans_model = KMeans(numClusters, max_iter=iterations)

In [27]:
kmeans_model.fit(KMEANS_Data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [44]:
cpy = atlas_labels.to_numpy()
counter = 0
for i in range(0, len(kmeans_model.labels_)):
    if(kmeans_model.labels_[i] == cpy[i]):
        counter += 1
score = counter / len(kmeans_model.labels_)
print(score)

0.5771
