# Classification
Classification is the task of identifying which category a new observation belongs to, on the basis of a training dataset. There are five datasets. For each dataset, we provide the training dataset, training labels, and test dataset. Please use the training dataset and training labels to build your classifier and predict the test labels. A class label is represented by an integer. For example, in the 1st dataset, there are 4 classes where 1 represents the 1st class, 2 represents the 2nd class, etc. Note that there are missing values in some of the datasets (a missing entry is filled with 1.00000000000000e+99); please fill the missing values before running your classification algorithm.

TrainData 1 contains 3312 features with 150 samples. Testdata1 contains 3312 features with 53 samples. There are 5 classes in this dataset.

TrainData 2 contains 9182 features with 100 samples. Testdata2 contains 9182 features with 74 samples. There are 11 classes in this dataset.

TrainData 3 contains 13  features with 6300 samples. Testdata3 contains 13 features with 2693 samples. There are 9 classes in this dataset.

TrainData 4 contains 112 features with 2547 samples. Testdata4 contains 112 features with 1092 samples. There are 9 classes in this dataset.

TrainData 5 contains 11 features with 1119 samples. Testdata5 contains 11 features with 480 samples. There are 6 classes in this dataset.

TrainData 6 contains 142 features with 612 samples. Testdata6 contains 142 features with 262 samples. This is not a classification problem. You are asked to predict the real value.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# K Nearest Neighbors Classification
from scipy.stats import mode

class K_Nearest_Neighbors_Classifier() :
    """Brute-force K-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only stores the training set; all the work happens in
    ``predict``, which labels each test row with the most frequent class
    among its ``K`` closest training rows. When several classes tie for the
    most votes, the smallest label wins.
    """

    def __init__( self, K ) :
        """Remember the number of neighbours ``K`` consulted per prediction."""
        self.K = K

    def fit( self, X_train, Y_train ) :
        """Store the training set.

        X_train is a 2-D array of shape (m, n); Y_train holds the m labels.
        Both are converted with ``np.asarray`` so array-likes are accepted.
        """
        self.X_train = np.asarray( X_train )
        self.Y_train = np.asarray( Y_train )
        self.m, self.n = self.X_train.shape

    def predict( self, X_test ) :
        """Return a float array with one predicted label per row of X_test."""
        self.X_test = np.asarray( X_test )
        self.m_test, self.n = self.X_test.shape
        Y_predict = np.zeros( self.m_test )
        for i in range( self.m_test ) :
            neighbors = self.find_neighbors( self.X_test[i] )
            # Majority vote. np.unique returns the labels sorted, and argmax
            # picks the first maximum, so ties resolve to the smallest label.
            # This replaces scipy's mode(...)[0][0], which breaks on SciPy
            # versions where mode returns scalars for 1-D input.
            labels, votes = np.unique( neighbors, return_counts = True )
            Y_predict[i] = labels[np.argmax( votes )]
        return Y_predict

    def find_neighbors( self, x ) :
        """Return the labels of the K training rows closest to ``x``."""
        # One vectorised pass over the whole training matrix instead of a
        # Python-level loop over its rows.
        distances = np.sqrt( np.sum( np.square( self.X_train - x ), axis = 1 ) )
        nearest = distances.argsort()
        return self.Y_train[nearest][:self.K]

    def euclidean( self, x, x_train ) :
        """Return the Euclidean distance between two feature vectors."""
        return np.sqrt( np.sum( np.square( x - x_train ) ) )

In [4]:
def get_accuracy(Y_test,Y_pred):
    """Return the percentage (0-100) of predictions that match the true labels.

    Y_test holds the true labels and Y_pred the predicted ones. Only the
    first ``len(Y_pred)`` entries of Y_test are compared.

    Raises:
        ValueError: if Y_pred is empty, since accuracy is undefined.
        IndexError: if Y_test has fewer entries than Y_pred.
    """
    total = np.size( Y_pred )
    if total == 0:
        raise ValueError("Y_pred is empty; accuracy is undefined")
    truth = np.asarray( Y_test )
    if truth.shape[0] < total:
        raise IndexError("Y_test has fewer entries than Y_pred")
    correctly_classified = np.count_nonzero( truth[:total] == np.asarray( Y_pred ) )
    return (float(correctly_classified) / total) * 100

In [5]:
from sklearn.model_selection import train_test_split
def data_split(X, Y, test_size = 0.2, random_state = 0):
    """Split features X and labels Y into train and hold-out parts.

    Thin wrapper around ``train_test_split``. The defaults (20% hold-out,
    seed 0) reproduce the previously hard-coded behaviour; pass other values
    to change the hold-out fraction or the seed.

    Returns:
        X_train, X_test, Y_train, Y_test, in that order.
    """
    return train_test_split( X, Y, test_size = test_size, random_state = random_state )

# Classification 1 

In [6]:
# Load the dataset-1 training features and their class labels.
train1 = np.loadtxt('TrainData1.txt')
train1_label = np.loadtxt('TrainLabel1.txt')

In [7]:
# Inspect the (samples, features) shape of the training matrix.
train1.shape

(150, 3312)

In [8]:
# Handling missing values by column-wise mean imputation.
# 1e+99 is the sentinel the dataset uses for a missing entry.
missing = train1 == 1e+99
observed = (~missing).sum(axis=0)
col_sum = np.where(missing, 0.0, train1).sum(axis=0)
# A column with no observed value has no mean; fall back to 0.0 rather than
# dividing by zero.
col_avg = np.divide(col_sum, observed, out=np.zeros(train1.shape[1]), where=observed > 0)
# Write each column's mean into that column's missing slots, in place.
train1[missing] = np.broadcast_to(col_avg, train1.shape)[missing]

In [9]:
# Baseline: custom 1-NN on the imputed but unscaled features, scored on the
# hold-out part produced by data_split.

X_train, X_test, Y_train, Y_test = data_split(train1, train1_label)
model = K_Nearest_Neighbors_Classifier( K = 1 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  93.33333333333333


In [10]:
# Testing model after re-encoding every feature column with OrdinalEncoder.
# NOTE(review): this is an ordinal re-coding of the values, not a scaling /
# normalization step; the encoder is also fitted on the full training matrix
# before the train/hold-out split. `enc` is reused by later cells.

from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
X_train_encoded = enc.fit_transform(train1)
X_train, X_test, Y_train, Y_test = data_split(X_train_encoded, train1_label)
model = K_Nearest_Neighbors_Classifier( K = 1 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  96.66666666666667


In [11]:
# scikit-learn KNN with default settings; used as the base estimator for the
# grid searches in later cells.
model1 = KNeighborsClassifier()

In [12]:
# Using Grid Search to find the best KNN model with optimal K and distance metric
# NOTE(review): model1 is built with default arguments, so 'minkowski' and
# 'euclidean' are presumably the same metric here - confirm and drop one to
# halve the grid.
from sklearn.model_selection import GridSearchCV
parameters = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21], 'metric':['minkowski', 'euclidean'], 'weights':['uniform', 'distance']}
grid = GridSearchCV(model1, parameters, scoring = 'accuracy')

# Search on the training part of the current split only.
grid.fit(X_train, Y_train)

accuracy = grid.best_score_
print(accuracy)

grid.best_params_

0.9583333333333334


{'metric': 'minkowski', 'n_neighbors': 3, 'weights': 'uniform'}

In [13]:
# Score the best grid-search estimator on the hold-out part of the split.
best = grid.best_estimator_
best.score(X_test, Y_test)

0.9

In [14]:
# Predict labels for the dataset-1 test file and write them out.
# NOTE(review): `enc` is refitted on the test matrix here (fit_transform), so
# the codes come from test-set values rather than from the training fit; the
# test matrix is also not checked for the 1e+99 missing-value sentinel.
# `model` is the custom 1-NN fitted above, not the grid-search `best`.
test1 = np.loadtxt('TestData1.txt')
test1 = enc.fit_transform(test1)
pred = model.predict(test1)
np.savetxt("GelliClassification1.txt", pred, fmt='%d')

# Classification 2 :

In [15]:
# Loading Dataset 2: training features and their class labels.

train2 = np.loadtxt('TrainData2.txt')
train2_label = np.loadtxt('TrainLabel2.txt')
print(train2.shape, train2_label.shape)

(100, 9182) (100,)


In [16]:
# Handling missing values by column-wise mean imputation.
# 1e+99 is the sentinel the dataset uses for a missing entry.
missing = train2 == 1e+99
observed = (~missing).sum(axis=0)
col_sum = np.where(missing, 0.0, train2).sum(axis=0)
# A column with no observed value has no mean; fall back to 0.0 rather than
# dividing by zero.
col_avg = np.divide(col_sum, observed, out=np.zeros(train2.shape[1]), where=observed > 0)
# Write each column's mean into that column's missing slots, in place.
train2[missing] = np.broadcast_to(col_avg, train2.shape)[missing]

In [17]:
# Baseline: custom 1-NN on the imputed but unscaled features, scored on the
# hold-out part produced by data_split.

X_train, X_test, Y_train, Y_test = data_split(train2, train2_label)
model = K_Nearest_Neighbors_Classifier( K = 1 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  95.0


In [18]:
# Testing model after min-max normalization of every feature column.
# NOTE(review): the scaler is fitted on the full training matrix before the
# split, so the hold-out rows influence the scaling. `scaler` is reused when
# the test file is transformed.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_encoded2 = scaler.fit_transform(train2)
X_train, X_test, Y_train, Y_test = data_split(X_train_encoded2, train2_label)
model = K_Nearest_Neighbors_Classifier( K = 1 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  100.0


In [19]:
# Grid search over K, metric and vote weighting on the scaled training split.
# NOTE(review): model1 uses default arguments, so 'minkowski' and 'euclidean'
# are presumably the same metric - confirm and drop one.
parameters = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21], 'metric':['minkowski', 'euclidean'], 'weights':['uniform', 'distance']}
grid = GridSearchCV(model1, parameters, scoring = 'accuracy')

grid.fit(X_train, Y_train)

accuracy = grid.best_score_
print(accuracy)

print(grid.best_params_)

# Score the best estimator on the hold-out part of the split.
best = grid.best_estimator_
best.score(X_test, Y_test)

0.85
{'metric': 'minkowski', 'n_neighbors': 1, 'weights': 'uniform'}


1.0

In [20]:
# Load the dataset-2 test features.
test2 = np.loadtxt('TestData2.txt')

In [21]:
# Fill any 1e+99 missing-value sentinels in the test matrix with the training
# column means. train2 was mean-imputed earlier, so its column means equal the
# means of the observed training values.
test2 = np.where(test2 == 1e+99, train2.mean(axis=0), test2)
# Reuse the scaler fitted on the training data. Refitting it on the test set
# would map test features onto a different scale than the one the model saw.
test2 = scaler.transform(test2)
pred = model.predict(test2)

np.savetxt("GelliClassification2.txt", pred, fmt='%d')

# Classification 3

In [22]:
# Load the dataset-3 training features and their class labels.
train3 = np.loadtxt('TrainData3.txt')
train3_label = np.loadtxt('TrainLabel3.txt')

In [23]:
# Handling missing values by column-wise mean imputation.
# 1e+99 is the sentinel the dataset uses for a missing entry.

missing = train3 == 1e+99
observed = (~missing).sum(axis=0)
col_sum = np.where(missing, 0.0, train3).sum(axis=0)
# A column with no observed value has no mean; fall back to 0.0 rather than
# dividing by zero.
col_avg = np.divide(col_sum, observed, out=np.zeros(train3.shape[1]), where=observed > 0)
# Write each column's mean into that column's missing slots, in place.
train3[missing] = np.broadcast_to(col_avg, train3.shape)[missing]
# Number of observed (non-missing) entries in the last column.
count = int(observed[-1])
print(train3.shape)
print(len(col_avg))
print(count)

(6300, 13)
13
6102


In [24]:
# Testing model without normalization: custom 21-NN on the imputed features.
from sklearn.utils import shuffle
# Seed the shuffle so the split, and therefore the reported accuracy, is
# reproducible between runs.
train3, train3_label = shuffle(train3, train3_label, random_state=0)
X_train, X_test, Y_train, Y_test = data_split(train3, train3_label)
model = K_Nearest_Neighbors_Classifier( K = 21 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  32.22222222222222


In [25]:
# Testing model after re-encoding every feature column with the shared
# OrdinalEncoder `enc` (refitted here on train3).
# NOTE(review): this is an ordinal re-coding, not a scaling step.

X_train_encoded3 = enc.fit_transform(train3)
X_train, X_test, Y_train, Y_test = data_split(X_train_encoded3, train3_label)
model = K_Nearest_Neighbors_Classifier( K = 21 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  30.476190476190478


In [26]:
# Grid search over K, metric and vote weighting on the encoded training split.
# NOTE(review): model1 uses default arguments, so 'minkowski' and 'euclidean'
# are presumably the same metric - confirm and drop one.
parameters = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21], 'metric':['minkowski', 'euclidean'], 'weights':['uniform', 'distance']}
grid = GridSearchCV(model1, parameters, scoring = 'accuracy')

grid.fit(X_train, Y_train)

accuracy = grid.best_score_
print(accuracy)

print(grid.best_params_)

# Score the best estimator on the hold-out part of the split.
best = grid.best_estimator_
best.score(X_test, Y_test)

0.3253968253968254
{'metric': 'minkowski', 'n_neighbors': 19, 'weights': 'uniform'}


0.30714285714285716

In [27]:
# Read the dataset-3 test file with pandas, treating every row as data
# (header=None), instead of np.loadtxt as used for the other datasets.
import pandas as pd
df = pd.read_csv('TestData3.txt', header=None)

In [28]:
# Convert the test frame to a NumPy array and inspect its shape.
test3 = df.to_numpy()
test3.shape

(2693, 13)

In [29]:
# Encode the *test* matrix. The previous version encoded train3 here, so the
# saved file held predictions for the training rows instead of the test rows.
# NOTE(review): as for the other datasets, `enc` is refitted on the test data,
# so the codes do not come from the training fit.
test3 = enc.fit_transform(test3)
pred = model.predict(test3)

np.savetxt("GelliClassification3.txt", pred, fmt='%d')

# Classification 4

In [30]:
# Load the dataset-4 training features and their class labels.
train4 = np.loadtxt('TrainData4.txt')
train4_label = np.loadtxt('TrainLabel4.txt')

In [31]:
# Handling missing values by column-wise mean imputation.
# 1e+99 is the sentinel the dataset uses for a missing entry.

missing = train4 == 1e+99
observed = (~missing).sum(axis=0)
col_sum = np.where(missing, 0.0, train4).sum(axis=0)
# A column with no observed value has no mean; fall back to 0.0 rather than
# dividing by zero.
col_avg = np.divide(col_sum, observed, out=np.zeros(train4.shape[1]), where=observed > 0)
# Write each column's mean into that column's missing slots, in place.
train4[missing] = np.broadcast_to(col_avg, train4.shape)[missing]
# Number of observed (non-missing) entries in the last column.
count = int(observed[-1])
print(train4.shape)
print(len(col_avg))
print(count)

(2547, 112)
112
2547


In [32]:
# Testing model without normalization: custom 5-NN on the imputed features.

# Seed the shuffle so the split, and therefore the reported accuracy, is
# reproducible between runs.
train4, train4_label = shuffle(train4, train4_label, random_state=0)
X_train, X_test, Y_train, Y_test = data_split(train4, train4_label)
model = K_Nearest_Neighbors_Classifier( K = 5 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  73.92156862745098


In [33]:
# Testing model after re-encoding every feature column with the shared
# OrdinalEncoder `enc` (refitted here on train4); uses K = 3.
# NOTE(review): this is an ordinal re-coding, not a scaling step.

X_train_encoded4 = enc.fit_transform(train4)
X_train, X_test, Y_train, Y_test = data_split(X_train_encoded4, train4_label)
model = K_Nearest_Neighbors_Classifier( K = 3 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  83.72549019607844


In [34]:
# Grid search over K, metric and vote weighting on the encoded training split.
# NOTE(review): model1 uses default arguments, so 'minkowski' and 'euclidean'
# are presumably the same metric - confirm and drop one.
parameters = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21], 'metric':['minkowski', 'euclidean'], 'weights':['uniform', 'distance']}
grid = GridSearchCV(model1, parameters, scoring = 'accuracy')

grid.fit(X_train, Y_train)

accuracy = grid.best_score_
print(accuracy)

print(grid.best_params_)

# Score the best estimator on the hold-out part of the split.
best = grid.best_estimator_
best.score(X_test, Y_test)

0.8384761767114709
{'metric': 'minkowski', 'n_neighbors': 3, 'weights': 'distance'}


0.8470588235294118

In [35]:
# Load the dataset-4 test features.
test4 = np.loadtxt('TestData4.txt')

In [36]:
# Predict labels for the dataset-4 test file and write them out.
# NOTE(review): `enc` is refitted on the test matrix (fit_transform), so the
# codes come from test-set values rather than from the training fit; the test
# matrix is also not checked for the 1e+99 missing-value sentinel.
test4 = enc.fit_transform(test4)
pred = model.predict(test4)
np.savetxt("GelliClassification4.txt", pred, fmt='%d')

# Classification 5

In [51]:
# Loading Dataset 5: training features and their class labels.

train5 = np.loadtxt('TrainData5.txt')
train5_label = np.loadtxt('TrainLabel5.txt')
# Seed the shuffle so the later split, and the accuracies reported for it, are
# reproducible between runs.
train5, train5_label = shuffle(train5, train5_label, random_state=0)

In [52]:
# Handling missing values by column-wise mean imputation.
# 1e+99 is the sentinel the dataset uses for a missing entry.

missing = train5 == 1e+99
observed = (~missing).sum(axis=0)
col_sum = np.where(missing, 0.0, train5).sum(axis=0)
# A column with no observed value has no mean; fall back to 0.0 rather than
# dividing by zero.
col_avg = np.divide(col_sum, observed, out=np.zeros(train5.shape[1]), where=observed > 0)
# Write each column's mean into that column's missing slots, in place.
train5[missing] = np.broadcast_to(col_avg, train5.shape)[missing]
# Number of observed (non-missing) entries in the last column.
count = int(observed[-1])
print(train5.shape)
print(len(col_avg))
print(count)

(1119, 11)
11
1119


In [53]:
# Baseline: custom 1-NN on the imputed but unscaled features, scored on the
# hold-out part produced by data_split.

X_train, X_test, Y_train, Y_test = data_split(train5, train5_label)
model = K_Nearest_Neighbors_Classifier( K = 1 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  47.767857142857146


In [54]:
# Testing model after re-encoding every feature column with the shared
# OrdinalEncoder `enc` (refitted here on train5).
# NOTE(review): this is an ordinal re-coding, not a scaling step.

X_train_encoded5 = enc.fit_transform(train5)
X_train, X_test, Y_train, Y_test = data_split(X_train_encoded5, train5_label)
model = K_Nearest_Neighbors_Classifier( K = 1 )
model.fit( X_train, Y_train )
Y_pred = model.predict( X_test )
print("Accuracy is : ", get_accuracy(Y_test, Y_pred))

Accuracy is :  61.60714285714286


In [55]:
# Grid search over K and metric on the encoded training split. Unlike the
# earlier grids, only 'uniform' vote weighting is tried here.
# NOTE(review): model1 uses default arguments, so 'minkowski' and 'euclidean'
# are presumably the same metric - confirm and drop one.
parameters = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21], 'metric':['minkowski', 'euclidean'], 'weights':['uniform']}
grid = GridSearchCV(model1, parameters, scoring = 'accuracy')
grid.fit(X_train, Y_train)
accuracy = grid.best_score_
print(accuracy)
print(grid.best_params_)

# Score the best estimator on the hold-out part of the split.
best = grid.best_estimator_
best.score(X_test, Y_test)

0.5787709497206703
{'metric': 'minkowski', 'n_neighbors': 1, 'weights': 'uniform'}


0.6160714285714286

In [56]:
# Load the dataset-5 test features.
test5 = np.loadtxt('TestData5.txt')

In [57]:
# Predict labels for the dataset-5 test file and write them out.
# NOTE(review): `enc` is refitted on the test matrix (fit_transform), so the
# codes come from test-set values rather than from the training fit; the test
# matrix is also not checked for the 1e+99 missing-value sentinel.
test5 = enc.fit_transform(test5)
pred = model.predict(test5)
np.savetxt("GelliClassification5.txt", pred, fmt='%d')

# Classification 6 (Regression)

In [44]:
# Load the dataset-6 (regression) training features and real-valued targets.
# NOTE(review): no missing-value imputation is applied to this dataset -
# confirm that it contains no 1e+99 sentinels.
train6 = np.loadtxt('TrainData6.txt')
train6_label = np.loadtxt('TrainLabel6.txt')
# Ordinal encoding was tried here and left disabled:
#X_train_encoded6 = enc.fit_transform(train6)

In [45]:
# Building a Regression Model: 3-nearest-neighbour regressor.
# Fit on the first 500 rows and evaluate on the remaining rows.
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(train6[:500], train6_label[:500])
# y_pred is computed but not used below; score() predicts again itself.
y_pred = neigh.predict(train6[500:])
neigh.score(train6[500:], train6_label[500:])

0.17753144035150115

In [46]:
# Building a Regression Model: random forest limited to depth-2 trees.
# Fit on the first 500 rows and score on the remaining rows.

from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(train6[:500], train6_label[:500])

regr.score(train6[500:], train6_label[500:])

0.3014139366273568

In [47]:
# Building a Regression Model: gradient boosting with 100 depth-1 trees.
# Fit on the first 500 rows and score on the remaining rows.

from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0).fit(train6[:500], train6_label[:500])

est.score(train6[500:], train6_label[500:])

0.3389989738759125

In [48]:
# Building a Regression Model: ridge regression, with the penalty strength
# chosen by RidgeCV from the listed alphas.
# Fit on the first 500 rows and score on the remaining rows. `clf` is the model
# used for the final test-set predictions.

from sklearn.linear_model import RidgeCV
clf = RidgeCV(alphas=[1e-1, 1, 10, 100, 1000, 10000]).fit(train6[:500], train6_label[:500])
clf.score(train6[500:], train6_label[500:])

0.46701715715238457

In [49]:
# Load the dataset-6 test features.
test6 = np.loadtxt('TestData6.txt')

In [50]:
# Predict the real-valued targets with the ridge model.
pred = clf.predict(test6)
# Dataset 6 is a regression task, so write the predictions as floats; the
# integer format '%d' would truncate every value to its whole-number part.
np.savetxt("GelliClassification6.txt", pred, fmt='%.6f')