In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Read the data into a DataFrame. Column names are supplied explicitly (d1 .. d22),
# so pandas treats every line of the file, including the first, as data.
column_names = ['d{}'.format(i) for i in range(1, 23)]
df = pd.read_csv("waveform.data", sep=",", names=column_names)

In [4]:
# Split the data into features and target.
# X: every column except the last one (d1 .. d21); Y: the last column (d22, position 21).
feature_columns = df.columns[:-1]
X = df[feature_columns].values
Y = df['d22'].values

In [None]:
# Tune the hyper-parameter k of the k-NN classifier with a cross-validated grid search.
from sklearn.model_selection import GridSearchCV

# Estimator whose n_neighbors is being tuned.
knn2 = KNeighborsClassifier()

# Candidate values for n_neighbors: np.arange(1, 100) yields k = 1, 2, ..., 99
# (the upper bound is excluded).
param_grid = dict(n_neighbors=np.arange(1, 100))

# 5-fold cross-validation for every candidate; training-fold scores are recorded too.
# fit() returns the search object itself, so construction and fitting can be chained.
knn_gscv = GridSearchCV(
    estimator=knn2,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
).fit(X, Y)

print("The Best K Value is", knn_gscv.best_params_, " With a score of ", knn_gscv.best_score_)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np


x = np.linspace(0, 100, 99)
y = knn_gscv.cv_results_['mean_test_score']

plt.axvline(x=58)
plt.axhline(y=0.8586)
plt.plot(x, y, '-ok', color='black');


In [None]:
# Display the parameter setting (n_neighbors) selected by the grid search.
knn_gscv.best_params_

In [None]:
# Display the mean cross-validated test score of every candidate in the grid.
knn_gscv.cv_results_['mean_test_score']

In [None]:
# Second step - analysis of the bias-variance trade-off: start with a very small
# training set (e.g. 100 waves) and a large k, and study the gap to the Bayes error.
# Each iteration enlarges the training set by 100 rows and lowers k by 2.
result = []
k = 100

# sample() and drop() return new frames and never modify their source,
# so a single working copy is enough for all iterations.
data = df.copy()

for i in range(1, 50):
    # Training rows: 100*i rows drawn with a fixed seed; every other row is test data.
    data_train = data.sample(n=100 * i, random_state=1)
    data_test = data.drop(data_train.index)

    # Features are all columns but the last; the target is column position 21.
    data_train_x, data_train_y = data_train.iloc[:, :-1].values, data_train.iloc[:, 21].values
    data_test_x, data_test_y = data_test.iloc[:, :-1].values, data_test.iloc[:, 21].values

    # Standardise with statistics learned from the training rows only.
    scaler = StandardScaler().fit(data_train_x)
    X_train = scaler.transform(data_train_x)
    X_test = scaler.transform(data_test_x)

    # k-NN with the current k, scored by accuracy on the held-out rows.
    classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, data_train_y)
    y_pred = classifier.predict(X_test)
    result.append(accuracy_score(data_test_y, y_pred))

    k -= 2
print(result)
len(result)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np


x = np.linspace(4, 100, 49)
y = result

plt.plot(x, y, '-ok', color='black');

In [None]:
# Third - Reduce the complexity with the CNN and RNN algorithms. Compare with the original dataset.

# RNN, initialisation: randomly divide the data set into two halves.
RNN_data = df.copy()
RNN_Set1 = RNN_data.sample(frac=0.5, random_state=0)

# Set2 is everything not drawn into Set1. It is selected through Set1's ORIGINAL
# row labels, so it has to be computed before Set1 is re-indexed.
RNN_Set2 = RNN_data.drop(RNN_Set1.index).reset_index(drop=True)
RNN_Set1 = RNN_Set1.reset_index(drop=True)

# Features (all columns but the last) and target (column position 21) of each half.
S1_X, S1_Y = RNN_Set1.iloc[:, :-1].values, RNN_Set1.iloc[:, 21].values
S2_X, S2_Y = RNN_Set2.iloc[:, :-1].values, RNN_Set2.iloc[:, 21].values

In [None]:
# Single 1-NN model shared by FitS1 and FitS2; every call refits it.
classifier1 = KNeighborsClassifier(n_neighbors=1)


def _fit_and_predict(train_X, train_Y, test_X):
    """Fit classifier1 on standardised training features and label test_X.

    The scaler is fitted on train_X only and then applied to BOTH train_X and
    test_X. Predicting on unscaled features with a model trained on scaled
    ones would compare distances in two different coordinate systems.

    Parameters:
        train_X: feature array used to fit the scaler and the classifier.
        train_Y: labels matching the rows of train_X.
        test_X: feature array to classify.

    Returns:
        The labels predicted by classifier1 for the rows of test_X.
    """
    scaler = StandardScaler()
    scaler.fit(train_X)
    classifier1.fit(scaler.transform(train_X), train_Y)
    return classifier1.predict(scaler.transform(test_X))


#Function to train data set s1
def FitS1():
    """Train on S1 and return the predicted labels for S2.

    Reads the module-level arrays S1_X, S1_Y and S2_X at call time.
    """
    return _fit_and_predict(S1_X, S1_Y, S2_X)


#Function to train data set S2
def FitS2():
    """Train on S2 and return the predicted labels for S1.

    Reads the module-level arrays S2_X, S2_Y and S1_X at call time.
    """
    return _fit_and_predict(S2_X, S2_Y, S1_X)

In [None]:
#Algorithm for RNN
# Cross-check the two halves with 1-NN until nothing changes: FitS2() labels S1
# with a model trained on S2, FitS1() labels S2 with a model trained on S1, and
# every row whose prediction disagrees with its own label is discarded.
stabilization = 0
t = 0
while stabilization == 0:
    print("iteration number ", t)
    t += 1
    stabilization = 1

    # Re-number the rows before every iteration.
    RNN_Set1 = RNN_Set1.reset_index(drop=True)
    RNN_Set2 = RNN_Set2.reset_index(drop=True)

    # FitS1/FitS2 read these module-level arrays, so refresh them from the
    # current frames (features = all but the last column, target = position 21).
    S1_X = RNN_Set1.iloc[:, :-1].values
    S1_Y = RNN_Set1.iloc[:, 21].values
    S2_X = RNN_Set2.iloc[:, :-1].values
    S2_Y = RNN_Set2.iloc[:, 21].values

    # Classify S1 with S2 and keep only the rows that were labelled correctly.
    # A boolean mask removes all offenders at once instead of dropping row by row.
    ypred = FitS2()
    keep = ypred == S1_Y
    if not keep.all():
        RNN_Set1 = RNN_Set1.loc[keep]
        # Refresh the arrays so that the next step trains on the cleaned S1
        # rather than on rows that have just been removed.
        S1_X = RNN_Set1.iloc[:, :-1].values
        S1_Y = RNN_Set1.iloc[:, 21].values
        stabilization = 0

    # Classify S2 with the (possibly reduced) S1 and filter it the same way.
    ypred = FitS1()
    keep = ypred == S2_Y
    if not keep.all():
        RNN_Set2 = RNN_Set2.loc[keep]
        stabilization = 0

    if stabilization == 1:
        print("reached stabilization")

# Both halves are numbered from 0, so let concat build one fresh index instead
# of keeping duplicate row labels.
RNN = pd.concat([RNN_Set1, RNN_Set2], ignore_index=True)

In [None]:
# Display (rows, columns) of the data set that remains after the RNN step.
RNN.shape

In [None]:
#Initialization for CNN

# Reset the index of the RNN output so that row labels are unique and equal to
# row positions. (The previous code reset RNN_data, which left RNN untouched.)
RNN = RNN.reset_index(drop=True)

# Retrieve the column names
columnlist = list(RNN.columns)

# Pick one row at random (fixed seed) as the seed of Storage.
randompick = RNN.sample(n=1, random_state=1).reset_index(drop=True)

# Storage starts with the random pick. Dustbin starts empty but with the same
# columns and dtypes as RNN. Neither relies on DataFrame.append, which no longer
# exists in pandas 2.x.
Storage = randompick.copy()
Dustbin = RNN.iloc[0:0].copy()

# Split features (all but the last column) from the target (column position 21).
X_set = RNN.iloc[:, :-1].values
Y_set = RNN.iloc[:, 21].values

In [None]:
#Algorithm for CNN to train Storage
def FitStorage(Storage):
    """Fit a 1-NN classifier on Storage and return its predictions for X_set.

    Parameters:
        Storage: DataFrame whose last column is the label and whose other
            columns are the features. Its index is reset in place.

    Returns:
        The labels predicted for every row of the module-level array X_set,
        in X_set order.

    The scaler is fitted on the Storage features and applied to both the
    Storage features and X_set, so training and prediction happen in the same
    standardised feature space.
    """
    Storage.reset_index(inplace=True, drop=True)
    storage_x = Storage.iloc[:, :-1].values
    storage_y = Storage.iloc[:, -1].values

    scaler = StandardScaler()
    scaler.fit(storage_x)
    train = scaler.transform(storage_x)

    classifier1 = KNeighborsClassifier(n_neighbors=1)
    classifier1.fit(train, storage_y)

    # Return the predictions: callers index into them, so the function must
    # not fall off the end and yield None.
    return classifier1.predict(scaler.transform(X_set))

In [None]:
#Algorithm for CNN
# Grow Storage until a 1-NN model trained on it labels every remaining row of
# RNN correctly. Each row not yet in Storage is checked in turn; a misclassified
# row is moved into Storage and the model is refitted before the next row is
# examined. Passes repeat until one of them adds nothing.

# Storage as it is when this cell starts (normally the single random seed row).
seed_storage = Storage

# Mark the rows of RNN whose features are already present in Storage so they
# are neither re-added nor reported in Dustbin. Matching is on features only.
in_storage = np.zeros(len(RNN), dtype=bool)
for stored_row in seed_storage.iloc[:, :-1].values.astype(float):
    in_storage |= (X_set == stored_row).all(axis=1)

# Row positions of RNN that this cell has moved into Storage, in insertion order.
added = []

# Predictions only change when Storage changes, so they are cached and
# recomputed after an insertion instead of once per examined row.
ypred = FitStorage(Storage)

t = 0
stabilization = 0
while stabilization == 0:
    stabilization = 1
    print("Iteration ", t)
    t += 1
    print("Storage dimensions are ", Storage.shape, " Dustbin Dimensions are ", Dustbin.shape)
    for i in range(len(RNN)):
        if in_storage[i] or ypred[i] == Y_set[i]:
            continue
        # Misclassified: move the row into Storage and refit. Each row can be
        # added at most once, so the loop always terminates.
        added.append(i)
        in_storage[i] = True
        Storage = pd.concat([seed_storage, RNN.iloc[added]], ignore_index=True)
        ypred = FitStorage(Storage)
        stabilization = 0
    # Dustbin is exactly the set of rows that are not in Storage.
    Dustbin = RNN.iloc[~in_storage].reset_index(drop=True)
    if stabilization == 1:
        print("Storage Stabilized")

In [None]:
# Display the rows kept in Storage by the CNN step.
Storage

In [None]:
# Display the last 100 rows of the RNN output.
RNN.tail(100)