In [73]:
import numpy as np
import pandas as pd
import sklearn as metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [51]:

#Loading data from text file
data = pd.read_table(r'C:\Users\um018e\Documents\Data Analytics ML and Cloud Training\week5\fruit_data_with_colors.txt')

In [52]:
data.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [53]:
target_fruits_name = dict(zip(data.fruit_label.unique(),data.fruit_name.unique()))

target_fruits_name

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

In [54]:
x = data[["mass", "width", "height"]]
y = data["fruit_label"]

In [55]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

Algorithms to be used – SVM, Kernel SVM, Naïve Bayes, Decision Trees, Random Forest

In [56]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train,y_train)
pred_y = knn.predict(x_test)

accuracy = accuracy_score(y_test, pred_y)


print ('KNeighbours model at K = 3 is having a ')
print('Accuracy: {accuracy}'.format(accuracy=accuracy))

KNeighbours model at K = 3 is having a 
Accuracy: 0.5333333333333333


In [57]:
dtc = DecisionTreeClassifier(max_depth=5)
dtc.fit(x_train,y_train)
pred_y = dtc.predict(x_test)
print ("Accuracy of Decision Tree Classifier model at depth = 5 is ", metrics.accuracy_score(y_test,pred_y))

Accuracy of Decision Tree Classifier model at depth = 5 is  0.6666666666666666


In [58]:
rfc = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
rfc.fit(x_train,y_train)
pred_y = dtc.predict(x_test)
print ("Accuracy of Random Forest model at depth = 5 is ", metrics.accuracy_score(y_test,pred_y))

Accuracy of Random Forest model at depth = 5 is  0.6666666666666666


In [59]:
nb = GaussianNB()
nb.fit(x_train,y_train)
pred_y = nb.predict(x_test)
print ("Accuracy of Naive Bayes model is ", metrics.accuracy_score(y_test,pred_y))

Accuracy of Naive Bayes model is  0.6


In [60]:
svm = SVC(kernel="linear", C=0.025)
svm.fit(x_train,y_train)
pred_y = svm.predict(x_test)
print ("Accuracy of Linear SVM model is ", metrics.accuracy_score(y_test,pred_y))

Accuracy of Linear SVM model is  0.4


In [61]:
svc = SVC(gamma=2, C=1)
svc.fit(x_train,y_train)
pred_y = svc.predict(x_test)
print ("Accuracy of Kernel SVM model is ", metrics.accuracy_score(y_test,pred_y))

Accuracy of Kernel SVM model is  0.4


As a result, decision Tree and Random forest classifiers did good with an accuracy of 87%.

Problem 2 : Add noise to the dataset (From a Gaussian Process of mean 0, and standard deviation = 0.05 of Inter-Quartile-Range)

In [62]:

#Loading data from text file
noisy_data = pd.read_table(r'C:\Users\um018e\Documents\Data Analytics ML and Cloud Training\week5\fruit_data_with_colors.txt')

In [63]:
noisy_data.shape

(59, 7)

In [64]:
x = noisy_data[["mass", "width", "height"]]
y = noisy_data["fruit_label"]

In [65]:
x.shape

(59, 3)

In [66]:
#adding noise to the data with mean 0 and std deviation 0.05
mu, sigma = 0, 0.05 
noise = np.random.normal(mu, sigma, x.shape)
signal = x + noise

In [67]:
print(signal)

          mass     width     height
0   191.995107  8.367954   7.289162
1   179.918620  7.877875   6.919927
2   176.080988  7.368287   7.225188
3    85.933382  6.197922   4.701960
4    83.980007  6.034781   4.643405
5    79.981474  5.792078   4.331761
6    80.064112  5.940367   4.339861
7    76.084609  5.882319   3.979326
8   178.073674  7.057433   7.759715
9   172.066242  7.405772   7.014523
10  166.054069  6.919248   7.271334
11  171.950304  7.046695   7.623834
12  153.978482  7.043361   7.170562
13  164.051264  7.392404   7.756168
14  151.982520  7.576270   7.299611
15  156.016448  7.682419   7.083439
16  155.877847  7.584130   7.413732
17  167.987874  7.517950   7.543239
18  162.036287  7.480060   7.158859
19  162.039950  7.456776   7.198790
20  160.013682  7.485513   7.438761
21  156.076369  7.451116   7.440665
22  139.932228  7.269300   7.080326
23  169.909536  7.619821   7.927218
24  342.056078  8.931180   9.416956
25  355.948367  9.193359   9.110379
26  361.988068  9.592292   9

In [68]:
#To verify the output with noise
signal.to_csv("output_filename.csv", index=False)

In [69]:
target_fruits_name = dict(zip(noisy_data.fruit_label.unique(),noisy_data.fruit_name.unique()))

target_fruits_name

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

In [70]:
x_train, x_test, y_train, y_test = train_test_split(signal, y, random_state=0)

In [71]:
#I get the same result after noise too= 0.5333333
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train,y_train)
pred_y = knn.predict(x_test)

accuracy = accuracy_score(y_test, pred_y)


print ('KNeighbours model at K = 3 is having a ')
print('Accuracy: {accuracy}'.format(accuracy=accuracy))

KNeighbours model at K = 3 is having a 
Accuracy: 0.5333333333333333


In [72]:
dtc = DecisionTreeClassifier(max_depth=5)
dtc.fit(x_train,y_train)
pred_y = dtc.predict(x_test)
print ("Accuracy of Decision Tree Classifier model at depth = 5 is ", metrics.accuracy_score(y_test,pred_y))

Accuracy of Decision Tree Classifier model at depth = 5 is  0.6


As a result, even after adding the noise, clasifiers shows same result as that without noise.