In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd


# Read the comma-separated training file. It has no header row, so pandas
# numbers the columns 0, 1, 2, ... automatically.
df = pd.read_csv('TrainingData.txt', header=None, sep=',')

# Separate the frame into the target column and the feature columns.
labels_set = df.iloc[:, -1]              # last column: label of each row of guideline prices
guideline_price_set = df.iloc[:, 0:24]   # first 24 columns: the guideline prices



Split the guideline prices and the labels into separate training and validation sets. The training sets contain 80% of the original data and the validation sets contain the remaining 20%.

In [2]:
# Produce four data sets from the features and labels:
#   guideline_train / labels_train -> 80% of the rows, used to fit the classifiers
#   guideline_test  / labels_test  -> 20% of the rows, held out for validation
# random_state=0 fixes the shuffle so the same split is produced on every run.
(
    guideline_train,
    guideline_test,
    labels_train,
    labels_test,
) = train_test_split(
    guideline_price_set,
    labels_set,
    test_size=0.2,
    random_state=0,
)


Format the data so I can use it in the classifiers

In [3]:
# Convert the pandas train/validation splits into plain NumPy arrays.
guideline_train, guideline_test = np.asarray(guideline_train), np.asarray(guideline_test)
labels_train, labels_test = np.asarray(labels_train), np.asarray(labels_test)



Tried four different classifiers to see which one has the best accuracy.

Support Vector Classifier

In [4]:
# Support vector classifier with scikit-learn's default settings, fitted on the
# training split (fit() returns the estimator itself, so it can be chained).
svc_classifer = SVC().fit(guideline_train, labels_train)

# Score the model on the held-out validation split.
svc_predict = svc_classifer.predict(guideline_test)
svc_accuracy = accuracy_score(y_true=labels_test, y_pred=svc_predict)
print("Support Vector Accuracy: " , svc_accuracy)

Support Vector Accuracy:  0.95


Random Forest Classifier

In [5]:
# Random forests are randomized (bootstrap sampling and per-split feature
# selection), so the estimator is seeded; without a seed the reported accuracy
# changes on every run. 0 matches the seed used for train_test_split.
rf_classifer = RandomForestClassifier(random_state=0)
rf_classifer.fit(guideline_train, labels_train)

# Score the model on the held-out validation split.
rf_predict = rf_classifer.predict(guideline_test)
rf_accuracy = accuracy_score(labels_test, rf_predict)

print("Random Forest Accuracy: " , rf_accuracy)

Random Forest Accuracy:  0.891


Gaussian Naïve Bayes Classifier

In [6]:
# Gaussian Naive Bayes with default settings, fitted on the training split.
naive_classifer = GaussianNB()
naive_classifer.fit(X=guideline_train, y=labels_train)

# Predict the validation rows and measure the fraction labelled correctly.
naive_predict = naive_classifer.predict(guideline_test)
naive_accuracy = accuracy_score(y_true=labels_test, y_pred=naive_predict)
print("Gaussian Naïve Bayes Accuracy: " , naive_accuracy)

Gaussian Naïve Bayes Accuracy:  0.94


K Nearest Neighbors Classifier

In [7]:
# K-nearest-neighbours classifier with scikit-learn's default settings,
# fitted on the training split (fit() returns the estimator itself).
kn_classifer = KNeighborsClassifier().fit(guideline_train, labels_train)

# Score the model on the held-out validation split.
kn_predict = kn_classifer.predict(guideline_test)
kn_accuracy = accuracy_score(y_true=labels_test, y_pred=kn_predict)
print("K Nearest Neighbors Accuracy: " , kn_accuracy)

K Nearest Neighbors Accuracy:  0.803


Reading the "TestingData.txt" data set in order to predict the test labels

In [8]:
# Load the samples to be classified; the file is comma-separated with no header row.
test_data = pd.read_csv('TestingData.txt', header=None, sep=',')
# Display the first five rows as a quick check of the parsed columns.
test_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,4.512853,3.436581,3.682556,3.062718,3.456278,4.028039,3.53047,4.29234,5.018994,4.78319,...,5.857223,6.381602,6.115519,6.294756,6.513145,5.25019,5.913805,5.123827,5.629438,5.753545
1,4.038201,3.874221,3.120743,3.261643,2.990717,3.789115,3.935849,4.391824,5.356575,5.274408,...,5.822269,6.206444,5.631747,6.631983,6.593441,5.643768,5.930986,5.421773,5.150519,5.126661
2,4.343619,3.254605,3.13029,3.583647,3.021889,3.95301,4.379102,4.432525,5.478115,4.914986,...,5.284374,6.252251,5.771876,6.023015,6.340118,5.398613,5.176215,5.193332,5.543661,5.148262
3,4.215916,3.310803,3.144808,2.826131,2.979042,3.264118,3.640009,4.332902,5.842453,4.560108,...,5.437254,6.420271,6.014269,6.44724,6.788652,5.721373,5.923394,4.989043,5.797218,5.723375
4,4.607662,3.688971,3.804653,2.882434,3.247362,3.393615,3.720861,3.600273,5.53635,4.653512,...,5.402382,5.784929,6.340968,6.649603,6.720808,5.392021,5.130296,5.532805,6.028962,5.496418


Prediction Results for Support Vector Classifier

Prediction Results for Gaussian Naïve Bayes Classifier

In [10]:
# Refit the Gaussian Naive Bayes model on the complete training data (training
# and validation rows together) before labelling the test samples.
naive_classifer.fit(guideline_price_set, labels_set)
predicted_labels = naive_classifer.predict(test_data)
print(len(predicted_labels))

# One line per test sample: "<row index>  =>  <predicted label>".
for i, label in enumerate(predicted_labels):
    print(i, ' => ' , label , '\t')

# Number of samples predicted as class 1. Stored under its own name rather than
# `sum`, so the built-in sum() is not shadowed for the rest of the notebook.
positive_count = int((predicted_labels == 1).sum())
print (positive_count)

100
0  =>  0 	
1  =>  0 	
2  =>  0 	
3  =>  0 	
4  =>  1 	
5  =>  1 	
6  =>  0 	
7  =>  0 	
8  =>  1 	
9  =>  0 	
10  =>  0 	
11  =>  1 	
12  =>  1 	
13  =>  0 	
14  =>  1 	
15  =>  1 	
16  =>  1 	
17  =>  1 	
18  =>  1 	
19  =>  1 	
20  =>  0 	
21  =>  1 	
22  =>  0 	
23  =>  0 	
24  =>  0 	
25  =>  1 	
26  =>  0 	
27  =>  1 	
28  =>  0 	
29  =>  1 	
30  =>  1 	
31  =>  1 	
32  =>  1 	
33  =>  0 	
34  =>  1 	
35  =>  0 	
36  =>  0 	
37  =>  1 	
38  =>  1 	
39  =>  0 	
40  =>  1 	
41  =>  0 	
42  =>  0 	
43  =>  1 	
44  =>  1 	
45  =>  1 	
46  =>  0 	
47  =>  1 	
48  =>  1 	
49  =>  0 	
50  =>  1 	
51  =>  0 	
52  =>  0 	
53  =>  1 	
54  =>  1 	
55  =>  1 	
56  =>  0 	
57  =>  1 	
58  =>  1 	
59  =>  0 	
60  =>  0 	
61  =>  1 	
62  =>  1 	
63  =>  1 	
64  =>  1 	
65  =>  0 	
66  =>  0 	
67  =>  1 	
68  =>  1 	
69  =>  1 	
70  =>  0 	
71  =>  0 	
72  =>  0 	
73  =>  1 	
74  =>  1 	
75  =>  0 	
76  =>  1 	
77  =>  0 	
78  =>  1 	
79  =>  1 	
80  =>  0 	
81  =>  1 	
82  =>  0 	
83  =>  1 