In [10]:
import pandas as pd
import json as json
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn import cross_validation

# Helper function
def folds_to_split(data,targets,train,test):
    """Slice features and labels into train/test parts by row position.

    Parameters
    ----------
    data : anything accepted by ``pd.DataFrame`` -- the feature rows.
    targets : anything accepted by ``pd.DataFrame`` -- the labels.
    train, test : positional row indexers passed to ``.iloc``.

    Returns
    -------
    list
        ``[data_train, data_test, labels_train, labels_test]``, each a
        DataFrame.
    """
    feature_frame = pd.DataFrame(data)
    label_frame = pd.DataFrame(targets)
    return [
        feature_frame.iloc[train],
        feature_frame.iloc[test],
        label_frame.iloc[train],
        label_frame.iloc[test],
    ]

# Load the preprocessed business data (the 'stars' value has already been
# rounded, so it can serve as a discrete class label).
df = pd.read_json('preprocessed_business_data_new.json')

# Eliminate nested features and irrelevant attributes.
df_new = df.drop(['attributes', 'business_id', 'hours', 'index', 'working_type'], axis=1)
cities = df['city'].unique()

# Encode the categorical attribute "city" as integer codes.
# NOTE(review): the classifier below will see these codes as an ordinary
# numeric feature -- confirm that is intended, or consider one-hot encoding.
le = preprocessing.LabelEncoder()
# Bracket assignment is used instead of attribute assignment so the column
# write is explicit.
df_new['city'] = le.fit_transform(df_new['city'])

# Class labels (a Series) ...
labels = df_new.loc[:, 'stars']

# ... the feature frame without the label column ...
df_attrs = df_new.drop(['stars'], axis=1)

# ... and the labels again as a single-column DataFrame.
df_labels = labels.to_frame()

# Create the Gaussian Naive Bayes classifier.
gnb = GaussianNB()

# Hold out 20% of the rows as the test set; random_state pins the split.
[tr_data, te_data,
 tr_labels, te_labels] = cross_validation.train_test_split(
    df_attrs, df_labels, test_size=0.2, random_state=42)

gnb.fit(tr_data, tr_labels.values.ravel())

# Score against a flat label array, the same shape that fit() was given.
# The single-argument print(...) form emits the same text as the old print
# statement on Python 2 and is also valid Python 3.
print("Accuracy score : %s" % (gnb.score(te_data, te_labels.values.ravel()),))

Accuracy score : 0.932095971028


Accuracy of the Gaussian Naive Bayes classifier on a hold-out split (train_test_split with 20% of the data as the test set): 0.932

In [11]:
# Evaluate a freshly trained Gaussian Naive Bayes model on each fold of a
# 10-fold cross validation and report per-fold and mean accuracy.
# NOTE(review): no shuffle argument is passed to KFold -- confirm the row
# order of df_attrs is not correlated with the label.
fold_scores = []

# enumerate(..., 1) numbers the folds from 1, replacing the manual counter.
for foldnum, (train, test) in enumerate(
        cross_validation.KFold(len(df_attrs), n_folds=10), 1):
    [tr_data, te_data,
     tr_target, te_target] = folds_to_split(df_attrs, df_labels, train, test)
    gnb = GaussianNB()
    gnb.fit(tr_data, tr_target.values.ravel())
    # Class probabilities for the held-out fold. They are not used by the
    # scoring below; the name is kept in case later cells read it.
    prob_arr_gnb = gnb.predict_proba(te_data)
    # Score against a flat label array, the same shape that fit() was given.
    score_gnb = gnb.score(te_data, te_target.values.ravel())
    fold_scores.append(score_gnb)

# Build the results frame once (index 1..k, single 'Score' column) instead of
# enlarging a DataFrame one row at a time inside the loop.
fold_results = pd.DataFrame({'Score': fold_scores},
                            index=range(1, len(fold_scores) + 1))

# Single-argument print(...) with %-formatting produces the same text as the
# old print statements on Python 2 and is also valid Python 3.
print("Accuracy score accross folds : \n%s" % (fold_results,))
# Mean accuracy score across folds
print("\nMean accuracy score : %s" % (fold_results.mean(),))

Accuracy score accross folds : 
       Score
1   0.930317
2   0.926697
3   0.937500
4   0.935688
5   0.916667
6   0.910326
7   0.929348
8   0.933877
9   0.941123
10  0.925725

Mean accuracy score : Score    0.928727
dtype: float64


Mean accuracy of the Gaussian Naive Bayes classifier across the folds of 10-fold cross-validation: 0.928727