In [16]:
import pandas as pd
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.cross_validation import KFold, cross_val_score

dataset = pd.read_csv('data/master.csv')
dataset = dataset.dropna()

#prepare datasets to be fed into the naive bayes model
#Predict mental health category given dance and energy

CV = dataset.health_categorical.values.reshape(len(dataset.health_categorical),1)
data = (dataset.ix[:,['energy','dance','popularity','tempo']].values).reshape(len(dataset.dance),4)
print(data)

#Create the model object
NB = GaussianNB()

#Train the model using the training sets
NB.fit(data,CV.ravel())

#Model
print("Probability of the classes: ", NB.class_prior_)
print("Mean of each feature per class: \n", NB.theta_)
print("Variance of each feature per class: \n", NB.sigma_)

#predict the class for each data point
predicted = NB.predict(data)
print("Predictions:\n",np.array([predicted]).T)

# predict the probability/likelihood of the prediction
prob_of_pred = NB.predict_proba(data)
print("Probability of each class for the prediction: \n",prob_of_pred)

print("Accuracy of the model: ",NB.score(data,CV.ravel()))

print("The confusion matrix:\n", metrics.confusion_matrix(CV.ravel(), predicted, [0,1]))

# Calculating 5 fold cross validation results
model = GaussianNB()
kf = KFold(len(CV.ravel()), n_folds=10)
scores = cross_val_score(model, data, CV.ravel(), cv=kf)
print("MSE of every fold in 10	 fold cross validation: ", abs(scores))
print("Mean of the 10 fold cross-validation: %0.2f" % abs(scores.mean()))

print("Does he have a high mental health, if he listens to 0.80 danceability and energy ", NB.predict([0.80,0.80,88,160]),NB.predict_proba([0.73,0.65,88,160]))

[[6.33000000e-01 6.55000000e-01 7.15000000e+01 9.83680000e+01]
 [6.55000000e-01 5.26000000e-01 6.50000000e+01 1.21127000e+02]
 [7.45500000e-01 7.42000000e-01 6.35000000e+01 1.25009000e+02]
 [7.39000000e-01 6.81666667e-01 6.03333333e+01 1.13576000e+02]
 [5.39000000e-01 5.13000000e-01 3.00000000e+00 1.22460500e+02]
 [7.40666667e-01 5.11666667e-01 4.36666667e+01 1.49958000e+02]
 [6.41000000e-01 6.92000000e-01 8.20000000e+01 1.37599333e+02]
 [4.96000000e-01 6.47000000e-01 7.25000000e+01 1.15527500e+02]
 [5.49333333e-01 7.13333333e-01 5.86666667e+01 1.11394333e+02]
 [5.58000000e-01 5.15000000e-01 7.40000000e+01 1.52126333e+02]
 [5.84333333e-01 7.40666667e-01 7.00000000e+01 1.02034667e+02]
 [3.32000000e-01 2.61500000e-01 4.80000000e+01 1.61619500e+02]
 [7.89333333e-01 5.00333333e-01 4.80000000e+01 1.31532667e+02]
 [6.77333333e-01 6.55333333e-01 7.66666667e+01 1.05389000e+02]
 [6.77500000e-01 8.12500000e-01 5.65000000e+01 1.11560500e+02]
 [1.43000000e-01 1.75000000e-01 5.20000000e+01 1.733300

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


ValueError: Expected 2D array, got 1D array instead:
array=[  0.8   0.8  88.  160. ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.