In [19]:
import pandas as pd

# Load the music-preference data (age, gender and genre columns) from music.csv.
# Encoding of the gender column: 1 = male, 0 = female.
data = pd.read_csv(filepath_or_buffer="music.csv")

data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


### Objective: I want to predict what genre of music a 21-year-old male likes ###

In [20]:
# Input set: every column of `data` except the 'genre' label.
# drop() returns a new DataFrame, so `data` itself is left unchanged.
x = data.drop(columns="genre")
x


Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [21]:
# Output set: select only the 'genre' column of the DataFrame (as a Series).
# Nothing is printed by the assignment; the bare `y` below displays it.
y = data.loc[:, "genre"]
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [30]:
from sklearn.tree import DecisionTreeClassifier

# Create the decision tree and train it on the whole data set in one step
# (fit() returns the fitted estimator). x.values is the input set as a plain
# array and y is the output set of genre labels.
model = DecisionTreeClassifier().fit(x.values, y)

# Question: which genres do a 21-year-old male and a 22-year-old female prefer?
# predict() takes a 2D array: the outer list holds the samples and each inner
# list is one sample written as [age, gender].
prediction = model.predict(
    [
        [21, 1],  # 21-year-old male
        [22, 0],  # 22-year-old female
    ]
)

prediction

array(['HipHop', 'Dance'], dtype=object)

### Accuracy: how to calculate it so you know how reliable your model is? ###

In [42]:
from sklearn.model_selection import train_test_split

# With this module, you can easily split your data between training, and testing.

from sklearn.metrics import accuracy_score

# With this module, you can compare the predictions with the actual results in the y_test testing data set.

# Hold out 20% of the rows for testing and train on the remaining 80%.
# train_test_split returns a tuple, unpacked here into the training/testing
# input sets (x_*) and output sets (y_*). random_state pins the shuffle so the
# split, and therefore the score, is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Evaluate with a fresh classifier that only ever sees the training split.
# Calling model.fit() here would retrain `model` in place (fit returns the same
# object), silently replacing the model that was trained on the full data set.
improved_model = DecisionTreeClassifier(random_state=42)
improved_model.fit(x_train, y_train)  # pass in only the training data
improved_prediction = improved_model.predict(x_test)

# Fraction of test rows predicted correctly, between 0 and 1.
score = accuracy_score(y_test, improved_prediction)

In [43]:
# Only the last expression of a cell is displayed, so the score is shown on its own.
# A score of 1.0 means every row of the test split was predicted correctly.
# The test split here is only a few rows, so the score can change a lot from one
# split to another and a perfect score is not proof of a perfect model.
# Increasing test_size leaves less data for training, which tends to hurt accuracy.
score

1.0

In [46]:
import joblib
#this allows saving and loading models

# Save the trained model to the file "music.joblib" (relative path, so it is
# written to the notebook's working folder). The call returns the file names written.
joblib.dump(value=model, filename="music.joblib")

['music.joblib']

## TESTING & VISUALISING ##
#### Now to test joblib module and load the model.
#### Go to file "Load_music_model" ####
#### To visualise the model, keep reading below 👇: ####

In [50]:
from sklearn import tree
# this allows to export the decision tree in a graphical format

# Export the fitted decision tree in Graphviz (.dot) format.
# class_names must list the classes in the same order the model uses them, so
# they are taken from model.classes_ (the sorted labels seen during fit) instead
# of being recomputed from y, which could disagree if the model was fitted on a
# subset of the rows that is missing a genre.
tree.export_graphviz(
    model,
    out_file="music_model.dot",
    feature_names=["age", "gender"],
    class_names=[str(label) for label in model.classes_],
    label="all",
    rounded=True,
    filled=True,
)

# The tree is written to the file music_model.dot.
# To view it, open the file with a Graphviz viewer, e.g. an editor extension
# that renders .dot files.
