# Main.ipynb - Machine Learning 01

## Import packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from sklearn import tree

## Import Data

In [10]:
# Load the music-preference data set from the CSV file next to this notebook.
# Gender encoding: 1 = male, 0 = female (this matches the prediction examples
# further down in the notebook).
music_data = pd.read_csv("music.csv")

music_data.head()

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz


## Create input and output data

In [7]:
# Split the data into an INPUT set (features) and an OUTPUT set (target).
# NOTE: the usual cleaning steps (removing duplicates / NA rows) are not
# applied in this notebook; the frame is used exactly as it was loaded.

# X = INPUT: every column except the "genre" target
X = music_data.drop("genre", axis="columns")
X.head()

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1


In [8]:
# y = OUTPUT: the "genre" column, i.e. the label the model learns to predict
y = music_data.loc[:, "genre"]
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

## Create the model and predictions

In [11]:
# Train a decision tree on the complete data set.
# A fixed random_state makes the tie-breaking between equally good splits
# reproducible across runs.
model01 = DecisionTreeClassifier(random_state=42)
model01.fit(X, y)
# Predict the music preferences of a 21 year old male (1), and a 22 year old woman (0).
# The samples are wrapped in a DataFrame that carries the training column
# names, so scikit-learn does not warn about missing feature names.
predictions = model01.predict(
    pd.DataFrame([[21, 1], [22, 0]], columns=X.columns)
)

predictions

array(['HipHop', 'Dance'], dtype=object)

## Measure model accuracy

In [29]:
# Hold out 20% of the data for testing and train on the remaining 80%.
# A fixed random_state makes the split (and therefore the score) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model02 = DecisionTreeClassifier(random_state=42)
model02.fit(X_train, y_train)
# Evaluate model02 - the model fitted on the training split only - so the
# rows in X_test are genuinely unseen data.
predictions02 = model02.predict(X_test)
# Compare the predictions to the actual results (y_test)
score = accuracy_score(y_test, predictions02)
score


1.0

## Persisting Models

To avoid retraining the model every time, we can save the trained model to disk and simply load it back when it is needed.

In [33]:
# Serialize the trained model to disk; the call returns the list of files written.
joblib.dump(value=model02, filename="music-recommender.joblib")

['music-recommender.joblib']

In [34]:
# Restore the persisted model from disk.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
model03 = joblib.load(filename="music-recommender.joblib")

In [35]:
# Predict the music preferences of a 21 year old male (1), and a 22 year old woman (0)
# using the model that was loaded back from disk.
# The samples are passed as a DataFrame with named columns so scikit-learn
# does not warn about missing feature names.
predictions = model03.predict(
    pd.DataFrame([[21, 1], [22, 0]], columns=["age", "gender"])
)

predictions

array(['HipHop', 'Dance'], dtype=object)

## Visualize the Decision Tree

In [38]:
# Export the fitted tree in Graphviz DOT format. Open the resulting .dot file
# with a Graphviz viewer (e.g. a DOT preview extension for your editor).
tree.export_graphviz(model03,
                     out_file="music-recommender.dot",
                     feature_names=["age", "gender"],
                     # Take the class names from the fitted model itself: they are
                     # exactly the classes it was trained on, in the model's own order,
                     # so node labels cannot be shifted by a class missing from training.
                     class_names=[str(c) for c in model03.classes_],
                     label="all", # Each node has labels that we can read
                     rounded=True, # Each box has rounded corners
                     filled=True) # Each box is filled with color