In [2]:
# The Problem,
# We want to develop a model that takes previous user data and suggests the genre of music the user will like 
# based on their age and gender.

In [3]:
import pandas as pd
# load data
music_data = pd.read_csv('../ML/files/music.csv')
# Check basic stat of data
music_data.describe()


Unnamed: 0,age,gender
count,2304.0,2304.0
mean,27.944444,0.5
std,4.984078,0.500109
min,20.0,0.0
25%,25.0,0.0
50%,28.0,0.5
75%,31.0,1.0
max,37.0,1.0


In [4]:
# Define X and y where y is the desired output i.e the predictions we want to make from the available data.
# X is/are the parameters we want to use to get our predictions
X = music_data.drop(columns=['genre']) #This creates a new dataframe without the genre column
y = music_data['genre']


In [5]:
# To make sure our model is well trained and accurate, we need to split our data set into test & train sets
# To do this we need to import the function from sklearn model_selection
from sklearn.model_selection import train_test_split
# The function returns a turple so we can just unpack it directly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
y_test

1346     Acoustic
351         Dance
2263     Acoustic
1566       HipHop
1932    Classical
          ...    
299         Dance
255          Jazz
1355         Jazz
2144       HipHop
1810        Dance
Name: genre, Length: 692, dtype: object

In [6]:
# import model to use - here we use decision tree from scikitlearn
from sklearn.tree import DecisionTreeClassifier

# set model to a variable
model = DecisionTreeClassifier()
# fit model i.e perform analysis. We will pass the two training sets to train the model.
model.fit(X_train, y_train)

# make predictions i.e test model
predictions = model.predict(X_test)
predictions

array(['Acoustic', 'Dance', 'Acoustic', 'HipHop', 'Classical', 'HipHop',
       'Classical', 'Dance', 'Acoustic', 'HipHop', 'Classical', 'Jazz',
       'Classical', 'Acoustic', 'Dance', 'Dance', 'Classical',
       'Classical', 'Classical', 'Classical', 'Classical', 'Acoustic',
       'Acoustic', 'Acoustic', 'Classical', 'Classical', 'Dance',
       'Acoustic', 'Classical', 'Dance', 'Acoustic', 'HipHop', 'Dance',
       'Jazz', 'Acoustic', 'Jazz', 'Dance', 'Classical', 'Classical',
       'Acoustic', 'HipHop', 'Jazz', 'Dance', 'Jazz', 'Acoustic',
       'Acoustic', 'Jazz', 'Jazz', 'HipHop', 'HipHop', 'Classical',
       'HipHop', 'Jazz', 'Acoustic', 'HipHop', 'Acoustic', 'HipHop',
       'Dance', 'Acoustic', 'Jazz', 'Classical', 'Acoustic', 'Classical',
       'HipHop', 'Classical', 'Jazz', 'Classical', 'Acoustic', 'HipHop',
       'Jazz', 'Classical', 'Classical', 'Acoustic', 'Acoustic', 'Jazz',
       'Jazz', 'Jazz', 'Classical', 'Dance', 'Dance', 'Dance', 'Acoustic',
       'HipHop'

In [7]:
# Evaluate model to know the accuracy
# Import the function to access the accuracy from sklearn
from sklearn.metrics import accuracy_score
#This is done by passing the y_test and the predictions to confirm how accurate our model is
score = accuracy_score(y_test, predictions)
score

1.0

In [10]:
# Now I can decide to save already trained model so I don't have to retrain the model all the time 
# This will definitely save computational time.
# We use the function below for this.
import joblib
# This function has two helpers - dump(this saves the model in a file) and load(this loads the model from the file)
# joblib.dump(model, '../ML/files/musicRecommender.joblib') - We comment this out cos the model has now been saved
# Now load the already trained model from joblib and use
model = joblib.load('../ML/files/musicRecommender.joblib')
# Just make predictions now i.e test model
predictions = model.predict(X_test)
predictions

array(['Acoustic', 'Dance', 'Acoustic', 'HipHop', 'Classical', 'HipHop',
       'Classical', 'Dance', 'Acoustic', 'HipHop', 'Classical', 'Jazz',
       'Classical', 'Acoustic', 'Dance', 'Dance', 'Classical',
       'Classical', 'Classical', 'Classical', 'Classical', 'Acoustic',
       'Acoustic', 'Acoustic', 'Classical', 'Classical', 'Dance',
       'Acoustic', 'Classical', 'Dance', 'Acoustic', 'HipHop', 'Dance',
       'Jazz', 'Acoustic', 'Jazz', 'Dance', 'Classical', 'Classical',
       'Acoustic', 'HipHop', 'Jazz', 'Dance', 'Jazz', 'Acoustic',
       'Acoustic', 'Jazz', 'Jazz', 'HipHop', 'HipHop', 'Classical',
       'HipHop', 'Jazz', 'Acoustic', 'HipHop', 'Acoustic', 'HipHop',
       'Dance', 'Acoustic', 'Jazz', 'Classical', 'Acoustic', 'Classical',
       'HipHop', 'Classical', 'Jazz', 'Classical', 'Acoustic', 'HipHop',
       'Jazz', 'Classical', 'Classical', 'Acoustic', 'Acoustic', 'Jazz',
       'Jazz', 'Jazz', 'Classical', 'Dance', 'Dance', 'Dance', 'Acoustic',
       'HipHop'

In [11]:
# Test again
score = accuracy_score(y_test, predictions)
score

1.0

In [15]:
# We can get a graphical representation of the behind the scenes of how the model solves the problem
from sklearn import tree
tree.export_graphviz(model, out_file='../ML/files/musicRecommender.dot', 
                     feature_names=['age', 'gender'], 
                     filled=True, label='all', rounded=True,
                     class_names=sorted(y.unique()))
# Now I will open it in vscode using a pluggin to display