# Importing data


In [3]:
import pandas as pd
music_data = pd.read_csv('music.csv')
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


# No cleaning required- no duplicates, but splitting data into 2 sets - input and output set

In [6]:
# output is target result= genre
# input - age and gender
# age=1 = male ; age=0 = female

X = music_data.drop(columns=['genre'])
Y = music_data['genre']
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [7]:
Y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

# Learning and predicting by DecisionTree

In [8]:
# import Decision tree

from sklearn.tree import DecisionTreeClassifier

In [None]:
# setting input and output
X = music_data.drop(columns=['genre'])
Y = music_data['genre']

In [10]:
# creating prediction model
model = DecisionTreeClassifier()
model.fit(X,Y)               #training by fit method

# making prediction , for 21 aged male and 22 aged female
predict1= model.predict([ [21,1], [22,0]])
predict1



array(['HipHop', 'Dance'], dtype=object)

In [11]:
# making prediction , for 40 aged male and 15 aged female
predict2= model.predict([ [40,1], [15,0]])
predict2



array(['Classical', 'Dance'], dtype=object)

# Measure Accuracy of model

In [25]:
# we split the data - into train and test sets 
# to measure accuracy we compare the actual output to output set
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# assume same input and output sets
X = music_data.drop(columns=['genre'])
Y = music_data['genre']
train_test_split(X,Y,test_size=0.2)   #20 percent of data will be sued as test, rest for training

#split method is a tuple so assigning it 4 variables

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)
predict3= model.predict(X_test)

# to calculate accuracy we will compare predict3 with Y_test
score = accuracy_score(Y_test, predict3)
score

0.5

In [None]:
# running accuracy again, will give diffrent values- as split takes random values to test and 
# train sets


# If we increase test sets more , accuracy is decreased!
# No. of training sets are directly proportional to accuracy of prediction model !

In [34]:


# we split the data - into train and test sets 
# to measure accuracy we compare the actual output to output set
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# assume same input and output sets
X = music_data.drop(columns=['genre'])
Y = music_data['genre']
train_test_split(X,Y,test_size=0.8)   #80 percent of data will be sued as test, rest for training

#split method is a tuple so assigning it 4 variables

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)
predict4= model.predict(X_test)

# to calculate accuracy we will compare predict3 with Y_test
score = accuracy_score(Y_test, predict4)
score

0.75

# Reusing prediction model - pre declaring the model by joblib

In [None]:
# storing data model is done to time save while training and retraining datasets
# old code 

from sklearn.tree import DecisionTreeClassifier

X = music_data.drop(columns=['genre'])
Y = music_data['genre']

# creating prediction model
model = DecisionTreeClassifier()
model.fit(X,Y)               #training by fit method

predict1= model.predict([ [21,1], [22,0]])

In [36]:
from sklearn.tree import DecisionTreeClassifier
import joblib

X = music_data.drop(columns=['genre'])
Y = music_data['genre']

# creating prediction model
model = DecisionTreeClassifier()
model.fit(X,Y)               #training by fit method

joblib.dump(model,'music-recommender-model.joblib')

['music-recommender-model.joblib']

In [None]:
# a joblib file is created at project folder- comment model code
# and load this joblib file


# X = music_data.drop(columns=['genre'])
# Y = music_data['genre']

# # creating prediction model
# model = DecisionTreeClassifier()
# model.fit(X,Y)  

model = joblib.load(model,'music-recommender-model.joblib')
predictions = model.predict([[21,1]])
predictions


# Forming decision tree in graphical format

from sklearn.tree import DecisionTreeClassifier
import tree

X = music_data.drop(columns=['genre'])
Y = music_data['genre']
model = DecisionTreeClassifier()
model.fit(X,Y)   

tree.export_graphviz(model, out_file='music-rec.dot', 
feature_names=['age','gender'],
class_names=sorted(y.unique()),
label='all', 
rounded=True, 
filled=True)

In [40]:
# a dot file is created in project folder, open in VS studio with graphiv extension