# **Decision Tree Model (By Nourhan Adel)**

### **Import libraries**

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### **Load and inspect data**

In [2]:
pwd = os.getcwd()
data_path = os.path.join(pwd,'/content/Final_Data.csv')
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,Artist,Title,Label,song_path,new_song_path,extracted_chorus_path,chroma_stft_kew_0,chroma_stft_min_0,chroma_stft_max_0,chroma_stft_std_0,...,tonnetz_mean_5,tonnetz_median_5,tonnetz_kurtosis_5,zero_crossing_rate_kew_0,zero_crossing_rate_min_0,zero_crossing_rate_max_0,zero_crossing_rate_std_0,zero_crossing_rate_mean_0,zero_crossing_rate_median_0,zero_crossing_rate_kurtosis_0
0,The Weeknd,Blinding Lights,1,popular/Blinding Lights.mp3,/content/drive/MyDrive/bill/popular/Blinding L...,chorus_extract/Blinding Lights.wav,0.476192,0.016697,1.0,0.290165,...,0.007436,0.000934,-0.539747,1.466814,0.037551,0.159229,0.020931,0.082646,0.079751,3.242503
1,Olivia Rodrigo,Good 4 U,1,popular/Good 4 U.mp3,/content/drive/MyDrive/bill/popular/Good 4 U.mp3,chorus_extract/Good 4 U.wav,1.048281,0.006909,1.0,0.27469,...,-0.01038,-0.011803,0.115312,-0.398137,0.054286,0.224626,0.035255,0.147474,0.157392,-0.313862
2,Olivia Rodrigo,Drivers License,1,popular/Drivers License.mp3,/content/drive/MyDrive/bill/popular/Drivers Li...,chorus_extract/Drivers License.wav,1.391406,0.0007,1.0,0.307762,...,-0.002966,-0.011493,0.48648,0.746785,0.02263,0.176054,0.031968,0.076661,0.065692,-0.147254
3,Lil Nas X,Montero (Call Me By Your Name),1,popular/Montero (Call Me By Your Name).mp3,/content/drive/MyDrive/bill/popular/Montero (C...,chorus_extract/Montero (Call Me By Your Name).wav,0.898662,0.002983,1.0,0.239849,...,-0.013886,-0.021088,0.467516,0.528719,0.048934,0.266757,0.043461,0.143693,0.137687,-0.347516
4,BTS,Butter,1,popular/Butter.mp3,/content/drive/MyDrive/bill/popular/Butter.mp3,chorus_extract/Butter.wav,0.299845,0.036609,1.0,0.33476,...,0.00686,0.004859,-0.384372,2.536822,0.040181,0.221769,0.027223,0.095766,0.090703,7.626171


In [3]:
len(data.keys())

524

In [4]:
label = data['Label']

data = data.drop(['Artist',	'Title', 'song_path',	'new_song_path',	'extracted_chorus_path', 'Label'], axis=1)

In [5]:
len(data.keys())

518

In [6]:
label[:5]

0    1
1    1
2    1
3    1
4    1
Name: Label, dtype: int64

## **Use Correlation to drop the high correlated columns**
Correlation is used to summarize the strength and direction of the linear association between two quantitative variables. It is denoted by r and values between -1 and +1. A positive value for r indicates a positive association, and a negative value for r indicates a negative association.

In [7]:
cor_matrix = data.corr().abs()
print(cor_matrix)

                               chroma_stft_kew_0  chroma_stft_min_0  \
chroma_stft_kew_0                       1.000000           0.371227   
chroma_stft_min_0                       0.371227           1.000000   
chroma_stft_max_0                       0.007775           0.017458   
chroma_stft_std_0                       0.608770           0.081623   
chroma_stft_mean_0                      0.957253           0.495092   
...                                          ...                ...   
zero_crossing_rate_max_0                0.079045           0.261665   
zero_crossing_rate_std_0                0.045875           0.274687   
zero_crossing_rate_mean_0               0.227831           0.151675   
zero_crossing_rate_median_0             0.243171           0.129835   
zero_crossing_rate_kurtosis_0           0.023809           0.023571   

                               chroma_stft_max_0  chroma_stft_std_0  \
chroma_stft_kew_0                       0.007775           0.608770   
chrom

In [8]:
from sklearn.feature_selection import VarianceThreshold

constant_filter = VarianceThreshold(threshold=0.85)
constant_filter.fit(data)


VarianceThreshold(threshold=0.85)

In [9]:
len(data.columns[constant_filter.get_support()])


198

In [10]:
constant_columns = [column for column in data.columns
                    if column not in data.columns[constant_filter.get_support()]]

print(len(constant_columns))

320


In [11]:
constant_columns 

['chroma_stft_kew_0',
 'chroma_stft_min_0',
 'chroma_stft_max_0',
 'chroma_stft_std_0',
 'chroma_stft_mean_0',
 'chroma_stft_median_0',
 'chroma_stft_kew_1',
 'chroma_stft_min_1',
 'chroma_stft_max_1',
 'chroma_stft_std_1',
 'chroma_stft_mean_1',
 'chroma_stft_median_1',
 'chroma_stft_kew_2',
 'chroma_stft_min_2',
 'chroma_stft_max_2',
 'chroma_stft_std_2',
 'chroma_stft_mean_2',
 'chroma_stft_median_2',
 'chroma_stft_kew_3',
 'chroma_stft_min_3',
 'chroma_stft_max_3',
 'chroma_stft_std_3',
 'chroma_stft_mean_3',
 'chroma_stft_median_3',
 'chroma_stft_kew_4',
 'chroma_stft_min_4',
 'chroma_stft_max_4',
 'chroma_stft_std_4',
 'chroma_stft_mean_4',
 'chroma_stft_median_4',
 'chroma_stft_kew_5',
 'chroma_stft_min_5',
 'chroma_stft_max_5',
 'chroma_stft_std_5',
 'chroma_stft_mean_5',
 'chroma_stft_median_5',
 'chroma_stft_kew_6',
 'chroma_stft_min_6',
 'chroma_stft_max_6',
 'chroma_stft_std_6',
 'chroma_stft_mean_6',
 'chroma_stft_median_6',
 'chroma_stft_kew_7',
 'chroma_stft_min_7',
 'ch

In [12]:
data = constant_filter.transform(data)

In [13]:
data.shape

(792, 198)

In [14]:
df = pd.DataFrame(data, label)
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.920438,-0.274114,-0.288602,-1.193596,-0.024066,-1.266645,-0.083848,-1.082290,-0.932199,-0.028455,...,4992.503144,4920.336914,-0.374785,0.891238,-1.030463,-0.192172,0.384000,-0.415851,-0.539747,3.242503
1,0.136358,-1.481371,0.517289,1.347993,-0.115323,2.365486,0.637198,1.723358,-0.433868,-0.010923,...,4483.606222,4220.507812,4.208054,1.117581,1.105892,0.077508,0.209567,0.366745,0.115312,-0.313862
1,0.718688,2.729941,0.539202,3.406664,1.738767,0.945224,2.564557,-1.387718,1.883393,0.337301,...,3647.294612,3192.297363,-0.678697,0.573413,0.063356,-0.684246,0.154728,-0.058632,0.486480,-0.147254
1,0.231711,-0.764905,-0.171808,-1.244604,0.048819,1.684652,2.159814,0.469628,-1.120641,0.548145,...,7308.172475,7983.435059,0.597110,0.014670,0.694812,1.412927,0.425779,0.770083,0.467516,-0.347516
1,-1.433215,-0.083812,-0.129989,-0.817375,-0.144148,-0.874035,0.279901,-0.283576,-1.285667,0.210506,...,5903.980966,5840.881348,-0.030101,-0.280648,0.296000,-0.779434,0.924981,0.940209,-0.384372,7.626171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-1.332615,-1.220260,-0.399320,-0.858080,-0.937122,-1.112958,-0.459458,-0.268156,-1.127672,-0.760497,...,6971.757843,7251.306152,0.513016,0.200870,0.211364,0.323342,2.472084,-0.010748,0.813091,2.920797
0,-1.401194,1.033810,-0.622381,1.776094,-0.923331,0.616756,1.026678,-0.321422,0.773214,0.180635,...,6460.210936,6718.359375,-1.239889,-0.051348,0.891065,-0.326609,0.378586,2.335101,0.258094,3.043618
0,-0.895876,0.282211,-0.544669,-1.268309,-0.699188,-0.872076,-0.986965,-1.351679,-1.046322,-1.018566,...,5085.469249,4936.486816,-0.484211,-0.238451,2.233149,0.874422,1.107583,1.169321,0.248444,0.049509
0,-1.182581,-1.069113,-0.512044,-0.711488,-0.935645,-1.379811,-1.406859,-1.255797,-1.201236,-1.092943,...,3591.861614,3509.912109,0.682838,1.518193,1.842552,2.092778,0.407291,0.826455,0.905429,0.431357


## **Scale the data**

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(data)

StandardScaler()

In [16]:

scaled_data = scaler.transform(data)
scaled_data

array([[-0.59064096, -0.29371171, -0.19862848, ..., -0.58740734,
        -0.71419751,  1.69370007],
       [ 0.07705694, -1.07937212,  0.35176783, ..., -0.16680047,
        -0.30780743, -0.59091472],
       [ 0.44498026,  1.6612717 ,  0.36673371, ..., -0.39541968,
        -0.0775389 , -0.4838852 ],
       ...,
       [-0.57512201,  0.06833454, -0.37351296, ...,  0.26454457,
        -0.22521364, -0.35748458],
       [-0.75626582, -0.81108245, -0.35123149, ...,  0.08027105,
         0.18237145, -0.11218433],
       [ 1.26201845,  0.96838545, -0.24001526, ...,  1.45109481,
        -0.71859942,  0.0281076 ]])

In [17]:
scaled_data.shape

(792, 198)

## **Decision Tree Model**

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_Validation, Y_train, Y_Validation= train_test_split(scaled_data, label, test_size= 0.2)

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,Y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_Validation)

In [20]:
# Calculate the mean absolute error for the model
from sklearn.metrics import mean_absolute_error

DT_Validation_MSE = mean_absolute_error(y_pred, Y_Validation)
print(f'Mean Square Error for Decision Tree Model is: {DT_Validation_MSE}')

Mean Square Error for Decision Tree Model is: 0.4716981132075472


## So, the accuracy is approximately 53 %

# **Hypertunning th the data**

In [31]:
pwd = os.getcwd()
data_path = os.path.join(pwd,'/content/Final_Data.csv')
data2 = pd.read_csv(data_path)
data2.head()

Unnamed: 0,Artist,Title,Label,song_path,new_song_path,extracted_chorus_path,chroma_stft_kew_0,chroma_stft_min_0,chroma_stft_max_0,chroma_stft_std_0,...,tonnetz_mean_5,tonnetz_median_5,tonnetz_kurtosis_5,zero_crossing_rate_kew_0,zero_crossing_rate_min_0,zero_crossing_rate_max_0,zero_crossing_rate_std_0,zero_crossing_rate_mean_0,zero_crossing_rate_median_0,zero_crossing_rate_kurtosis_0
0,The Weeknd,Blinding Lights,1,popular/Blinding Lights.mp3,/content/drive/MyDrive/bill/popular/Blinding L...,chorus_extract/Blinding Lights.wav,0.476192,0.016697,1.0,0.290165,...,0.007436,0.000934,-0.539747,1.466814,0.037551,0.159229,0.020931,0.082646,0.079751,3.242503
1,Olivia Rodrigo,Good 4 U,1,popular/Good 4 U.mp3,/content/drive/MyDrive/bill/popular/Good 4 U.mp3,chorus_extract/Good 4 U.wav,1.048281,0.006909,1.0,0.27469,...,-0.01038,-0.011803,0.115312,-0.398137,0.054286,0.224626,0.035255,0.147474,0.157392,-0.313862
2,Olivia Rodrigo,Drivers License,1,popular/Drivers License.mp3,/content/drive/MyDrive/bill/popular/Drivers Li...,chorus_extract/Drivers License.wav,1.391406,0.0007,1.0,0.307762,...,-0.002966,-0.011493,0.48648,0.746785,0.02263,0.176054,0.031968,0.076661,0.065692,-0.147254
3,Lil Nas X,Montero (Call Me By Your Name),1,popular/Montero (Call Me By Your Name).mp3,/content/drive/MyDrive/bill/popular/Montero (C...,chorus_extract/Montero (Call Me By Your Name).wav,0.898662,0.002983,1.0,0.239849,...,-0.013886,-0.021088,0.467516,0.528719,0.048934,0.266757,0.043461,0.143693,0.137687,-0.347516
4,BTS,Butter,1,popular/Butter.mp3,/content/drive/MyDrive/bill/popular/Butter.mp3,chorus_extract/Butter.wav,0.299845,0.036609,1.0,0.33476,...,0.00686,0.004859,-0.384372,2.536822,0.040181,0.221769,0.027223,0.095766,0.090703,7.626171


In [32]:
label2 = data2['Label']

data2 = data2.drop(['Artist',	'Title', 'song_path',	'new_song_path',	'extracted_chorus_path', 'Label'], axis=1)

In [33]:
cor_matrix = data2.corr().abs()
print(cor_matrix)

                               chroma_stft_kew_0  chroma_stft_min_0  \
chroma_stft_kew_0                       1.000000           0.371227   
chroma_stft_min_0                       0.371227           1.000000   
chroma_stft_max_0                       0.007775           0.017458   
chroma_stft_std_0                       0.608770           0.081623   
chroma_stft_mean_0                      0.957253           0.495092   
...                                          ...                ...   
zero_crossing_rate_max_0                0.079045           0.261665   
zero_crossing_rate_std_0                0.045875           0.274687   
zero_crossing_rate_mean_0               0.227831           0.151675   
zero_crossing_rate_median_0             0.243171           0.129835   
zero_crossing_rate_kurtosis_0           0.023809           0.023571   

                               chroma_stft_max_0  chroma_stft_std_0  \
chroma_stft_kew_0                       0.007775           0.608770   
chrom

In [34]:
from sklearn.feature_selection import VarianceThreshold

constant_filter2 = VarianceThreshold(threshold=0.6)
constant_filter2.fit(data2)


VarianceThreshold(threshold=0.6)

In [35]:
print(f'Length of new data after filtering: {len(data2.columns[constant_filter2.get_support()])}')

constant_columns = [column for column in data2.columns
                    if column not in data2.columns[constant_filter2.get_support()]]

print(f'Length of constant columns: {len(constant_columns)}')

Length of new data after filtering: 209
Length of constant columns: 309


In [36]:
data2 = constant_filter2.transform(data2)
data2.shape

(792, 209)

In [37]:
from sklearn.preprocessing import StandardScaler

scaler2 = StandardScaler()
scaler2.fit(data2)

scaled_data2 = scaler2.transform(data2)
scaled_data2

array([[-0.59064096, -0.29371171, -0.19862848, ..., -0.58740734,
        -0.71419751,  1.69370007],
       [ 0.07705694, -1.07937212,  0.35176783, ..., -0.16680047,
        -0.30780743, -0.59091472],
       [ 0.44498026,  1.6612717 ,  0.36673371, ..., -0.39541968,
        -0.0775389 , -0.4838852 ],
       ...,
       [-0.57512201,  0.06833454, -0.37351296, ...,  0.26454457,
        -0.22521364, -0.35748458],
       [-0.75626582, -0.81108245, -0.35123149, ...,  0.08027105,
         0.18237145, -0.11218433],
       [ 1.26201845,  0.96838545, -0.24001526, ...,  1.45109481,
        -0.71859942,  0.0281076 ]])

In [38]:
from sklearn.model_selection import train_test_split

X_train2, X_Validation2, Y_train2, Y_Validation2= train_test_split(scaled_data2, label2, test_size= 0.2)

In [39]:
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree classifer object
clf2 = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf2 = clf.fit(X_train2,Y_train2)

#Predict the response for test dataset
y_pred2 = clf2.predict(X_Validation2)

In [40]:
# Calculate the mean absolute error for the model
from sklearn.metrics import mean_absolute_error

DT_Validation_MSE2 = mean_absolute_error(y_pred2, Y_Validation2)
print(f'Mean Square Error (2) for Decision Tree Model is: {DT_Validation_MSE2}')

Mean Square Error (2) for Decision Tree Model is: 0.4276729559748428


## So, the accuracy is approximately 58 %