# Import all basic libraries

First, we will import libraries that we need to start our workflow. The libraries we are using are:

    NumPy
    Pandas
    Matplotlib
    seaborn


In [1]:
import pandas as pd 
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import StandardScaler,RobustScaler

# from pycaret.classification import *

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier,RidgeClassifier,Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import  DecisionTreeClassifier


from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier,HistGradientBoostingClassifier,StackingClassifier

from sklearn.experimental import enable_hist_gradient_boosting
from xgboost import XGBClassifier ,plot_importance
from lightgbm import LGBMClassifier


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils.np_utils import to_categorical

# Loading the Data

In [2]:
train=pd.read_csv('../input/music-project-genre/train (1).csv')
test=pd.read_csv('../input/music-project-genre/test (2).csv')

# EDA (Exploratory Data Analysis)
To get a proper understanding of the dataset we are using, we will perform a brief EDA (Exploratory Data Analysis). The EDA is subdivided into:

    Data Visuals
    Data Understanding
    Data Analysis

In [3]:
train.head()

In [4]:
test.head()

In [None]:
train.shape

In [None]:
print('Number of Rows (Samples): %s' % str((train.shape[0])))
print('Number of Columns (Features): %s' % str((train.shape[1])))

## features

* artist: Name of the Artist.
* song: Name of the Track.
* popularity: The higher the value the more popular the song is.
* danceability: Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity.
* energy: Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity.
* key: The key the track is in. Integers map to pitches using standard Pitch Class notation. E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on..
* loudness: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing the relative loudness of tracks.
* mode: Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0.
* speechiness: Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks.

* acousticness: A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.

* instrumentalness: Predicts whether a track contains no vocals. "Ooh" and "aah" sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly "vocal". The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.

* liveness: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.

* valence: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

* tempo: The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.

* duration in milliseconds: The duration of the song.

* time_signature : a notational convention used in Western musical notation to specify how many beats (pulses) are contained in each measure (bar), and which note value is equivalent to a beat.

* Class: Genre of the track.

In [5]:
train.info()

The following information tells us that:

    The data contains 14396 entries (samples).
    Each sample has a total of 18 columns.
    Every column has a data type; two of the columns are of type object.
    There are missing values in our data (Popularity, key, instrumentalness), which need to be filled or dropped for proper modelling.
    The data consumes 2.0 MB of memory.

In [8]:
train.describe().T

In [7]:
train.nunique()

## check if any null values in data and any duplicated data

In [9]:
train.isna().sum()

In [None]:
test.isna().sum()

In [10]:
train.duplicated().sum()

In [None]:
test.duplicated().sum()

In [16]:
train["Artist Name"].unique()

In [17]:
train["Track Name"].unique()

# Data Visualizations
After getting some useful information about our data, we now visualize it to see what trends it contains. The visuals include bar plots, distribution plots, scatter plots, etc.

In [11]:
# Build a separate frame for plotting. A copy is used so that the helper
# columns added below do not also appear in `train` (a plain assignment would
# only create a second name for the same DataFrame).
data_to_viz = train.copy()

# Track Name -> integer label. The encoder is fitted on the names from both
# files so that every name in either file has a code.
enc_1 = LabelEncoder()
enc_1.fit(train['Track Name'].tolist() + test['Track Name'].tolist())
# LabelEncoder works on 1-D input; a column vector only triggers a
# DataConversionWarning before being flattened, so pass the Series directly.
data_to_viz['Track Name NUM'] = enc_1.transform(train['Track Name'])

# Artist Name -> ordinal code ordered by how often the artist occurs in
# train + test (least frequent artist gets code 0).
artist_names = pd.concat([train['Artist Name'], test['Artist Name']], ignore_index=True)
a = artist_names.value_counts().sort_values(ascending=True).index
enc_2 = OrdinalEncoder(categories=[a], dtype=int)
enc_2.fit(a.values.reshape(-1, 1))
# OrdinalEncoder returns an (n, 1) array; flatten it before storing it as a
# single column.
data_to_viz['Artist Name NUM'] = enc_2.transform(train['Artist Name'].values.reshape(-1, 1)).ravel()

A histogram is a chart that plots the distribution of a numeric variable’s values as a series of bars. Each bar typically covers a range of numeric values called a bin or class; a bar’s height indicates the frequency of data points with a value within the corresponding bin.

In [12]:
data_to_viz.hist( figsize=(30,20))

In [None]:
sns.pairplot(data_to_viz , diag_kind = 'kde')

In [None]:
# sns.factorplot(data=data_to_viz.drop(["Id",'duration_in min/ms'],axis=1) , kind='box' , size=7, aspect=2.5)

In [None]:
# num_cols = data_to_viz.keys()
# for i in num_cols:
#     if data_to_viz[i].dtypes !="object":
#         plt.figure(figsize=(7,4))
#         sns.boxplot(x=data_to_viz[i],palette="Set2");
#         plt.show()

# Visualizing every feature

Histogram and density chart for each numeric column

## Popularity

In [20]:
# Histogram of Popularity (bar height = number of tracks per bin).
# `distplot` is deprecated in seaborn; `histplot` is its replacement for a
# plain histogram without KDE or rug.
sns.histplot(x=data_to_viz["Popularity"])

In [21]:
# Filled kernel-density estimate of Popularity, drawn in red.
# `fill` replaces the deprecated `shade` argument.
sns.kdeplot(x=data_to_viz["Popularity"], fill=True, color='r')

In [22]:
sns.jointplot(x=data_to_viz["Class"], y=data_to_viz["Popularity"], kind='scatter')

In [None]:
# sns.factorplot(x='popularity', y='Class', data=train, kind='box' ,aspect=2.5 )

## danceability


In [23]:
# Histogram of danceability (bar height = number of tracks per bin).
# `distplot` is deprecated in seaborn; `histplot` is its replacement for a
# plain histogram without KDE or rug.
sns.histplot(x=data_to_viz["danceability"])

In [24]:
# Filled kernel-density estimate of danceability, drawn in red.
# `fill` replaces the deprecated `shade` argument.
sns.kdeplot(x=data_to_viz["danceability"], fill=True, color='r')

In [26]:
sns.jointplot(x=data_to_viz["Class"], y=data_to_viz["danceability"], kind='scatter')

## energy


In [27]:
# Histogram of energy (bar height = number of tracks per bin).
# `distplot` is deprecated in seaborn; `histplot` is its replacement for a
# plain histogram without KDE or rug.
sns.histplot(x=data_to_viz["energy"])

In [28]:
# Filled kernel-density estimate of energy, drawn in red.
# `fill` replaces the deprecated `shade` argument.
sns.kdeplot(x=data_to_viz["energy"], fill=True, color='r')

In [29]:
sns.jointplot(x=data_to_viz["Class"], y=data_to_viz["energy"], kind='scatter')

## key

In [30]:
# Histogram of key (bar height = number of tracks per bin).
# `distplot` is deprecated in seaborn; `histplot` is its replacement for a
# plain histogram without KDE or rug.
sns.histplot(x=data_to_viz["key"])

In [31]:
# Filled kernel-density estimate of key, drawn in red.
# `fill` replaces the deprecated `shade` argument.
sns.kdeplot(x=data_to_viz["key"], fill=True, color='r')

In [32]:
sns.jointplot(x=data_to_viz["Class"], y=data_to_viz["key"], kind='scatter')

In [None]:
# plot
# plt.plot('Class', 'key', data=data_to_viz, linestyle='-', marker='o')
# plt.show()

In [None]:
# sns.countplot(x="key", data=train)


## mode

In [None]:
# sns.countplot(x="mode", data=train)


In [None]:
# train_cor['key'].isna().sum()

In [None]:
# train_cor[['key',"mode"]]

Top 10 popular songs

In [None]:
# plt.figure(figsize=(40, 15))
# sns.set(style="whitegrid")

# # group by the song's name and rank them base on their popularity
# x = train.groupby("Track Name")["Popularity"].mean().sort_values(ascending=False).head(10)
# axis = sns.barplot(x=x.index, y=x)

# axis.set_ylabel('Popularity', fontsize=40)
# axis.set_xlabel('song title', fontsize=40)

# corr

In [None]:
# train.corr()

In [None]:
# plt.figure(figsize = (20,20))
# sns.heatmap(train.corr(),annot = True , cmap = 'coolwarm' );

In [None]:
# corr_matrix = train.corr()
# corr_matrix["Class"].abs().sort_values(ascending=False)

In [None]:
# len(train['Artist Name'].unique())

In [None]:
# len(train['Track Name'].unique())

In [None]:
# train[['Track Name','Artist Name']].describe()

In [None]:
# a = train["Artist Name"].value_counts().sort_values(ascending = True).index.values
# a

In [None]:
# a.index

In [None]:
# best_artists=train.groupby(["Artist Name"]).mean()
# best_artists=best_artists.sort_values(by=["Popularity"], ascending=False)[:20]
# best_artists["Popularity"].plot(kind='barh', figsize=(15,10))

In [None]:
# best_artists=train.groupby(["Artist Name"]).mean()
# best_artists=best_artists.sort_values(by=["Class"], ascending=False)[:20]
# best_artists["Class"].plot(kind='barh', figsize=(15,10))

# Data Preprocessing
Data preprocessing plays an important part in the process of data science, since data may not be fully clean and can contain missing or null values. In this step, we are undergoing some preprocessing steps that will help us if there is any null or missing value in our data.

In [None]:
# train_cor = train

In [None]:
# Q1 = train_cor["Popularity"].quantile(0.25)

# Q3 = train_cor["Popularity"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['Popularity'] < Lower_Fence).sum())
# print((train_cor['Popularity'] >= Upper_Fence).sum())

In [None]:
# Q1 = train_cor["danceability"].quantile(0.25)

# Q3 = train_cor["danceability"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['danceability'] < Lower_Fence).sum())
# print((train_cor['danceability'] >= Upper_Fence).sum())

In [None]:

# Q1 = train_cor["loudness"].quantile(0.25)

# Q3 = train_cor["loudness"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['loudness'] < Lower_Fence).sum())
# print((train_cor['loudness'] >= Upper_Fence).sum())

In [None]:

# Q1 = train_cor["speechiness"].quantile(0.25)

# Q3 = train_cor["speechiness"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['speechiness'] < Lower_Fence).sum())
# print((train_cor['speechiness'] >= Upper_Fence).sum())

In [None]:
# Q1 = train_cor["instrumentalness"].quantile(0.25)

# Q3 = train_cor["instrumentalness"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['instrumentalness'] < Lower_Fence).sum())
# print((train_cor['instrumentalness'] >= Upper_Fence).sum())

In [None]:
# Q1 = train_cor["liveness"].quantile(0.25)

# Q3 = train_cor["liveness"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['liveness'] < Lower_Fence).sum())
# print((train_cor['liveness'] >= Upper_Fence).sum())

In [None]:
# Q1 = train_cor["tempo"].quantile(0.25)

# Q3 = train_cor["tempo"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['tempo'] < Lower_Fence).sum())
# print((train_cor['tempo'] >= Upper_Fence).sum())

In [None]:
# Q1 = train_cor["duration_in min/ms"].quantile(0.25)

# Q3 = train_cor["duration_in min/ms"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['duration_in min/ms'] < Lower_Fence).sum())
# print((train_cor['duration_in min/ms'] >= Upper_Fence).sum())

In [None]:
# Q1 = train_cor["time_signature"].quantile(0.25)

# Q3 = train_cor["time_signature"].quantile(0.75)

# IQR = Q3 - Q1

# Lower_Fence = Q1 - (1.5 * IQR)
# print(Lower_Fence)
# Upper_Fence = Q3 + (1.5 * IQR)
# print(Upper_Fence)
# print((train_cor['time_signature'] < Lower_Fence).sum())
# print((train_cor['time_signature'] >= Upper_Fence).sum())

In [None]:

# conditions = [train_cor['mode'].eq(1) & train_cor["key"].isna(),
#               train_cor['mode'].eq(0) & train_cor["key"].isna()]

# what_to_do = [7.0, 11.0]
# # else_case = "X"

# train_cor["key_mode"] = np.select(conditions, what_to_do)

In [None]:
# train_cor[["key_mode","mode"]]

In [None]:
# train_cor['key_mode'].isna().sum()

In [None]:
# enc_1 = LabelEncoder()
# train_cor['Track Name_label'] = enc_1.fit_transform(train_cor['Track Name'].values.reshape(-1,1))

# list_r = []
# list_r.extend(train_cor['Track Name'])
# list_r.extend(test['Track Name'])
# TN = pd.DataFrame(list_r,columns=["Track Name"])
# a = TN["Track Name"].value_counts().sort_values(ascending = True).index
# enc_1 = OrdinalEncoder(categories=[a],dtype=int)
# enc_1.fit(a.values.reshape(-1,1))
# train_cor['Track Name_ordinal'] = enc_1.transform(train_cor['Track Name'].values.reshape(-1,1))

# enc_2 = LabelEncoder()
# train_cor['Artist Name_label'] = enc_2.fit_transform(train_cor['Artist Name'].values.reshape(-1,1))

# list_r = []
# list_r.extend(train_cor['Artist Name'])
# list_r.extend(test['Artist Name'])
# TN = pd.DataFrame(list_r,columns=["Artist Name"])
# a = TN["Artist Name"].value_counts().sort_values(ascending = True).index
# enc_2 = OrdinalEncoder(categories=[a],dtype=int)
# enc_2.fit(a.values.reshape(-1,1))
# train_cor['Artist Name_ordinal'] = enc_2.transform(train_cor['Artist Name'].values.reshape(-1,1))



# train_cor['Popularity_max'] = train_cor['Popularity'].fillna(float(train_cor['Popularity'].max()))
# train_cor['Popularity_min'] = train_cor['Popularity'].fillna(float(train_cor['Popularity'].min()))
# train_cor['Popularity_mean'] = train_cor['Popularity'].fillna(float(train_cor['Popularity'].mean()))

# train_cor['Popularity_without_outliers'] = np.where(train_cor['Popularity'] >= 90.5, 90.5, train_cor['Popularity'])
# train_cor['Popularity_max_without_outliers'] = np.where(train_cor['Popularity_max'] >= 90.5, 90.5, train_cor['Popularity_max'])
# train_cor['Popularity_min_without_outliers'] = np.where(train_cor['Popularity_min'] >= 90.5,90.5, train_cor['Popularity_min'])
# train_cor['Popularity_mean_without_outliers'] = np.where(train_cor['Popularity_mean'] >= 90.5, 90.5, train_cor['Popularity_mean'])

# train_cor['danceability_without_outliers'] = np.where(train_cor['danceability'] <0.09299999999999992, 0.09299999999999992, train_cor['danceability'])


# # train_cor['key_mode'] = np.where(train_cor['mode'] == 1, 7.0, train_cor['key'])
# # # print(train_cor['key'].isna().sum())
# # train_cor['key_mode'] = np.where(train_cor['mode'] == 0, 11.0, train_cor['key'])

# conditions = [train_cor['mode'].eq(1) & train_cor["key"].isna(),
#               train_cor['mode'].eq(0) & train_cor["key"].isna()]

# what_to_do = [7.0, 11.0]

# train_cor["key_mode"] = np.select(conditions, what_to_do)

# # train_cor['key_1'] = train_cor['key'].fillna(int(1))
# # train_cor['key_0'] = train_cor['key'].fillna(0)


# train_cor['loudness_without_outliers'] = np.where(train_cor['loudness'] <-16.102, -16.102, train_cor['loudness'])

# # train_cor['speechiness_3'] = np.where((train_cor['speechiness'] >= 0) & (train_cor['speechiness'] < 0.33),  0, train_cor['speechiness'])
# # train_cor['speechiness_3'] = np.where((train_cor['speechiness_3'] >= 0.33) & (train_cor['speechiness_3'] < 0.66),  0.5, train_cor['speechiness_3'])
# # train_cor['speechiness_3'] = np.where(train_cor['speechiness'] >= 0.66,  1, train_cor['speechiness'])

# train_cor['speechiness_without_outliers'] = np.where(train_cor['speechiness'] >0.15554999999999997, 0.15554999999999997, train_cor['speechiness'])

# train_cor['instrumentalness_min'] = train_cor['instrumentalness'].fillna(float(train_cor['instrumentalness'].min()))
# train_cor['instrumentalness_max'] = train_cor['instrumentalness'].fillna(float(train_cor['instrumentalness'].max()))
# train_cor['instrumentalness_mean'] = train_cor['instrumentalness'].fillna(float(train_cor['instrumentalness'].mean()))

# train_cor['instrumentalness_min_without_outliers'] = np.where(train_cor['instrumentalness_min'] >0.502368675, 0.502368675, train_cor['instrumentalness_min'])
# train_cor['instrumentalness_max_without_outliers'] = np.where(train_cor['instrumentalness_max'] >0.502368675, 0.502368675, train_cor['instrumentalness_max'])
# train_cor['instrumentalness_mean_without_outliers'] = np.where(train_cor['instrumentalness_mean'] >0.502368675, 0.502368675, train_cor['instrumentalness_mean'])

# train_cor['instrumentalness_without_outliers'] = np.where(train_cor['instrumentalness'] >0.502368675, 0.502368675, train_cor['instrumentalness'])


# train_cor['liveness_0.8'] = np.where(train_cor['liveness'] >= 0.8, 1, train_cor['liveness'])
# train_cor['liveness_0.8'] = np.where(train_cor['liveness_0.8'] < 0.8, 0, train_cor['liveness_0.8'])

# train_cor['liveness_without_outliers'] = np.where(train_cor['liveness'] < 0.4940875, 0.4940875, train_cor['liveness'])

# train_cor['tempo_without_outliers'] = np.where(train_cor['tempo'] > 205.27212499999996, 205.27212499999996, train_cor['tempo'])

# train_cor['duration_in min/ms_without_outliers'] = np.where(train_cor['duration_in min/ms'] < 35243.875,35243.875, train_cor['duration_in min/ms'])
# train_cor['duration_in min/ms_without_outliers'] = np.where(train_cor['duration_in min/ms_without_outliers'] > 382448.875,382448.875, train_cor['duration_in min/ms_without_outliers'])


In [None]:
# train_cor

In [None]:
# plt.figure(figsize = (20,20))
# sns.heatmap(train_cor.corr(),annot = True , cmap = 'coolwarm' );

In [None]:
# corr_matrix = train_cor.corr()
# corr_matrix["Class"].abs().sort_values(ascending=False)

In [37]:
train=pd.read_csv('../input/music-project-genre/train (1).csv')
test=pd.read_csv('../input/music-project-genre/test (2).csv')

In [38]:
# Keep the genre labels aside; the 'Class' column is dropped from `train`
# in a later cell.
targets = train['Class']

# --- Track Name: plain label encoding ---------------------------------------
# The encoder is fitted on the names from both files so that every name in
# either file has a code.
enc_1 = LabelEncoder()
enc_1.fit(train['Track Name'].tolist() + test['Track Name'].tolist())
# LabelEncoder works on 1-D input; a column vector only triggers a
# DataConversionWarning before being flattened, so pass the Series directly.
train['Track Name'] = enc_1.transform(train['Track Name'])
test['Track Name'] = enc_1.transform(test['Track Name'])

# --- Artist Name: ordinal encoding ordered by frequency ---------------------
# Artists are ranked by how often they occur in train + test; the least
# frequent artist gets code 0.
artist_names = pd.concat([train['Artist Name'], test['Artist Name']], ignore_index=True)
a = artist_names.value_counts().sort_values(ascending=True).index
enc_2 = OrdinalEncoder(categories=[a], dtype=int)
enc_2.fit(a.values.reshape(-1, 1))
# OrdinalEncoder returns an (n, 1) array; flatten it before storing it as a
# single column.
train['Artist Name'] = enc_2.transform(train['Artist Name'].values.reshape(-1, 1)).ravel()
test['Artist Name'] = enc_2.transform(test['Artist Name'].values.reshape(-1, 1)).ravel()


# train['Artist Name'] = pd.Categorical(train['Artist Name'])
# categories = train['Artist Name'].cat.categories
# test['Artist Name'] = pd.Categorical(test['Artist Name'], categories)

# train['Artist Name'] = train['Artist Name'].cat.codes
# test['Artist Name'] = test['Artist Name'].cat.codes


# list_r = []
# list_r.extend(train['Artist Name'])
# list_r.extend(test['Artist Name'])
# TN = pd.DataFrame(list_r,columns=["Artist Name"])
# TN["Artist Name"] = TN["Artist Name"].astype("category")
# TN["Artist Name"] = TN["Artist Name"].cat.codes

# train['Artist Name'] = TN['Artist Name'].iloc[:len(train['Artist Name'])]
# test['Artist Name'] = TN['Artist Name'].iloc[len(train['Artist Name']):]

def clean_data(df):
    """Impute missing values and cap outliers in the audio-feature columns.

    The frame is modified in place and also returned, so both
    ``clean_data(df)`` and ``df = clean_data(df)`` work.

    Processing:
      1. ``Popularity`` and ``instrumentalness``: missing values are replaced
         by the smallest value present in that column of ``df``.
      2. Each column listed in ``clip_bounds`` is clamped to the given
         lower and/or upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Popularity``, ``danceability``,
        ``loudness``, ``speechiness``, ``instrumentalness``, ``liveness``,
        ``tempo`` and ``duration_in min/ms``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # NOTE(review): the fill value is the minimum of whichever frame is
    # passed in, so train and test are imputed with different values.
    for column in ('Popularity', 'instrumentalness'):
        # Assign the result back instead of calling fillna(inplace=True) on
        # the selected column: the chained in-place form is deprecated and
        # does not update the frame under pandas copy-on-write.
        df[column] = df[column].fillna(float(df[column].min()))

    # (lower fence, upper fence) per column; None means "no bound on this side".
    clip_bounds = {
        'Popularity': (None, 90.5),
        'danceability': (0.09299999999999992, None),
        'loudness': (-16.102, None),
        'speechiness': (None, 0.15554999999999997),
        'instrumentalness': (None, 0.502368675),
        # Upper fence. The previous code compared with `<`, which lifted every
        # value below 0.4940875 up to it and so flattened most of the column.
        'liveness': (None, 0.4940875),
        'tempo': (None, 205.27212499999996),
        'duration_in min/ms': (35243.875, 382448.875),
    }
    for column, (lower, upper) in clip_bounds.items():
        df[column] = df[column].clip(lower=lower, upper=upper)

    return df

In [41]:
train = clean_data(train)
test = clean_data(test)

In [43]:
corr_matrix = train.corr()
corr_matrix["Class"].abs().sort_values(ascending=False)

In [44]:
train.drop(['Id','Class',"key"], axis=1, inplace=True)
test.drop(['Id',"key"], axis=1, inplace=True)

In [45]:
train

## Scale features
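Scaling data means bringing the values of all variables into a comparable range. MinMaxScaler() (i.e., rescaling all values to lie between 0 and 1) is the most widely used scaling technique. However, its range is set by the minimum and maximum, so a single extreme value can squash all the other values together. StandardScaler() instead centers every variable around 0 with a standard deviation of 1 and is less distorted by such values (for data with heavy outliers, RobustScaler() is the more robust choice). The steps are executed below.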

In [47]:
# Standardise the features with a single scaler shared by train and test.
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on the whole `train` frame here, while the
# train/validation split happens in a later cell, so the validation rows
# contribute to the fitted statistics — consider fitting after the split.
train = scaler.fit_transform(train)
# Apply the statistics fitted on `train` to `test` (no re-fitting).
test = scaler.transform(test)

In [48]:
train.shape, targets.shape, test.shape

In [49]:
X_train, X_test, y_train, y_test = train_test_split( train, targets, test_size=0.2, random_state=42,shuffle=True, stratify=targets)

# Modelling Algos

In [51]:
f1_score_result = []
models = ["DummyClassifier","LogisticRegression",'SGDClassifier' ,'RidgeClassifier', 'KNeighborsClassifier','SVC',
          "Naive Bayes",'RandomForestClassifier',
          'DecisionTreeClassifier','AdaBoostClassifier','BaggingClassifier','ExtraTreesClassifier','GradientBoostingClassifier',
          'HistGradientBoostingClassifier','XGBClassifier','LGBMClassifier'
         ]

$$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

In [52]:
def get_evalute(y_test, y_pred, f1_score_result):
    """Print evaluation metrics for one model and record its micro-F1.

    Prints accuracy, micro-averaged F1, precision and recall, followed by
    the full classification report.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    f1_score_result : list to which the micro-averaged F1 score is appended
        (the list is modified in place).

    Returns
    -------
    The same ``f1_score_result`` list, now one element longer.
    """
    print("accuracy_score", accuracy_score(y_test, y_pred))

    # Same value is printed and stored, so compute it a single time.
    micro_f1 = f1_score(y_test, y_pred, average="micro")
    print("f1_score", micro_f1)
    f1_score_result.append(micro_f1)

    for label, metric in (("precision_score", precision_score),
                          ("recall_score", recall_score)):
        print(label, metric(y_test, y_pred, average="micro"))

    print(classification_report(y_test, y_pred))
    return f1_score_result

In [53]:
clf = DummyClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [54]:
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
log_clf.fit(X_train, y_train)
y_pred = log_clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [55]:
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [56]:
clf = RidgeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [57]:
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [59]:
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [60]:
# Building Naive Bayes Classifier
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, y_train)
y_pred = naive_bayes_classifier.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)


In [61]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [62]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [63]:
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [64]:
clf = BaggingClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [65]:
clf = ExtraTreesClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [66]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [67]:
clf = HistGradientBoostingClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [None]:
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)
y_pred = xgbc.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [None]:
plot_importance(xgbc)

In [None]:
LGBMC = LGBMClassifier()
LGBMC.fit(X_train, y_train)
y_pred = LGBMC.predict(X_test)
f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [None]:
compare = pd.DataFrame({'Algorithms' : models , 'f1-score' : f1_score_result})
compare.sort_values(by='f1-score' ,ascending=False)

In [None]:
# Point plot of the F1 score per algorithm. `factorplot` and its `size`
# argument were renamed to `catplot` / `height` and later removed from
# seaborn; kind='point' reproduces factorplot's default plot type.
sns.catplot(x='Algorithms', y='f1-score', data=compare, kind='point', height=6, aspect=4)

# cross_val_score

In [74]:
# GBModel=GradientBoostingClassifier()
# GBModel_f1_scores = cross_val_score(GBModel, X_train, y_train,scoring="f1_micro", cv=5)
# # GBModel_f1_scores = np.sqrt(-GBModel_f1)
# print("scores:", GBModel_f1_scores)
# print("Mean:", GBModel_f1_scores.mean()) 

In [75]:
# HGBModel=HistGradientBoostingClassifier()
# HGBModel_f1_scores = cross_val_score(HGBModel, X_train, y_train,scoring="f1_micro", cv=5)
# # GBModel_f1_scores = np.sqrt(-GBModel_f1)
# print("scores:", HGBModel_f1_scores)
# print("Mean:", HGBModel_f1_scores.mean()) 

In [76]:
# LGBModel=LGBMClassifier()
# LGBModel_f1_scores = cross_val_score(LGBModel, X_train, y_train,scoring="f1_micro", cv=5)
# # GBModel_f1_scores = np.sqrt(-GBModel_f1)
# print("scores:", LGBModel_f1_scores)
# print("Mean:", LGBModel_f1_scores.mean()) 

In [77]:
# rfclf=RandomForestClassifier()
# rfclf_f1_scores = cross_val_score(rfclf, X_train, y_train,scoring="f1_micro", cv=5)
# # GBModel_f1_scores = np.sqrt(-GBModel_f1)
# print("scores:", rfclf_f1_scores)
# print("Mean:", rfclf_f1_scores.mean()) 

# Voting Classifiers

In [78]:
# log_clf = LogisticRegression(solver="lbfgs", random_state=42)
# rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# svm_clf = SVC(gamma="scale", random_state=42)
# GB_clf = GradientBoostingClassifier()
# voting_clf = VotingClassifier(
#     estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf),("GB",GB_clf)],
#     voting='hard')
# # voting_clf.fit(X_train, y_train)
# for clf in (log_clf, rnd_clf, svm_clf,GB_clf, voting_clf):
#     clf.fit(X_train, y_train)
#     y_pred = clf.predict(X_test)
#     print(clf.__class__.__name__, accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro"))

In [79]:
# log_clf = LogisticRegression(solver="lbfgs", random_state=42)
# rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# svm_clf = SVC(gamma="scale", probability=True, random_state=42)
# GB_clf = GradientBoostingClassifier()
# voting_clf = VotingClassifier(
#     estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf),("GB",GB_clf)],
#     voting='soft')
# # voting_clf.fit(X_train, y_train)
# # for clf in (log_clf, rnd_clf, svm_clf,GB_clf):
# #     clf.fit(X_train, y_train)
# #     y_pred = clf.predict(X_test)
# #     print(clf.__class__.__name__, accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro"))
# voting_clf.fit(X_train, y_train)
# y_pred = clf.predict_proba(X_test)
# print(clf.__class__.__name__, accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro"))

# Bagging and Pasting

In [80]:
# bag_DT_clf = BaggingClassifier(
#     DecisionTreeClassifier(), n_estimators=500,
#     max_samples=100, bootstrap=True, random_state=42)
# bag_DT_clf.fit(X_train, y_train)
# y_pred = bag_DT_clf.predict(X_test)
# accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro")

In [81]:
# bag_GB_clf = BaggingClassifier(
#     GradientBoostingClassifier(), n_estimators=500,
#     max_samples=100, bootstrap=True, random_state=42)
# bag_GB_clf.fit(X_train, y_train)
# y_pred = bag_GB_clf.predict(X_test)
# accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro")

In [82]:
# bag_HB_clf = BaggingClassifier(
#     HistGradientBoostingClassifier(), n_estimators=500,
#     max_samples=100, bootstrap=True, random_state=42)
# bag_HB_clf.fit(X_train, y_train)
# y_pred = bag_HB_clf.predict(X_test)
# accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro")

In [None]:
# bag_LGM_clf = BaggingClassifier(
#     LGBMClassifier(), n_estimators=500,
#     max_samples=100, bootstrap=True, random_state=42)
# bag_LGM_clf.fit(X_train, y_train)
# y_pred = bag_LGM_clf.predict(X_test)
# accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro")

# Out-of-Bag Evaluation

In [None]:
# bag_clf = BaggingClassifier(
#     DecisionTreeClassifier(), n_estimators=500,
#     bootstrap=True, oob_score=True, random_state=40)
# bag_clf.fit(X_train, y_train)
# bag_clf.oob_score_

In [None]:
# y_pred = bag_clf.predict(X_test)
# accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro")

In [None]:
# bag_clf = BaggingClassifier(
#     GradientBoostingClassifier(), n_estimators=500,
#     bootstrap=True, oob_score=True, random_state=40)
# bag_clf.fit(X_train, y_train)
# bag_clf.oob_score_ #0.5788468218131295

In [None]:
# y_pred = bag_clf.predict(X_test)
# accuracy_score(y_test, y_pred),f1_score(y_test,y_pred,average="micro")# 0.5711805555555556

# Boosting
AdaBoost

In [None]:
# ada_clf = AdaBoostClassifier(
#     DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1, random_state=42)
# ada_clf.fit(X_train, y_train)
# y_pred = ada_clf.predict(X_test)
# accuracy_score(y_test, y_pred)

Gradient Boosting


In [None]:
# gbrt = GradientBoostingClassifier(max_depth=1, n_estimators=100, learning_rate=0.1, random_state=42)
# gbrt.fit(X_train, y_train)
# y_pred = gbrt.predict(X_test)
# accuracy_score(y_test, y_pred) #0.5239583333333333

# Stacking

In [None]:
# from sklearn.pipeline import make_pipeline
# estimators = [
#     ('rf', GradientBoostingClassifier(n_estimators=10, random_state=42)),
#     ('svr', make_pipeline(StandardScaler(),
#                           LinearSVC(random_state=42)))
# ]
# clf = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier())
# clf.fit(X_train, y_train).score(X_test, y_test)

# GridSearchCV

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier

# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score
# from sklearn.metrics import make_scorer
# #creating Scoring parameter: 
# scoring = {'accuracy': make_scorer(accuracy_score),
#            'precision': make_scorer(precision_score),'recall':make_scorer(recall_score)}

# # A sample parameter

# parameters = {
#     "loss":["deviance"],
#     "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
#     "min_samples_split": np.linspace(0.1, 0.5, 12),
#     "min_samples_leaf": np.linspace(0.1, 0.5, 12),
#     "max_depth":[3,5,8],
#     "max_features":["log2","sqrt"],
#     "criterion": ["friedman_mse",  "mae"],
#     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     "n_estimators":[10]
#     }
# #passing the scoring function in the GridSearchCV
# clf = GridSearchCV(GradientBoostingClassifier(), parameters,scoring=scoring,refit=False,cv=2, n_jobs=-1)

# clf.fit(X_train, y_train)
# #converting the clf.cv_results to dataframe
# df=pd.DataFrame.from_dict(clf.cv_results_)
# # With cv=2 there are two splits (split0 and split1); show the per-split test scores
# df[['split0_test_accuracy','split1_test_accuracy','split0_test_precision','split1_test_precision','split0_test_recall','split1_test_recall']]

In [None]:
# df['accuracy_score']=(df['split0_test_accuracy']+df['split1_test_accuracy'])/2

# df.loc[df['accuracy_score'].idxmax()]['params']

In [None]:
# clf =GradientBoostingClassifier(criterion='mae',learning_rate=0.1,loss='deviance',max_depth= 5,max_features='sqrt',min_samples_leaf= 0.1,
#                                 min_samples_split= 0.42727272727272736,n_estimators=10,subsample=0.8)
# clf.fit(X_train, y_train)
# # correct_test = correct_data(test)
# # testX = correct_test[predictor].values
# result = clf.predict(X_test)
# accuracy_score(y_test, result)

In [None]:
# parameters = {'learning_rate': [0.01,0.05,0.1],
#                   'subsample'    : [0.9, 0.5, 0.2],
#                   'n_estimators' : [100,500,1000],
#                   'max_depth'    : [4,6,8]
#                  }
# GBC = GradientBoostingClassifier()
# grid_GBC = GridSearchCV(estimator=GBC, param_grid = parameters, cv = 5, n_jobs=-1)
# grid_GBC.fit(X_train, y_train)
# print(" Results from Grid Search")
# print("\n The best estimator across ALL searched params:\n",grid_GBC.best_estimator_)
# print("\n The best score across ALL searched params:\n",grid_GBC.best_score_)
# print("\n The best parameters across ALL searched params:\n",grid_GBC.best_params_)


#  Results from Grid Search

#  The best estimator across ALL searched params:
#  GradientBoostingClassifier(learning_rate=0.01, max_depth=4, n_estimators=500,
#                            subsample=0.5)

#  The best score across ALL searched params:
#  0.5738095991942875

#  The best parameters across ALL searched params:
#  {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.5}



In [None]:
# p_test3 = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 'n_estimators':[100,250,500,750,1000,1250,1500,1750]}

# tuning = GridSearchCV(estimator =GradientBoostingClassifier(max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10), 
#             param_grid = p_test3, scoring='accuracy',n_jobs=4, cv=5)
# tuning.fit(X_train,y_train)
# tuning.best_params_, tuning.best_score_

In [None]:
# p_test2 = {'max_depth':[2,3,4,5,6,7] }
# tuning = GridSearchCV(estimator =GradientBoostingClassifier(learning_rate=0.01,n_estimators=1500, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10), 
#             param_grid = p_test2, scoring='accuracy',n_jobs=4, cv=5)
# tuning.fit(X_train,y_train)
# tuning.best_estimator_, tuning.best_params_, tuning.best_score_

In [None]:
# parameters = {
#     "loss":["deviance"],
#     "learning_rate": [0.01, 0.05, 0.075, 0.1],
#     "min_samples_split": np.linspace(0.1, 0.5, 5),
#     "min_samples_leaf": np.linspace(0.1, 0.5, 5),
#     "max_depth":[3,5,8],
#     "max_features":["log2","sqrt"],
#     "criterion": ["friedman_mse",  "mae"],
#     "subsample":[0.5, 0.618, 0.8, 0.9, 0.95, 1.0],
#     "n_estimators":[500]
#     }

# clf = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1)

# clf.fit(X_train, y_train)
# print(clf.score(X_train, y_train))
# print(clf.best_params_)

In [None]:
# p_test4 = {'min_samples_split':[2,4,6,8,10,20,40,60,100], 'min_samples_leaf':[1,3,5,7,9]}

# tuning = GridSearchCV(estimator =GradientBoostingClassifier(learning_rate=0.01, n_estimators=1500,max_depth=4, subsample=1,max_features='sqrt', random_state=10), 
#             param_grid = p_test4, scoring='accuracy',n_jobs=4, cv=5)
# tuning.fit(X_train,y_train)
# tuning.best_params_, tuning.best_score_

In [None]:
# from sklearn import metrics
# from sklearn.metrics import roc_auc_score
# from sklearn.metrics import roc_curve 


# #run models
# baseline = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
# baseline.fit(X_train,y_train)

# y_pred = baseline.predict(X_test)
# print(accuracy_score(y_test, y_pred))

# model1 = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1500,max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
# model1.fit(X_train,y_train)

# y_pred = model1.predict(X_test)
# print(accuracy_score(y_test, y_pred))

# new=GradientBoostingClassifier(learning_rate=0.01, n_estimators=1500,max_depth=4, min_samples_split=40, min_samples_leaf=7,max_features=4 , subsample=0.95, random_state=10)
# new.fit(X_train,y_train)

# y_pred = new.predict(X_test)
# print(accuracy_score(y_test, y_pred))


In [None]:
# param_grid = {
#     'num_leaves': [31, 127],
#     'reg_alpha': [0.1, 0.5],
#     'min_data_in_leaf': [30, 50, 100, 300, 400],
#     'lambda_l1': [0, 1, 1.5],
#     'lambda_l2': [0, 1]
#     }

# lgb_estimator = LGBMClassifier()
# # lgb_estimator = LGBMClassifier()

# gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid,scoring="accuracy" ,cv=5)
# lgb_model = gsearch.fit(X=X_train, y=y_train)
# print(lgb_model.score(X_train, y_train))
# print(lgb_model.best_params_)
# print(lgb_model.best_score_)
# lgb_model.score(X_test, y_test)

In [None]:

# xgbc = LGBMClassifier(  lambda_l1= 1.5, lambda_l2= 0,min_data_in_leaf= 100, num_leaves= 31, reg_alpha= 0.1)
# xgbc.fit(X_train, y_train)
# y_pred = xgbc.predict(X_test)
# f1_score_result = get_evalute(y_test,y_pred,f1_score_result)

In [None]:
# gridParams = {
#     'learning_rate': [0.005, 0.01],
#     'n_estimators': [8,16,24],
#     'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
#     'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
#     'objective' : ['binary'],
#     'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
#     'random_state' : [500],
#     'colsample_bytree' : [0.64, 0.65, 0.66],
#     'subsample' : [0.7,0.75],
#     'reg_alpha' : [1,1.2],
#     'reg_lambda' : [1,1.2,1.4],
#     }
# # lgb_estimator = LGBMClassifier()

# grid = GridSearchCV(LGBMClassifier(), gridParams, verbose=1, cv=4, n_jobs=-1)
# # Run the grid
# grid.fit(X_train, y_train)

# # Print the best parameters found
# print(grid.best_params_)
# print(grid.best_score_)

In [None]:
# lgb=LGBMClassifier()
# #Define the parameters
# parameters = {'num_leaves':[20,40,60,80,100], 'min_child_samples':[5,10,15],'max_depth':[-1,5,10,20],
#              'learning_rate':[0.05,0.1,0.2],'reg_alpha':[0,0.01,0.03]}
# #Define the scoring
# clf=GridSearchCV(lgb,parameters,scoring='accuracy')
# clf.fit(X=X_train, y=y_train)
# print(clf.best_params_)
# predicted=clf.predict(X_test)
# print('Classification of the result is:')
# print(accuracy_score(y_test, predicted))
# # {'learning_rate': 0.05, 'max_depth': 5, 'min_child_samples': 10, 'num_leaves': 20, 'reg_alpha': 0.03}
# # Classification of the result is:
# # 0.5746527777777778

In [None]:
# clf = LGBMClassifier(learning_rate= 0.05, max_depth= -1, min_child_samples= 15, num_leaves= 20, reg_alpha= 0)
# clf.fit(X_train,y_train)
# predicted=clf.predict(X_test)
# print('Classification of the result is:')
# print(accuracy_score(y_test, predicted))# 0.5711805555555556 #0.575

In [None]:
# clf = LGBMClassifier(learning_rate= 0.05, max_depth= -1, min_child_samples= 15, num_leaves= 20, reg_alpha= 0)
# clf.fit(train,targets)

In [83]:
# Hold-out evaluation of the gradient-boosting configuration used for the
# final model. With subsample < 1 every boosting stage is fitted on a random
# fraction of the rows, so the seed is pinned to make the score repeatable.
clf = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=4,
    n_estimators=800,
    subsample=0.6,
    random_state=42,
)
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
print('Classification of the result is:')
# An earlier unseeded run scored 0.5788194444444444; the seeded value may differ.
print(accuracy_score(y_test, predicted))

In [None]:
# Refit the chosen configuration on `train` / `targets` (built earlier in the
# notebook; presumably the full labelled data - confirm) for the submission.
# subsample < 1 makes the fit stochastic, so the seed is pinned to keep the
# submitted model reproducible.
clf = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=4,
    n_estimators=800,
    subsample=0.6,
    random_state=42,
)
# Kept as the last expression so the notebook displays the fitted estimator.
clf.fit(train, targets)

In [None]:
# Predict a class for every row of `test` with the fitted classifier; the
# bare name on the last line makes the notebook display the predictions.
y_pred = clf.predict(test)
y_pred

In [None]:
# Display the dimensions of the predictions.
y_pred.shape

In [None]:
# Load the sample-submission template, put the predictions in its 'Class'
# column and save the result as the submission CSV.
# NOTE(review): these are Google Drive paths, whereas the data at the top of
# the notebook is read from ../input/ - confirm the intended environment.
sample_path = "/content/drive/MyDrive/ShAi/Task 11/music project/sample_submission (2).csv"
output_path = '/content/drive/MyDrive/ShAi/Task 11/music project/submission.csv'

submission_file = pd.read_csv(sample_path)
print(submission_file)  # template as loaded, before 'Class' is assigned
submission_file['Class'] = y_pred
submission_file.to_csv(output_path, index=False)