# Test d'utilisation de catboost pour la prédiction multiclasse des chansons

## 1. Dataset exploration

In [1]:
import pandas as pd
import os
from catboost import CatBoostClassifier, Pool
import numpy as np

print(os.getcwd())
df = pd.read_csv("../classificationDataset.csv", sep = ";")
print(len(df))

c:\Users\thoma\OneDrive - CentraleSupelec\NOPLP\code\ML
202389


In [2]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,id,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,deltadate20k,deltadate20kma,tauxchoisies,categorie,Chanson_id
41847,26,91,Vous permettez Monsieur,1964,1960,Adamo Salvatore,2,59.0,,,318.0,32.0,0.333333,,
114353,923,1526,Emma,1998,1990,Matmatah,2,47.0,,,47.0,47.0,0.333333,,
153594,2107,2340,Forrest,2021,2020,Soprano,3,,,,,35.0,0.0,,
130247,2026,1108,Puisque vous partez en voyage,2000,2000,Hardy Françoise,3,112.0,,120.0,,43.0,1.0,,
71702,484,822,C'est extra,1969,1960,Ferré Léo,2,79.0,,,245.0,182.0,0.25,,


On vire les colonnes qui ne servent à rien

In [3]:
df = df.drop(columns = ['Unnamed: 0', 'id', 'Chanson_id'])

On remplace les labels par des entiers

In [4]:
categories = {1: '50', 2: '40', 3: '30', 4: '20', 5: '10', 6: 'MC', 7: '20k'}
reversed_cat = {val: key for (key, val) in categories.items()}
reversed_cat[None] = 8
reversed_cat

{'50': 1, '40': 2, '30': 3, '20': 4, '10': 5, 'MC': 6, '20k': 7, None: 8}

In [5]:
df = df.replace({'categorie': reversed_cat})
df.sample(5)

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,deltadate20k,deltadate20kma,tauxchoisies,categorie
136140,Il en faut peu pour être heureux,1967,1960,Disney,4,65.0,,222.0,204.0,103.0,0.4,8
188361,Merci patron,1971,1970,Les Charlots,3,281.0,,,281.0,281.0,1.0,8
35812,La mauvaise réputation,1952,1950,Brassens Georges,3,202.0,,234.0,317.0,35.0,0.5,8
152214,Les filles d'aujourd'hui,2016,2010,Jonathan Joyce,0,216.0,132.0,132.0,,58.0,0.666667,8
55860,Frérot,2017,2010,Black M,3,173.0,,,,47.0,0.0,8


# 2. Test de catboost

In [21]:
from catboost import CatBoostClassifier, Pool
import numpy as np

In [22]:
# initialize data
train_data = np.random.randint(0,
                               100, 
                               size=(100, 10))

train_labels = np.random.randint(0,
                                 2,
                                 size=(100))

test_data = catboost_pool = Pool(train_data, 
                                 train_labels)

In [23]:
model = CatBoostClassifier(iterations=2,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)
# train the model
model.fit(train_data, train_labels)

0:	learn: 0.6434087	total: 145ms	remaining: 145ms
1:	learn: 0.6300723	total: 146ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x297e2e77788>

In [24]:
# make the prediction using the resulting model
preds_class = model.predict(test_data)
preds_proba = model.predict_proba(test_data)
print("class = ", preds_class)
print("proba = ", preds_proba)

class =  [1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 1 1 0 1 1 0
 1 0 0 0 0 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1]
proba =  [[0.37217929 0.62782071]
 [0.37217929 0.62782071]
 [0.69124534 0.30875466]
 [0.69124534 0.30875466]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.5575301  0.4424699 ]
 [0.69124534 0.30875466]
 [0.37074373 0.62925627]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.24902271 0.75097729]
 [0.69124534 0.30875466]
 [0.37217929 0.62782071]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.69124534 0.30875466]
 [0.51297829 0.48702171]
 [0.37217929 0.62782071]
 [0.37074373 0.62925627]
 [0.51297829 0.48702171]
 [0.5575301  0.4424699 ]
 [0.24902271 0.75097729]
 [0.37074373 0.62925627]
 [0.37217929 0.62782071]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0

# 3. Entrainement du catboost

In [6]:
df.dtypes

titre              object
année               int64
decennie            int64
artiste            object
clusterid           int64
deltadate         float64
deltadatemc       float64
deltadatemcma     float64
deltadate20k      float64
deltadate20kma    float64
tauxchoisies      float64
categorie           int64
dtype: object

Split Train / Test

In [7]:
train = df[0:int(len(df)*0.8)]
print(len(train))
test = df[int(len(df)*0.8)+1:]
print(len(test))

train_labels = train['categorie']
train = train.drop(columns = ['categorie'])
train_data = train
test_labels = test['categorie']
test = test.drop(columns = ['categorie'])
test_data = test

test_pool = Pool(test_data, 
                 test_labels,
                 cat_features = ['titre', 'artiste'])

161911
40477


Entrainement

In [8]:
model = CatBoostClassifier(iterations=10,
                           depth=10,
                           learning_rate=1,
                           loss_function='MultiClass',
                           verbose=True,
                           class_weights = [0.025, 0.025, 0.025, 0.025, 0.025, 0.425, 0.425, 0.025])
# train the model
model.fit(train_data, train_labels, cat_features = ['titre', 'artiste'])

0:	learn: 0.3473378	total: 2.68s	remaining: 24.1s
1:	learn: 8.8102078	total: 5.45s	remaining: 21.8s
2:	learn: 64.2975698	total: 7.65s	remaining: 17.9s
3:	learn: 62.8119670	total: 9.93s	remaining: 14.9s
4:	learn: 60.2285456	total: 12.2s	remaining: 12.2s
5:	learn: 56.9167166	total: 14.4s	remaining: 9.63s
6:	learn: 54.4702419	total: 16.8s	remaining: 7.19s
7:	learn: 52.4471920	total: 19.5s	remaining: 4.87s
8:	learn: 42.1855904	total: 21.9s	remaining: 2.43s
9:	learn: 39.6030387	total: 24.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x20ffa099688>

In [9]:
# make the prediction using the resulting model
preds_class = model.predict(test_pool)
preds_proba = model.predict_proba(test_pool)

In [10]:
model.feature_names_

['titre',
 'année',
 'decennie',
 'artiste',
 'clusterid',
 'deltadate',
 'deltadatemc',
 'deltadatemcma',
 'deltadate20k',
 'deltadate20kma',
 'tauxchoisies']

In [11]:
model.feature_importances_

array([0.00000000e+00, 4.62569302e+00, 3.73805086e-06, 0.00000000e+00,
       3.72375527e+01, 1.34286662e+01, 7.29261579e-01, 2.02328738e+00,
       7.27206843e+00, 1.23448677e+01, 2.23385992e+01])

Sauvegarde du modèle

In [12]:
model.save_model("catboostModel.cbm",
           format="cbm",
           export_parameters=None,
           pool=None)

In [13]:
test_data["pred"] = preds_class
test_data["labels"] = test_labels
test_data[test_data['pred'] != 8].sample(10)

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,deltadate20k,deltadate20kma,tauxchoisies,pred,labels
176005,Aujourd'hui peut-être,1945,1940,Sardou Fernand,3,203.0,,,409.0,409.0,0.333333,7,8
170672,La groupie du pianiste,1980,1980,Berger Michel,4,47.0,367.0,83.0,,64.0,1.0,7,8
173540,Il tape sur des bambous,1982,1980,Lavil Philippe,4,79.0,,,79.0,42.0,0.4,7,8
202154,Immortelle,2002,2000,Fabian Lara,4,89.0,,35.0,471.0,80.0,0.571429,7,8
184850,La peine maximum,2000,2000,Les Dix Commandements,0,113.0,412.0,146.0,,242.0,0.4,1,8
187678,Harley Davidson,1967,1960,Bardot Brigitte,0,55.0,94.0,94.0,,216.0,0.6,7,8
186230,Le dilemme,2000,2000,Les Dix Commandements,0,92.0,147.0,147.0,,243.0,1.0,7,8
182459,La ballade de Jim,1986,1980,Souchon Alain,0,105.0,,96.0,381.0,31.0,0.5,7,8
166174,Dis-moi,2007,2000,BB Brunes,4,48.0,128.0,128.0,,206.0,0.8,3,8
194946,La dernière séance,1977,1970,Mitchell Eddy,1,38.0,411.0,53.0,,42.0,1.0,6,8


In [14]:
import sklearn.metrics as skl

skl.confusion_matrix(test_data['labels'], test_data['pred'])

array([[    0,     0,     0,     0,     0,     0,     4,    72],
       [    0,     0,     0,     0,     0,     5,     3,    68],
       [    0,     0,     0,     0,     0,     3,     2,    69],
       [    0,     0,     1,     1,     0,     1,     2,    66],
       [    0,     0,     0,     0,     0,     0,     0,     4],
       [    0,     0,     0,     0,     0,     1,     2,    36],
       [    0,     0,     0,     0,     0,     0,     0,    76],
       [    9,    44,    16,    33,     0,   318,   747, 38894]],
      dtype=int64)

On regarde les chansons qui d'après CatBoost avaient le plus de chances de tomber en même chanson

In [15]:
test_data['probaMC'] = preds_proba[:,-3]
test_data['probaPassePas'] = preds_proba[:,-1]
test_data.sort_values(by = 'probaMC', ascending = False)[1:20]

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,deltadate20k,deltadate20kma,tauxchoisies,pred,labels,probaMC,probaPassePas
179931,J'ai tout oublié,2001,2000,Lavoine Marc,0,60.0,179.0,73.0,,78.0,0.8,6,8,1.0,9.084493e-30
189607,Nougayork,1987,1980,Nougaro Claude,0,47.0,97.0,51.0,47.0,47.0,0.571429,6,8,1.0,2.957758e-21
191730,Nougayork,1987,1980,Nougaro Claude,0,48.0,98.0,52.0,48.0,48.0,0.571429,6,8,1.0,1.420437e-21
184314,Tatoue-moi,2008,2000,Mozart l'Opéra Rock,4,35.0,74.0,74.0,,272.0,0.8,6,8,1.0,8.635193e-18
186436,Tatoue-moi,2008,2000,Mozart l'Opéra Rock,4,36.0,75.0,75.0,,273.0,0.8,6,8,1.0,8.635193e-18
175118,Tata Yoyo,1980,1980,Cordy Annie,0,37.0,173.0,173.0,,59.0,0.6,6,8,1.0,7.198784e-09
183398,Quelques mots d'amour,1980,1980,Berger Michel,0,47.0,120.0,90.0,,71.0,0.428571,6,8,1.0,1.210919e-08
169716,Bidon,1976,1970,Souchon Alain,0,47.0,118.0,89.0,,39.0,0.333333,6,8,1.0,1.210919e-08
185520,Quelques mots d'amour,1980,1980,Berger Michel,0,48.0,121.0,91.0,,72.0,0.428571,6,8,1.0,1.630981e-08
171836,Bidon,1976,1970,Souchon Alain,0,48.0,119.0,90.0,,40.0,0.333333,6,8,1.0,1.630981e-08


Pareil avec les chansons qui ont le plus de chances de tomber pour 20k

In [16]:
test_data['proba20k'] = preds_proba[:,-2]
test_data['probaPassePas'] = preds_proba[:,-1]
test_data.sort_values(by = 'proba20k', ascending = False)[1:20]

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,deltadate20k,deltadate20kma,tauxchoisies,pred,labels,probaMC,probaPassePas,proba20k
194001,Où je vais,2009,2000,Bent Amel,1,80.0,,214.0,275.0,47.0,0.555556,7,8,4.062722e-162,2.225074e-308,1.0
181266,Où je vais,2009,2000,Bent Amel,1,73.0,,207.0,268.0,40.0,0.555556,7,8,4.062722e-162,2.225074e-308,1.0
191878,Où je vais,2009,2000,Bent Amel,1,78.0,,212.0,273.0,45.0,0.555556,7,8,4.062722e-162,2.225074e-308,1.0
174493,Confidence pour confidence,1981,1980,Schultheis Jean,1,50.0,,,270.0,270.0,0.571429,7,8,2.20694e-46,4.983833e-89,1.0
184137,Les rues de ma peine,2018,2010,Amir,1,49.0,,57.0,152.0,77.0,0.571429,7,8,2.715925e-46,7.019849000000001e-43,1.0
198247,Où je vais,2009,2000,Bent Amel,1,82.0,,32.0,277.0,49.0,0.555556,7,8,3.72099e-210,2.225074e-308,1.0
169749,Je n'ai que mon âme,2001,2000,St-Pier Natasha,1,109.0,39.0,39.0,381.0,201.0,0.25,7,8,8.775925e-71,2.356015e-133,1.0
183388,Où je vais,2009,2000,Bent Amel,1,74.0,,208.0,269.0,41.0,0.555556,7,8,4.062722e-162,2.225074e-308,1.0
186662,Dans un vieux rock'n'roll,1976,1970,Sheller William,2,46.0,,,212.0,212.0,0.2,7,8,1.812343e-40,9.528568e-79,1.0
189342,Confidence pour confidence,1981,1980,Schultheis Jean,1,58.0,,,278.0,278.0,0.571429,7,8,2.20694e-46,4.983833e-89,1.0


## 4. Prédiction sur un nouveau dataset

In [17]:
import util
dateSimule = "2022-07-14"
# Requête
print("Querying data...")
conn, cur = util.connexion()
df = pd.read_sql_query(
    "SELECT * FROM public.\"GenereDatasetClassif\"('{0}', ('{0}'::date - INTERVAL'30 day')::date) WHERE \"année\" <= {1}".format(dateSimule, dateSimule[0:4]), con=conn)


Querying data...


In [18]:
# Data processing
print("Processing data...")
df = df.drop(columns=['id', 'Chanson_id'])
reversed_cat = {'50': 1, '40': 2, '30': 3,
                '20': 4, '10': 5, 'MC': 6, '20k': 7, None: 8}
df = df.replace({'categorie': reversed_cat})
test_labels = df['categorie']
test_data = df.drop(columns=['categorie'])
test_pool = Pool(test_data,
                    test_labels,
                    cat_features=['titre', 'artiste'])

Processing data...


In [19]:
# Chargement du modèle
print("Model loading...")
from_file = CatBoostClassifier()
model = from_file.load_model("catboostModel.cbm", format="cbm")

Model loading...


In [20]:
# Prédiction
print("Predicting probabilities...")
test_data["pred"] = model.predict(test_pool)
preds_proba = model.predict_proba(test_pool)
test_data["proba50"] = preds_proba[:,0]
test_data["proba40"] = preds_proba[:,1]
test_data["proba30"] = preds_proba[:,2]
test_data["proba20"] = preds_proba[:,3]
test_data["proba10"] = preds_proba[:,4]
test_data["probaMC"] = preds_proba[:,5]
test_data["proba20k"] = preds_proba[:,6]
test_data["probaPP"] = preds_proba[:,7]

Predicting probabilities...


In [28]:
test_data[test_data['artiste'] == 'Calogero'].sort_values('probaMC', ascending = False)

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,deltadate20k,deltadate20kma,tauxchoisies,pred,proba50,proba40,proba30,proba20,proba10,probaMC,proba20k,probaPP
1565,Si seulement je pouvais lui manquer,2004,2000,Calogero,3,150.0,510.0,76.0,395.0,80.0,1.0,6,0.006827723,0.0104333,0.006910717,0.007308144,0.007307111,0.8041064,0.007121232,0.149985
2094,Un jour au mauvais endroit,2014,2010,Calogero,4,252.0,180.0,76.0,440.0,80.0,0.714286,8,0.0007607442,0.001162478,0.0007699914,0.0008142727,0.0008141575,0.08959345,0.000793447,0.905291
266,C'est dit,2009,2000,Calogero,0,85.0,230.0,76.0,,80.0,0.555556,8,0.0005085153,9.828549e-05,0.0003611474,0.0001253227,7.959325e-05,0.01302668,0.003356442,0.982444
1554,Je joue de la musique,2017,2010,Calogero,0,63.0,460.0,76.0,,80.0,0.75,8,0.0001660484,8.346491e-05,0.0003167599,0.0001493813,6.271194e-05,0.009896434,0.009280696,0.980045
253,On fait comme si,2020,2020,Calogero,3,80.0,,76.0,80.0,80.0,0.25,8,0.02340516,0.006883104,0.003763062,0.003770626,0.001245668,0.0009670814,0.01938913,0.940576
248,Avant toi (Calogero),2015,2010,Calogero,3,,,76.0,,80.0,0.0,8,0.0001187914,7.519541e-05,0.0001242789,0.0005281415,5.59108e-05,5.178494e-05,8.092414e-05,0.998965
254,Prendre racine,2003,2000,Calogero,4,99.0,,76.0,,80.0,0.461538,8,0.0006814597,3.374002e-05,0.0003070794,0.0001278363,2.375846e-05,2.274564e-05,0.0005973608,0.998206
95,1987,2018,2010,Calogero,3,147.0,,76.0,218.0,80.0,0.666667,8,3.621689e-05,4.578763e-05,5.442032e-05,9.488173e-06,9.059796e-06,9.907921e-06,0.0001438007,0.999691
268,Yalla,2004,2000,Calogero,0,112.0,76.0,76.0,703.0,80.0,0.909091,8,1.505607e-05,3.616232e-05,4.451166e-05,1.059473e-05,6.514583e-06,1.275034e-06,1.073242e-06,0.999885
96,Pomme C,2007,2000,Calogero,1,59.0,269.0,76.0,538.0,80.0,0.75,8,0.000406914,6.352058e-05,5.146483e-05,0.0003060679,4.624551e-05,1.243703e-07,0.00662609,0.9925
