# Test d'utilisation de catboost pour la prédiction multiclasse des chansons

## 1. Dataset exploration

In [20]:
import pandas as pd
import os
from catboost import CatBoostClassifier, Pool
import numpy as np

print(os.getcwd())
df = pd.read_csv("../classificationDataset.csv", sep = ";")
print(len(df))

c:\Users\thoma\OneDrive - CentraleSupelec\NOPLP\code\ML
94120


In [3]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,id,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,tauxchoisies,categorie,Chanson_id
2844,860,1431,Les nuits parisiennes,1997,1990,Louise Attaque,3,66.0,,,0.166667,,
91854,1778,2309,1 + 1,2021,2020,Amir,1,,,40.0,0.0,,
5276,1306,2298,À la santé des gens que j'aime,2021,2020,Bruel Patrick,1,,,63.0,0.0,,
20871,1001,1763,Sympathique,1997,1990,Pink Martini,4,54.0,,,0.25,,
63444,1596,1205,Partons vite,2006,2000,Kaolin,4,59.0,,,0.428571,,


On vire les colonnes qui ne servent à rien

In [21]:
df = df.drop(columns = ['Unnamed: 0', 'id', 'Chanson_id'])

On remplace les labels par des entiers

In [16]:
categories = {1: '50', 2: '40', 3: '30', 4: '20', 5: '10', 6: 'MC', 7: '20k'}
reversed_cat = {val: key for (key, val) in categories.items()}
reversed_cat[None] = 8
reversed_cat

{'50': 1, '40': 2, '30': 3, '20': 4, '10': 5, 'MC': 6, '20k': 7, None: 8}

In [22]:
df = df.replace({'categorie': reversed_cat})
df.sample(5)

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,tauxchoisies,categorie
38893,Le petit bonhomme en mousse,1999,1990,Sébastien Patrick,4,91.0,,240.0,0.5,8
2044,L'épervier,1966,1960,Aufray Hugues,1,298.0,,,0.0,8
39006,Diabolo menthe,1978,1970,Simon Yves,0,61.0,,,0.5,8
41703,Le temps qui court,1975,1970,Chamfort Alain,2,76.0,37.0,37.0,1.0,8
64671,Be bop a lula 1960,1960,1960,Les Chaussettes Noires,1,90.0,,,0.0,8


# 2. Test de catboost

In [21]:
from catboost import CatBoostClassifier, Pool
import numpy as np

In [22]:
# initialize data
train_data = np.random.randint(0,
                               100, 
                               size=(100, 10))

train_labels = np.random.randint(0,
                                 2,
                                 size=(100))

test_data = catboost_pool = Pool(train_data, 
                                 train_labels)

In [23]:
model = CatBoostClassifier(iterations=2,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)
# train the model
model.fit(train_data, train_labels)

0:	learn: 0.6434087	total: 145ms	remaining: 145ms
1:	learn: 0.6300723	total: 146ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x297e2e77788>

In [24]:
# make the prediction using the resulting model
preds_class = model.predict(test_data)
preds_proba = model.predict_proba(test_data)
print("class = ", preds_class)
print("proba = ", preds_proba)

class =  [1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 1 1 0 1 1 0
 1 0 0 0 0 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1]
proba =  [[0.37217929 0.62782071]
 [0.37217929 0.62782071]
 [0.69124534 0.30875466]
 [0.69124534 0.30875466]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.5575301  0.4424699 ]
 [0.69124534 0.30875466]
 [0.37074373 0.62925627]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.24902271 0.75097729]
 [0.69124534 0.30875466]
 [0.37217929 0.62782071]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0.69124534 0.30875466]
 [0.51297829 0.48702171]
 [0.37217929 0.62782071]
 [0.37074373 0.62925627]
 [0.51297829 0.48702171]
 [0.5575301  0.4424699 ]
 [0.24902271 0.75097729]
 [0.37074373 0.62925627]
 [0.37217929 0.62782071]
 [0.51297829 0.48702171]
 [0.51297829 0.48702171]
 [0

# 3. Entrainement du catboost

In [43]:
df.dtypes

titre             object
année              int64
decennie           int64
artiste           object
clusterid          int64
deltadate        float64
deltadatemc      float64
deltadatemcma    float64
tauxchoisies     float64
categorie          int64
dtype: object

Split Train / Test

In [23]:
train = df[0:int(len(df)*0.8)]
print(len(train))
test = df[int(len(df)*0.8)+1:]
print(len(test))

train_labels = train['categorie']
train = train.drop(columns = ['categorie'])
train_data = train
test_labels = test['categorie']
test = test.drop(columns = ['categorie'])
test_data = test

test_pool = Pool(test_data, 
                 test_labels,
                 cat_features = ['titre', 'artiste'])

75296
18823


Entrainement

In [24]:
model = CatBoostClassifier(iterations=10,
                           depth=10,
                           learning_rate=1,
                           loss_function='MultiClass',
                           verbose=True,
                           class_weights = [0.025, 0.025, 0.025, 0.025, 0.025, 0.425, 0.425, 0.025])
# train the model
model.fit(train_data, train_labels, cat_features = ['titre', 'artiste'])

0:	learn: 0.3327647	total: 729ms	remaining: 6.56s
1:	learn: 6.5389543	total: 1.47s	remaining: 5.87s
2:	learn: 41.4145241	total: 2.16s	remaining: 5.04s
3:	learn: 40.9567222	total: 2.8s	remaining: 4.2s
4:	learn: 38.2374120	total: 3.53s	remaining: 3.53s
5:	learn: 35.7111703	total: 4.36s	remaining: 2.9s
6:	learn: 31.6559397	total: 5.18s	remaining: 2.22s
7:	learn: 29.5200391	total: 6.13s	remaining: 1.53s
8:	learn: 28.6094637	total: 6.99s	remaining: 776ms
9:	learn: 26.7708844	total: 7.99s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1fa56d771c8>

In [25]:
# make the prediction using the resulting model
preds_class = model.predict(test_pool)
preds_proba = model.predict_proba(test_pool)

In [134]:
model.feature_names_

['titre',
 'année',
 'decennie',
 'artiste',
 'clusterid',
 'deltadate',
 'deltadatemc',
 'deltadatemcma',
 'tauxchoisies']

In [26]:
model.feature_importances_

array([0.00000000e+00, 4.25729136e+00, 5.28748881e-06, 0.00000000e+00,
       2.37946114e+01, 7.47070647e+00, 2.60387801e+00, 2.03597102e+01,
       4.15137973e+01])

Sauvegarde du modèle

In [27]:
model.save_model("catboostModel.cbm",
           format="cbm",
           export_parameters=None,
           pool=None)

In [28]:
test_data["pred"] = preds_class
test_data["labels"] = test_labels
test_data[test_data['pred'] != 8].sample(10)

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,tauxchoisies,pred,labels
87190,Je viens du sud,1981,1980,Sardou Michel,0,155.0,273.0,119.0,0.333333,7,8
92087,Le chant des sirènes,2014,2010,Fréro Delavega,3,51.0,,,0.875,1,8
77540,Scandale dans la famille,1965,1960,Les Surfs,1,88.0,,,0.5,7,8
81623,Besoin de personne,1972,1970,Sanson Véronique,2,127.0,74.0,74.0,0.666667,7,8
85098,La Seine,2011,2010,Paradis Vanessa,2,161.0,87.0,76.0,0.8,7,8
79431,Ca va ça va,2016,2010,Capéo Claudio,2,189.0,91.0,33.0,0.666667,7,8
90638,Elle a fait un bébé toute seule,1987,1980,Goldman Jean-Jacques,1,402.0,96.0,51.0,1.0,6,8
90150,Je suis de celles,2003,2000,Bénabar,0,48.0,,87.0,0.4,7,8
83703,"Besoin de rien, envie de toi",1984,1980,Peter et Sloane,3,226.0,98.0,98.0,1.0,7,8
79781,Voilà l'été,1989,1980,Les Négresses Vertes,4,40.0,,,0.166667,7,8


In [29]:
import sklearn.metrics as skl

skl.confusion_matrix(test_data['labels'], test_data['pred'])

array([[    0,     0,     0,     0,     0,     0,     2,    35],
       [    0,     0,     0,     0,     0,     0,     2,    34],
       [    0,     0,     0,     0,     0,     0,     3,    31],
       [    0,     0,     0,     0,     0,     1,     0,    33],
       [    0,     0,     0,     0,     0,     0,     0,     2],
       [    0,     0,     0,     0,     0,     0,     1,    18],
       [    0,     0,     0,     0,     0,     0,     1,    36],
       [   17,     1,     2,     0,     0,   253,   367, 17984]],
      dtype=int64)

On regarde les chansons qui d'après CatBoost avaient le plus de chances de tomber en même chanson

In [30]:
test_data['probaMC'] = preds_proba[:,-3]
test_data['probaPassePas'] = preds_proba[:,-1]
test_data.sort_values(by = 'probaMC', ascending = False)[1:20]

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,tauxchoisies,pred,labels,probaMC,probaPassePas
77614,Argent trop cher,1980,1980,Téléphone,4,35.0,,68.0,0.8,6,8,0.999994,7.589567e-07
78205,Cœur de rocker,1982,1980,Clerc Julien,4,35.0,,61.0,0.6,6,8,0.999994,7.589567e-07
84451,Pourvu qu'elles soient douces,1988,1980,Farmer Mylène,4,35.0,,62.0,0.666667,6,8,0.999994,7.589567e-07
86622,Il suffira d'un signe,1981,1980,Goldman Jean-Jacques,3,35.0,,49.0,0.8,6,8,0.999994,8.085634e-07
93209,L'école est finie,1963,1960,Sheila,4,35.0,,52.0,0.833333,6,8,0.999991,1.420458e-06
93526,Dans les yeux d'Emilie,1977,1970,Dassin Joe,3,35.0,,49.0,0.571429,6,8,0.999991,1.513301e-06
82654,Pas d'ami (comme toi),1991,1990,Eicher Stephan,4,35.0,,45.0,0.571429,6,8,0.99999,1.021686e-06
84210,Encore et encore,1985,1980,Cabrel Francis,3,35.0,,33.0,0.833333,6,8,0.999988,3.085708e-06
89527,J'veux du soleil,1991,1990,Au P'tit Bonheur,3,35.0,,,0.714286,6,8,0.999987,3.322295e-06
92740,Ma gueule,1979,1970,Hallyday Johnny,4,35.0,,32.0,0.666667,6,8,0.999984,3.267805e-06


Pareil avec les chansons qui ont le plus de chances de tomber pour 20k

In [31]:
test_data['proba20k'] = preds_proba[:,-2]
test_data['probaPassePas'] = preds_proba[:,-1]
test_data.sort_values(by = 'proba20k', ascending = False)[1:20]

Unnamed: 0,titre,année,decennie,artiste,clusterid,deltadate,deltadatemc,deltadatemcma,tauxchoisies,pred,labels,probaMC,probaPassePas,proba20k
87527,Tout l'or des hommes,2003,2000,Dion Céline,0,55.0,,45.0,0.2,7,8,1.529706e-31,5.117831e-42,1.0
81004,La mouche,1972,1970,Polnareff Michel,0,134.0,,84.0,0.5,7,8,6.4919e-36,1.998626e-65,1.0
81002,Holidays,1972,1970,Polnareff Michel,0,197.0,,84.0,0.5,7,8,1.1611019999999999e-35,5.109843e-65,1.0
77427,Tout l'or des hommes,2003,2000,Dion Céline,0,49.0,,39.0,0.2,7,8,1.529706e-31,5.117831e-42,1.0
85839,Rodéo,2004,2000,Zazie,0,47.0,,60.0,0.0,7,8,2.0742320000000002e-23,3.96933e-34,1.0
81402,Le jour d'après,2004,2000,Badi Chimène,0,45.0,,57.0,0.0,7,8,2.0742320000000002e-23,3.96933e-34,1.0
80052,Y'a une fille qu'habite chez moi,2001,2000,Bénabar,0,35.0,,81.0,0.285714,7,8,5.624972e-24,2.488387e-43,1.0
93925,Rodéo,2004,2000,Zazie,0,51.0,,64.0,0.0,7,8,9.272586e-24,3.2996519999999998e-34,1.0
82269,Seul au monde,2004,2000,Corneille,0,45.0,,57.0,0.4,7,8,2.0742320000000002e-23,3.96933e-34,1.0
85506,Tout l'or des hommes,2003,2000,Dion Céline,0,54.0,,44.0,0.2,7,8,1.529706e-31,5.117831e-42,1.0


## 4. Prédiction sur un nouveau dataset

In [32]:
import util
dateSimule = "2022-06-25"
# Requête
print("Querying data...")
conn, cur = util.connexion()
df = pd.read_sql_query(
    "SELECT * FROM public.\"GenereDatasetClassif\"('{0}', ('{0}'::date - INTERVAL'30 day')::date) WHERE \"année\" <= {1}".format(dateSimule, dateSimule[0:4]), con=conn)


Querying data...


In [33]:
# Data processing
print("Processing data...")
df = df.drop(columns=['id', 'Chanson_id'])
reversed_cat = {'50': 1, '40': 2, '30': 3,
                '20': 4, '10': 5, 'MC': 6, '20k': 7, None: 8}
df = df.replace({'categorie': reversed_cat})
test_labels = df['categorie']
test_data = df.drop(columns=['categorie'])
test_pool = Pool(test_data,
                    test_labels,
                    cat_features=['titre', 'artiste'])

Processing data...


In [36]:
# Chargement du modèle
print("Model loading...")
from_file = CatBoostClassifier()
model = from_file.load_model("catboostModel.cbm", format="cbm")

Model loading...


In [38]:
# Prédiction
print("Predicting probabilities...")
test_data["pred"] = model.predict(test_pool)
preds_proba = model.predict_proba(test_pool)
test_data["proba50"] = preds_proba[:,0]
test_data["proba40"] = preds_proba[:,1]
test_data["proba30"] = preds_proba[:,2]
test_data["proba20"] = preds_proba[:,3]
test_data["proba10"] = preds_proba[:,4]
test_data["probaMC"] = preds_proba[:,5]
test_data["proba20k"] = preds_proba[:,6]
test_data["probaPP"] = preds_proba[:,7]

Predicting probabilities...
