In [12]:
import os

import mysql.connector
import pandas as pd

from datetime import datetime

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [13]:
start = datetime.now()
print("start = ", start)
try:
    
    mydb = mysql.connector.connect(
        host=os.environ['MYSQL_SPOTIFY_HOST'],
        user=os.environ['MYSQL_SPOTIFY_USER'],
        password=os.environ['MYSQL_SPOTIFY_PW'],
        database="wilts_songs"
    )

    mycursor = mydb.cursor()
    mycursor.execute("""SELECT * FROM songs;""")

    data = mycursor.fetchall()
except mysql.connector.Error as error:
    print("Failed to insert into MySQL table {}".format(error))

finally:
    if mydb.is_connected():
        mycursor.close()
        mydb.close()
        print("MySQL connection is closed")
    end = datetime.now()
    print("finish = ", end)
    print("duration = ", (end-start).total_seconds())

start =  2022-03-14 14:38:04.877170
MySQL connection is closed
finish =  2022-03-14 14:38:04.956416
duration =  0.079246


In [14]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [15]:
columns = ['id', 'track_name', 'artist_name', 'popularity', 'duration_ms',
        'danceability', 'time_signature', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo']

nominal_cols = ['id', 'track_name', 'artist_name']
discret_cols = ['key', 'mode']
continuous_cols = ['popularity', 'duration_ms', 'danceability', 'time_signature', 'loudness', 'speechiness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'acousticness*energy'] 
useful_cols = discret_cols + continuous_cols

In [16]:
df = pd.DataFrame(data, columns=columns)

In [17]:
df['acousticness*energy']=df['acousticness']*df['energy']

On retire les colonnes energy et acousticness des colonnes utilisées pour le modèle et on ajoute acousticness*energy vu les conclusions de l'analyse de la donnée.

In [18]:
df=df[useful_cols]

In [19]:
df.describe()

Unnamed: 0,key,mode,popularity,duration_ms,danceability,time_signature,loudness,speechiness,instrumentalness,liveness,valence,tempo,acousticness*energy
count,18209.0,18209.0,18209.0,18209.0,18209.0,18209.0,18209.0,18209.0,18209.0,18209.0,18209.0,18209.0,18209.0
mean,5.239167,0.70542,53.534681,234924.2,0.570275,3.913065,-9.506448,0.071309,0.097785,0.198077,0.560166,120.248642,0.134408
std,3.521069,0.455866,19.853799,90384.62,0.166637,0.371238,4.475365,0.079428,0.244145,0.173292,0.251719,28.945378,0.129817
min,0.0,0.0,0.0,30622.0,0.0,0.0,-47.07,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,47.0,181133.0,0.458,4.0,-11.93,0.033,0.0,0.0918,0.362,97.867,0.02038
50%,5.0,1.0,57.0,223079.0,0.578,4.0,-8.755,0.0422,4.9e-05,0.129,0.569,119.397,0.101432
75%,8.0,1.0,67.0,270333.0,0.689,4.0,-6.275,0.0675,0.0091,0.252,0.771,136.837,0.215344
max,11.0,1.0,100.0,1644773.0,0.988,5.0,0.899,0.954,1.0,0.997,0.993,243.372,0.877184


In [20]:
numeric_pipeline = Pipeline(steps=[
    ('scale', MinMaxScaler())
])

In [21]:
full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, useful_cols),
])



In [32]:
from sklearn.cluster import AffinityPropagation, KMeans, Birch

randomf = Birch()

randomf_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', randomf)
])



Choix modèle: https://scikit-learn.org/stable/modules/clustering.html  
Gridsearch avec unsupervised: https://stackoverflow.com/questions/44636370/scikit-learn-gridsearchcv-without-cross-validation-unsupervised-learning

In [28]:
from sklearn import metrics
def cv_silhouette_scorer(estimator, X):
    estimator.fit(X)
    cluster_labels = estimator['model'].labels_
    num_labels = len(set(cluster_labels))
    num_samples = len(X.index)
    if num_labels == 1 or num_labels == num_samples:
        return -1
    else:
        return metrics.silhouette_score(X, cluster_labels)

cv = [(slice(None), slice(None))]
# gs = GridSearchCV(estimator=sklearn.cluster.MeanShift(), param_grid=param_dict, 
#                   scoring=cv_silhouette_scorer, cv=cv, n_jobs=-1)
# gs.fit(df[cols_of_interest])

In [34]:

from sklearn.model_selection import GridSearchCV

param_dict_affinity = {'model__damping': [0.5], 'model__max_iter': [200]}
param_dict_kmeans = {'model__n_clusters': [50, 15, 20, 1], 'model__max_iter': [300]}
param_dict_birch = {'model__n_clusters': [None], 'model__branching_factor': [25, 50, 75, 100]}

search = GridSearchCV(randomf_pipeline, param_dict_birch, 
                      cv=cv, 
                      scoring=cv_silhouette_scorer)

_ = search.fit(df)

print('Best score:', abs(search.best_score_))

print('Best params:', search.best_params_)



Best score: 0.5705312116938209
Best params: {'model__branching_factor': 50, 'model__n_clusters': None}


In [45]:
search.__dict__

{'scoring': <function __main__.cv_silhouette_scorer(estimator, X)>,
 'estimator': Pipeline(steps=[('preprocess',
                  ColumnTransformer(transformers=[('number',
                                                   Pipeline(steps=[('scale',
                                                                    MinMaxScaler())]),
                                                   ['key', 'mode', 'popularity',
                                                    'duration_ms',
                                                    'danceability',
                                                    'time_signature', 'loudness',
                                                    'speechiness',
                                                    'instrumentalness',
                                                    'liveness', 'valence',
                                                    'tempo',
                                                    'acousticness*energy'])])),
         

In [48]:
import numpy as np

Nombre de clusters : 

In [50]:
len(np.unique(search.predict(df)))

74

Enregistrer les résultats du modele sur mysql pour faire des suggestion à l'utilisateur ensuite ?   
Lire l'extrait sur l'app de la chanson comparée, et éventuellement des suggestions  
Collecter info sur app : satisfait ou non de la conclusion du bot pour éventuellement faire du ml pour la partie 2 plutôt que des stats 

In [53]:

import pickle
filename = 'birch5705+74.sav'
pickle.dump(search, open(filename, 'wb'))



In [43]:
dir(search)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'feature_names_in_',
 'fit',
 'get_params',
 'inverse_transform',
 'multim

In [51]:
search.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=[(slice(None, None, None), slice(None, None, None))],
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('number',
                                                                         Pipeline(steps=[('scale',
                                                                                          MinMaxScaler())]),
                                                                         ['key',
                                                                          'mode',
                                                                          'popularity',
                                                                          'duration_ms',
                                                                          'danceability',
                                                                          'time_signature',
                     

In [42]:
for meth in dir(search):
    print(meth)
    print('----------')

__abstractmethods__
----------
__class__
----------
__delattr__
----------
__dict__
----------
__dir__
----------
__doc__
----------
__eq__
----------
__format__
----------
__ge__
----------
__getattribute__
----------
__getstate__
----------
__gt__
----------
__hash__
----------
__init__
----------
__init_subclass__
----------
__le__
----------
__lt__
----------
__module__
----------
__ne__
----------
__new__
----------
__reduce__
----------
__reduce_ex__
----------
__repr__
----------
__setattr__
----------
__setstate__
----------
__sizeof__
----------
__str__
----------
__subclasshook__
----------
__weakref__
----------
_abc_impl
----------
_check_feature_names
----------
_check_n_features
----------
_check_refit_for_multimetric
----------
_estimator_type
----------
_format_results
----------
_get_param_names
----------
_get_tags
----------
_more_tags
----------
_pairwise
----------
_repr_html_
----------
_repr_html_inner
----------
_repr_mimebundle_
----------
_required_parameters


https://www.kaggle.com/fk0728/feature-engineering-with-sklearn-pipelines

In [None]:
>>> import numpy as np
>>> from sklearn.cluster import KMeans
>>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
>>> labels = kmeans_model.labels_
>>> metrics.silhouette_score(X, labels, metric='euclidean')

In [None]:
model_pipeline = Pipeline(steps=[
  ("features", FeatureUnion([
    (
      "numerical_features",
      ColumnTransformer([
        (
          "numerical",
          Pipeline(steps=[(
            "impute_stage",
            SimpleImputer(missing_values=np.nan, strategy="median",)
          )]),
          ["feature_1"]
        )
      ])
    ), (
      "categorical_features",
      ColumnTransformer([
        (
          "country_encoding",
          Pipeline(steps=[
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
            ("reduction", NMF(n_components=8)),
          ]),
          ["country"],
        ),
      ])
    ), (
      "text_features",
      ColumnTransformer([
        (
          "title_vec",
          Pipeline(steps=[
            ("tfidf", TfidfVectorizer()),
            ("reduction", NMF(n_components=50)),
          ]),
          "title"
        )
      ])
    )
  ])),
  ("classifiers", RandomForestClassifier())
])

model_pipeline.fit(train_data, train_labels.values)
predictions = model_pipeline.predict(predict_data)


In [None]:

def trans_func(input_series):
    return output_series

from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(trans_func)

sk_pipe = Pipeline([("trans", transformer), ("vect", tf_1k), ("clf", clf_1k)])
sk_pipe.fit(train.desc, train.tag)

# where vect is a tf_idf transformer, 
# clf is a classifier and train is the training dataset. 
# "train.desc" is the series text input to the pipeline.

In [None]:
def trans_func(input_series):
    return output_series

from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(trans_func)

In [None]:
    
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

