In [1]:
#Install library
!pip install statsmodels



In [135]:
#Import libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [95]:
#We load the excel file
dataset = pd.read_excel('/content/sample_data/dataset_ms_ls.xlsx')

In [96]:
#We show the first 5 rows of the dataset
dataset.head()

Unnamed: 0,Artistes,songs,url,id,image_path,Streamed/Non-Streamed,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,Natalie Imbruglia,On My Way,https://www.youtube.com/watch?v=hhnBDkgLJPg,0DEJ9GEuWuGFMR1GhPORnY,On My Way.png,0,0.633,0.69,7,-4.397,1,0.0298,0.0246,5e-06,0.131,0.631,103.003,4
1,OFFICIAL HIGE DANDISM,Pretender,https://www.youtube.com/watch?v=MbhyzQiRFXY,15HNdxGKNCIO9pgaY4n7FU,Pretender (歌詞) official髭男dism.png,1,0.538,0.869,8,-3.464,1,0.0275,0.047,0.0,0.14,0.369,91.972,4
2,Bryan Adams,Summer Of '69,https://www.youtube.com/watch?v=eFjjO_lhf9c,0GONea6G2XdnHWjNZd6zt3,Summer Of 69.png,1,0.509,0.834,2,-6.205,1,0.0386,0.0183,0.0,0.0732,0.774,139.13,4
3,Fuse ODG,What it do,https://www.youtube.com/watch?v=gTyLD5gHAbg,2V3mQTPHYkxbkYxXMjnKFA,What It Do.png,0,0.834,0.878,7,-3.369,1,0.12,0.00313,0.149,0.246,0.315,102.043,4
4,Édith Piaf,La vie en rose,https://www.youtube.com/watch?v=rzeLynj1GYM,4FmiciU3ZmfgABlbCSXcWw,La Vie en rose.png,1,0.39,0.302,8,-8.687,1,0.0321,0.984,0.0,0.0871,0.422,82.52,4


In [97]:
#We remove the irrelevant columns
dataset = dataset.drop(['Artistes','songs','url','id','image_path'], axis=1)
dataset.head()

Unnamed: 0,Streamed/Non-Streamed,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0,0.633,0.69,7,-4.397,1,0.0298,0.0246,5e-06,0.131,0.631,103.003,4
1,1,0.538,0.869,8,-3.464,1,0.0275,0.047,0.0,0.14,0.369,91.972,4
2,1,0.509,0.834,2,-6.205,1,0.0386,0.0183,0.0,0.0732,0.774,139.13,4
3,0,0.834,0.878,7,-3.369,1,0.12,0.00313,0.149,0.246,0.315,102.043,4
4,1,0.39,0.302,8,-8.687,1,0.0321,0.984,0.0,0.0871,0.422,82.52,4


**REGRESSION LOGISTIQUE / LOGISTIC REGRESSION**

In [98]:
#We specify all the continuous vairables
X_continuous = ['danceability','energy','loudness','speechiness','acousticness','instrumentalness','liveness','valence','tempo']

In [99]:
#We specify all the categorical variables
X_categorical = ['key','mode','time_signature']

In [100]:
#We do a one hot encoding for all the categorical variables
df_one_hot = pd.get_dummies(dataset[X_categorical], drop_first=True)

In [101]:
#Target
y = dataset['Streamed/Non-Streamed']

In [102]:
#We combine both continuous variables and encoded variables
X = pd.concat([dataset[X_continuous], df_one_hot], axis=1)

In [103]:
#We divide the data to have a train data and a validation data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [104]:
#Logistic Regression Model
X_train = sm.add_constant(X_train)
model = sm.Logit(y_train, X_train)
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.654239
         Iterations 5


In [105]:
X_val = sm.add_constant(X_val)
y_pred_proba = result.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int)

In [134]:
print(result.summary())

                             Logit Regression Results                            
Dep. Variable:     Streamed/Non-Streamed   No. Observations:                  240
Model:                             Logit   Df Residuals:                      227
Method:                              MLE   Df Model:                           12
Date:                   Wed, 24 Jul 2024   Pseudo R-squ.:                 0.05608
Time:                           15:20:07   Log-Likelihood:                -157.02
converged:                          True   LL-Null:                       -166.35
Covariance Type:               nonrobust   LLR p-value:                   0.09710
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -1.8556      1.931     -0.961      0.337      -5.641       1.930
danceability        -0.1443      1.130     -0.128      0.898      -2.358       2.070
ener

In [106]:
# model evaluation
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

Accuracy: 0.5666666666666667
Confusion Matrix:
 [[18 13]
 [13 16]]
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.58      0.58        31
           1       0.55      0.55      0.55        29

    accuracy                           0.57        60
   macro avg       0.57      0.57      0.57        60
weighted avg       0.57      0.57      0.57        60



**RANDOM FOREST**

In [123]:
#We divide the data to have a train data and a validation data
X = dataset.drop(['Streamed/Non-Streamed'], axis=1)
y = dataset['Streamed/Non-Streamed']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [124]:
#We have to normalize the continuous variables, but since most of them are already normalized we wil normalize tempo
features_to_skip = ['danceability','speechiness','acousticness','instrumentalness','liveness','valence']
numerical_features_to_normalize  = ['energy', 'loudness', 'tempo']

In [125]:
# For normalization of numerical data
numerical_transformer = StandardScaler()

#One hot encoding for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine both num and cat transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features_to_normalize),
        ('num_to_skip', 'passthrough', features_to_skip),
        ('cat', categorical_transformer, X_categorical)
    ])

In [126]:
# Define the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Create a pipeline that first preprocesses the data and then fits the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

#We Fit the model
pipeline.fit(X_train, y_train)

#predictions
y_pred = pipeline.predict(X_val)

# Evaluate the model
report = classification_report(y_val, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.55      0.52      0.53        31
           1       0.52      0.55      0.53        29

    accuracy                           0.53        60
   macro avg       0.53      0.53      0.53        60
weighted avg       0.53      0.53      0.53        60



In [127]:
# Define parameter grid for Random Forest
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_val)
best_report = classification_report(y_val, y_pred_best)

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)
print("Classification Report:\n", best_report)

Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best Cross-Validation Score: 0.5666666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.52      0.53        31
           1       0.52      0.55      0.53        29

    accuracy                           0.53        60
   macro avg       0.53      0.53      0.53        60
weighted avg       0.53      0.53      0.53        60



# **INTERPRETATION**

On peut clairement dire que les deux modèles ont une précision très faible.
La précision étant inférieur à 60%, on peut dire, un modèle avec des prédictions aléatoires auraient les mêmes résultats (après tout, les données sont reparties également, 50% pour les streamés, et 50% pour les non-streamés) / Clearly, both models have very low accuracy.
As the accuracy is less than 60%, we can say that a model with random predictions would have the same results (after all, the data are equally divided, 50% for streamed, and 50% for non-streamed).