In [117]:
#///////////////////////
# Linear regression //////////////////////
#///////////////////////////////////////////
import warnings
warnings.filterwarnings("ignore") # on empeche les warnings d'apparaitre
# 'chargement des librairies et pipelines'
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
# ------------------------------------------------
# Load the dataset (rows indexed by the "ID" column) and cast the listed
# columns to their intended dtypes.
df = pd.read_csv('Data/allocineV9-SB.csv', sep=",", index_col="ID")

# Columns cast to floating point.
float_cols = ["actors_score", "directors_score", "scenaristes_score", "distrib_score"]
# Columns cast to pandas' nullable integer dtype ("Int64").
int_cols = [
    "Action", "Animation", "Aventure", "Biopic", "Comedie", "Comedie dramatique",
    "Comedie musicale", "Drame", "Epouvante-horreur", "Famille", "Fantastique",
    "Guerre", "Historique", "Musical", "Policier", "Romance", "Science Fiction",
    "Thriller", "Western", "jour_sortie", "mois_sortie", "duree",
]
dtype_by_col = {name: "float" for name in float_cols}
dtype_by_col.update({name: "Int64" for name in int_cols})
df = df.astype(dtype_by_col)
# ---------------------------------------------------------------
# Split the data into the explanatory variables (X) and the target variable (y).
target_col = 'premiere_semaine_france'
y = df[target_col]
X = df.drop(columns=[target_col])
# ---------------------------------------------------------------
# Hold out 25% of the rows as the test set; the remaining 75% form the
# training set. The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)
# Split the features into three groups: plain numeric columns, the budget
# column (kept apart from the others), and the cyclic calendar columns.
num_cols = [
    'actors_score', 'directors_score', 'scenaristes_score', 'distrib_score',
    'Action', 'Animation', 'Aventure', 'Biopic', 'Comedie',
    'Comedie dramatique', 'Comedie musicale', 'Drame', 'Epouvante-horreur',
    'Famille', 'Fantastique', 'Guerre', 'Historique', 'Musical', 'Policier',
    'Romance', 'Science Fiction', 'Thriller', 'Western',
    'France', 'USA', 'Angleterre', 'Autres',
    'note_presse', 'duree',
]
rob_cols = ['budget_dollars']
circular_cols = ['jour_sortie', 'mois_sortie', 'day_of_week_sortie']

# Take the same column subsets from the training and the test features.
num_train, num_test = X_train[num_cols], X_test[num_cols]
rob_train, rob_test = X_train[rob_cols], X_test[rob_cols]
circular_train, circular_test = X_train[circular_cols], X_test[circular_cols]
# ********* Encoding of the cyclic variables
# A value v of a column with period p is projected onto the unit circle as
# sin(2*pi*v/p) and cos(2*pi*v/p).
# Periods used: day of month / 31, month number / 12, day of week / 7.
CYCLIC_PERIODS = {'jour_sortie': 31, 'mois_sortie': 12, 'day_of_week_sortie': 7}


def _encode_cyclic(frame, periods):
    """Replace each cyclic column by its sine/cosine projection.

    Args:
        frame: DataFrame containing every column named in ``periods``.
        periods: Mapping of column name to the period ``p`` used in
            ``2 * pi * value / p``.

    Returns:
        A new DataFrame with the same index as ``frame`` in which every
        column named in ``periods`` has been dropped and the columns
        ``sin_<col>`` / ``cos_<col>`` appended, in the order of ``periods``.
        ``frame`` itself is left unmodified.
    """
    encoded = {}
    for col, period in periods.items():
        # Computed on the whole column at once; the float64 cast turns
        # nullable "Int64" columns into plain float arrays.
        angle = 2 * np.pi * frame[col].astype("float64").to_numpy() / period
        encoded[f'sin_{col}'] = np.sin(angle)
        encoded[f'cos_{col}'] = np.cos(angle)
    # drop() and assign() both return new frames, so nothing is written into
    # a frame that was sliced out of X_train / X_test.
    return frame.drop(columns=list(periods)).assign(**encoded)


circular_train = _encode_cyclic(circular_train, CYCLIC_PERIODS)
circular_test = _encode_cyclic(circular_test, CYCLIC_PERIODS)

# Min-max normalisation of the numeric variables, except the budget variable
# (which contains outliers) and is scaled separately below.
# Each scaler is fitted on the training rows only and then applied to the
# test rows.
minmax_scaler = MinMaxScaler()
X_train_norm = pd.DataFrame(
    minmax_scaler.fit_transform(num_train),
    index=num_train.index,  # keep the row labels so the frame stays aligned with y_train
    columns=num_cols,
)
X_test_norm = pd.DataFrame(
    minmax_scaler.transform(num_test),
    index=num_test.index,
    columns=num_cols,
)

# Robust scaling of the budget variable.
robust_scaler = RobustScaler()
X_train_rob = pd.DataFrame(
    robust_scaler.fit_transform(rob_train),
    index=rob_train.index,
    columns=rob_cols,
)
X_test_rob = pd.DataFrame(
    robust_scaler.transform(rob_test),
    index=rob_test.index,
    columns=rob_cols,
)

# `scaler` stays available as an alias of the fitted RobustScaler for any
# code that refers to it by that name.
scaler = robust_scaler
# Concatenate the transformed feature groups column-wise.

# Print the shape of each group first (the cyclic frame is printed twice).
for part in (circular_train, X_train_rob, X_train_norm, circular_train):
    print(part.shape)

# The cyclic frames take on the index of the scaled frames so that the three
# groups line up row by row in the concatenation.
train_parts = [X_train_norm, X_train_rob, circular_train.set_index(X_train_rob.index)]
X_train_encoded = pd.concat(train_parts, axis=1)
test_parts = [X_test_norm, X_test_rob, circular_test.set_index(X_test_rob.index)]
X_test_encoded = pd.concat(test_parts, axis=1)

# Fit a linear regression model on the encoded training features.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train_encoded, y_train)
# Intercept first, followed by the fitted coefficients.
coeffs = [regressor.intercept_, *regressor.coef_]

# Coefficient of determination (R^2) on the training set, then on the test set.
r2_train = regressor.score(X_train_encoded, y_train)
print('Coefficient de détermination du modèle sur train:', r2_train)
r2_test = regressor.score(X_test_encoded, y_test)
print('Coefficient de détermination du modèle sur test:', r2_test)


(5163, 6)
(5163, 1)
(5163, 29)
(5163, 6)
Coefficient de détermination du modèle sur train: 0.42889439682871233
Coefficient de détermination du modèle sur test: 0.4318795343332982


In [None]:
import warnings
warnings.filterwarnings("ignore") # on empeche les warnings d'apparaitre
# 'chargement des librairies et pipelines'
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
# ------------------------------------------------
# Load the dataset, indexed by its "ID" column, and preview the first rows.
csv_path = 'Data/allocineV9-SB.csv'
df = pd.read_csv(csv_path, index_col="ID", sep=",")
df.head()

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
</body>
</html>

