In [3]:
import numpy as np # linear algebra
import pandas as ps # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime,timedelta
from sklearn.metrics import mean_squared_error
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("./data"))

# Any results you write to the current directory are saved as output.

['conso_train.csv', 'meteo_train.csv', 'meteo_prev.csv', 'sample_solution.csv']


In [4]:
def parse_model(X,targetColumnsName, useColumns):
    if targetColumnsName not in X.columns:
        raise ValueError("target column survived should belong to df")
    target = X[targetColumnsName]
    X = X[useColumns]
    return X, target

In [5]:
# Colonne/features à utiliser
modelCols_0 =[]

In [6]:
# calcul du RMSE
def RMSE(y_test,y_prediction):
        return np.sqrt(((y_prediction - y_test) ** 2).mean())

In [7]:
# Methode de modelisation 
# modelAlgo => l'algo (ex : randomForest, regressionLogstic)
# dfDate : le dataframe concerné par la modelisation
# features : un array contenant les colonnes/features à utiliser pendant la modelisation
# targerName : le nom de la target

## return => le score|RMSE
def modelisationProcessing(modelAlgo,dfData,features,targetName):
    X, y = parse_model(dfData.copy(),targetName, features)
    list_test_size = [a/20.0 for a in list(range(0,20,1))][1:]
    scores = []
    for ts in list_test_size:
        X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
        modelAlgoFitted = modelAlgo.fit(X_train,y_train)
        scores.append(modelAlgoFitted.score(X_test,y_test))
    # calcul du RMSE
    y_pred = modelAlgoFitted.predict(X_test)
    rmse = RMSE(y_test,y_pred)
    return np.array(scores).mean(),rmse

In [10]:
dfMeteoTrain = ps.read_csv('./data/meteo_train.csv',sep=';')
dfMeteoTrain.head(5)

Unnamed: 0,Date UTC,T¬∞ (C),P (hPa),HR (%),P.ros√©e (¬∞C),Visi (km),Vt. moy. (km/h),Vt. raf. (km/h),Vt. dir (¬∞),RR 3h (mm),Neige (cm),Nebul. (octats)
0,13/09/15 00h00,12.5,1008.7,81.0,9.3,40.0,9.26,18.52,140.0,0.0,,8.0
1,13/09/15 03h00,12.3,1006.4,83.0,9.5,40.0,11.112,16.668,120.0,0.0,,8.0
2,13/09/15 06h00,12.3,1004.7,82.0,9.3,40.0,14.816,22.224,130.0,,,7.0
3,13/09/15 09h00,14.2,1002.9,80.0,10.8,40.0,18.52,31.484,140.0,,,7.0
4,13/09/15 12h00,13.3,1000.8,93.0,12.2,4.0,18.52,38.892,140.0,4.0,,7.0


In [11]:
# renomage des colonnes
dfMeteoTrain.rename(columns={'T¬∞ (C)':'Temp_C','P (hPa)':'Pression_hPa','HR (%)':'HR_%','P.ros√©e (¬∞C)':'P_ros','Visi (km)':'Visi_km','Vt. moy. (km/h)':'Vt_moy_kmh','Vt. raf. (km/h)':'Vt_raf_kmh','Vt. dir (¬∞)':'Vt_dir','RR 3h (mm)':'RR_3h','Neige (cm)':'Neige_cm','Nebul. (octats)':'Nebul_octats'}, inplace=True)
dfMeteoTrain.columns

Index(['Date UTC', 'Temp_C', 'Pression_hPa', 'HR_%', 'P_ros', 'Visi_km',
       'Vt_moy_kmh', 'Vt_raf_kmh', 'Vt_dir', 'RR_3h', 'Neige_cm',
       'Nebul_octats'],
      dtype='object')

## traitement des valeurs null

In [12]:
# traitement des valeurs null
dfValManquante = ps.DataFrame(dfMeteoTrain.isna().sum())
dfValManquante = dfValManquante.T
dfValManquante

Unnamed: 0,Date UTC,Temp_C,Pression_hPa,HR_%,P_ros,Visi_km,Vt_moy_kmh,Vt_raf_kmh,Vt_dir,RR_3h,Neige_cm,Nebul_octats
0,0,64,64,64,64,64,64,67,64,336,1957,409


In [15]:
# replacement des valeur null => imputation
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
dfMeteoTrain[
    ['Temp_C','Pression_hPa','HR_%','P_ros','Visi_km','Vt_moy_kmh','Vt_raf_kmh','Vt_dir','RR_3h','Neige_cm','Nebul_octats']
] = imputer.fit_transform(dfMeteoTrain[['Temp_C','Pression_hPa','HR_%','P_ros','Visi_km','Vt_moy_kmh','Vt_raf_kmh','Vt_dir','RR_3h','Neige_cm','Nebul_octats']])
dfMeteoTrain

ModuleNotFoundError: No module named 'sklearn.impute'

In [None]:
dfValManquante = ps.DataFrame(dfMeteoTrain.isna().sum())
dfValManquante = dfValManquante.T
dfValManquante

In [None]:
# conversion de date UTC en date time
dfMeteoTrain['Date UTC'] = dfMeteoTrain['Date UTC'].apply(lambda x: datetime.strptime(x,'%d/%m/%y %Hh%M'))
dfMeteoTrain['Date UTC'] = dfMeteoTrain['Date UTC'].apply(lambda x: x.strftime('%d-%m-%y %H:%M'))
dfMeteoTrain['Date UTC'] = dfMeteoTrain['Date UTC'].apply(lambda x: datetime.strptime(x,'%d-%m-%y %H:%M'))
dfMeteoTrain.head(5)

## Dum du numéro de jour :
- 0 => dimanche
- 1 => lundi 
etc ..

In [None]:
dfDumDateDay = ps.DataFrame(dfMeteoTrain['Date UTC'].apply(lambda x: x.weekday()))
dfDumDateDay.rename(columns={'Date UTC': 'DateDumDay'}, inplace=True)
dfMeteoTrain = dfMeteoTrain.join(dfDumDateDay)
dfMeteoTrain.head(5)

## Dum du numéro de mois :
- 1 => janvier
- 2 => fevrier
etc ..

In [None]:
dfDumDateMonth = ps.DataFrame(dfMeteoTrain['Date UTC'].apply(lambda x: x.month))
dfDumDateMonth.rename(columns={'Date UTC': 'DateDumMonth'}, inplace=True)
dfMeteoTrain = dfMeteoTrain.join(dfDumDateMonth)
dfMeteoTrain.head(5)

In [None]:
modelCols_0 =['Pression_hPa','HR_%','P_ros','Vt_moy_kmh','Vt_dir','DateDumDay','DateDumMonth','Nebul_octats']

In [None]:
# split train/test
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

## etude du fichier conso

In [None]:
dfConso = ps.read_csv('../input/conso_train.csv')
dfConso.head(10)

In [None]:
# arroundie de la date à l'heure la plus proche
def hour_rounder(t):
    return (t.replace(second=0, microsecond=0, minute=0, hour=t.hour)
               +timedelta(hours=t.minute//30))
def standardiserDate(df,col,nouvelleCol):
    #dfTmp = df[col].apply(lambda x: x.replace())
    df[nouvelleCol] = df[col].apply(lambda x: x.replace("+02:00",""))
    df[nouvelleCol] = df[nouvelleCol].apply(lambda x: x.replace("+01:00",""))
    df[nouvelleCol] = df[nouvelleCol].apply(lambda x: x.replace("T"," "))
    df[nouvelleCol] = df[nouvelleCol].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
    df[nouvelleCol] = df[nouvelleCol].apply(lambda x: hour_rounder(x))
    # mise en UTC
    df[nouvelleCol] = df[nouvelleCol].apply(lambda x: x - timedelta(hours=2))
    #df[nouvelleCol] = df[nouvelleCol].apply(lambda t: t.strftime('%d-%m-%y %H:%M'))
    #df[nouvelleCol] = df[nouvelleCol].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M'))                                            
standardiserDate(dfConso,'date','dateStd')
dfConso.head(10)

In [None]:
# renomage de la col date de dfConso
dfConso.rename(columns={'date':'dateOriginalConso'},inplace=True)
dfConso.head()

## merge des 2 df (en copie) dfMeteoTrain et dfConso sur col Date
## [Date UTC] et [dateStd]
## Etude de la consommation toute les trois heures


In [None]:
dfMeteoTrainCopy = dfMeteoTrain.copy()
dfConsoCopy = dfConso.copy()
dfMeteoTrainCopy.rename(columns={'Date UTC':"dateStd"}, inplace=True)
dfMeteoTrainCopy.head(2)

In [None]:
dfMeteoTrainCopy.rename(columns={'Date UTC':"dateStd"})
dfMerger = ps.merge(dfMeteoTrainCopy,dfConsoCopy,on='dateStd')
dfMerger.head(5)

## Regression pour predire la consommation => Random Forest

In [None]:
modelCols_conso =['Temp_C',
                  'Pression_hPa',
                  'HR_%',
                  'P_ros',
                  'Vt_moy_kmh',
                  'Vt_raf_kmh',
                  'Visi_km',
                  'RR_3h',
                  'DateDumDay',
                  'DateDumMonth',
                  'Nebul_octats',
                  'Neige_cm']

In [None]:
rf_conso = RandomForestRegressor(n_estimators=150,max_depth=10)
targetName = 'puissance'
resScore, rmse = modelisationProcessing(rf_conso,dfMerger,modelCols_conso,targetName)

print("Resultat score : {} RMSE: {}".format(resScore, rmse))

# Best score => 0,76

## Etude de la consommation avec enregistrement toutes les heures

In [None]:
# preprocessing pour recuperer les enregistrements toutes les heures
def preprocessing_hours(df,dataColName):
    dfTmp = ps.DataFrame()
    for index,l in df.iterrows():       
        dfTmp = dfTmp.append(l)
        l[dataColName] = l[dataColName] + timedelta(hours=1)
        dfTmp = dfTmp.append(l)
        l[dataColName] = l[dataColName] + timedelta(hours=1)
        dfTmp = dfTmp.append(l)
    return dfTmp

In [None]:
dfEachHour = preprocessing_hours(dfMeteoTrainCopy.copy(),'dateStd')
dfEachHour.head(5)


In [None]:
# Construction du df merger toute les heures
dfMergerEachHours = ps.merge(dfEachHour,dfConsoCopy,on='dateStd')
dfMergerEachHours.shape

In [None]:
resScoreEachHours, rmse_each_hours = modelisationProcessing(rf_conso,dfMergerEachHours,modelCols_conso,targetName)

print("Resultat score : {} RMSE: {}".format(resScoreEachHours, rmse_each_hours))

## best score => 0.83

# comparaison supplementaire
## Regression pour predire la consommation => Regression lineaire

In [None]:
from sklearn.linear_model import LinearRegression
# df avec enregistrement toutes les 3 hours
lr_conso = LinearRegression()
resScoreLr, rmse_lr = modelisationProcessing(lr_conso,dfMerger,modelCols_conso,targetName)
print("Resultat score : {} RMSE: {}".format(resScoreLr, rmse_lr))

In [None]:
# df avec enregistrement toutes les heures 
resScoreLr_each_hours, rmse_lr_each_hours = modelisationProcessing(lr_conso,dfMergerEachHours,modelCols_conso,targetName)
print("Resultat score : {} RMSE: {}".format(resScoreLr_each_hours, rmse_lr_each_hours))

## Comparaison => on cherche l'erreur la plus basse

In [None]:
# test avec gradient boost
from sklearn.ensemble import GradientBoostingRegressor

gboost = GradientBoostingRegressor()
resScoreGBoost, rmse_gboost = modelisationProcessing(gboost,dfMerger,modelCols_conso,targetName)
print("Resultat score : {} RMSE: {}".format(resScoreGBoost, rmse_gboost))