# Openclassrooms PJ4 : transats dataset : modelisation notebook

In [1]:
%matplotlib inline

import os
import zipfile
import urllib

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import qgrid

import glob

from pandas.plotting import scatter_matrix

SAMPLED_DATA = False  # If True : data is sampled (1000 instances only) for faster testing purposes

DATA_PATH = os.path.join("datasets", "transats")
DATA_PATH = os.path.join(DATA_PATH, "out")

DATA_PATH_FILE_INPUT = os.path.join(DATA_PATH, "transats_metadata_transformed.csv")

plt.rcParams["figure.figsize"] = [16,9] # Taille par défaut des figures de matplotlib

import seaborn as sns
sns.set()

#import common_functions

In [2]:
def qgrid_show(df):
    display(qgrid.show_grid(df, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 170}))

In [3]:
def print_column_information(df, column_name):
    column_type = df.dtypes[column_name]
    print(f'Column {column_name}, type {column_type}\n')
    print('--------------------------')

    print(df[[column_name]].groupby(column_name).size().sort_values(ascending=False))
    print(df[column_name].unique())    
    print('\n')


In [4]:
def display_percent_complete(df):
    not_na = 100 - (df.isnull().sum() * 100 / len(df))
    not_na_df = pd.DataFrame({'column_name': df.columns,
                                     'percent_complete': not_na}).sort_values(by='percent_complete', ascending=False)
    display(not_na_df)

In [5]:
def identify_features(df, all_features):
    quantitative_features = []
    qualitative_features = []
    features_todrop = []

    for feature_name in all_features:
        if (df[feature_name].dtype == 'object'):
            qualitative_features.append(feature_name)

        else:
            quantitative_features.append(feature_name)

    print(f'Quantitative features : {quantitative_features} \n')
    print(f'Qualitative features : {qualitative_features} \n')  
    
    return quantitative_features, qualitative_features

# Data load

In [6]:
# hhmm timed features formatted
feats_hhmm = ['CRS_DEP_TIME',  'CRS_ARR_TIME']

df = pd.read_csv(DATA_PATH_FILE_INPUT, sep=',', header=0, encoding='utf-8', low_memory=False, parse_dates=feats_hhmm)

In [7]:
df.shape

(5547828, 14)

In [8]:
display_percent_complete(df)

Unnamed: 0,column_name,percent_complete
ORIGIN,ORIGIN,100.0
CRS_DEP_TIME,CRS_DEP_TIME,100.0
MONTH,MONTH,100.0
DAY_OF_MONTH,DAY_OF_MONTH,100.0
DAY_OF_WEEK,DAY_OF_WEEK,100.0
UNIQUE_CARRIER,UNIQUE_CARRIER,100.0
DEST,DEST,100.0
CRS_ARR_TIME,CRS_ARR_TIME,100.0
DISTANCE,DISTANCE,100.0
CRS_ELAPSED_TIME,CRS_ELAPSED_TIME,100.0


In [9]:
for column_name in df.columns:
    print_column_information(df, column_name)

Column ORIGIN, type object

--------------------------
ORIGIN
ATL    392037
ORD    240967
DEN    222012
LAX    210219
DFW    194370
        ...  
MMH        91
GST        83
DLG        83
PGD        78
AKN        63
Length: 308, dtype: int64
['BOS' 'JFK' 'LAX' 'DFW' 'OKC' 'OGG' 'HNL' 'SFO' 'ORD' 'MIA' 'IAH' 'DTW'
 'SEA' 'MSP' 'LGA' 'ATL' 'LAS' 'CLT' 'DCA' 'SAN' 'COS' 'PDX' 'TUS' 'SJC'
 'DEN' 'PHX' 'SNA' 'MCO' 'AUS' 'STL' 'KOA' 'MEM' 'SLC' 'PHL' 'LIH' 'MCI'
 'JAX' 'MSY' 'IAD' 'SJU' 'ORF' 'ABQ' 'FLL' 'IND' 'SAT' 'EWR' 'BWI' 'RDU'
 'TPA' 'ONT' 'TUL' 'BNA' 'SMF' 'DSM' 'RNO' 'DAY' 'BDL' 'FAT' 'OMA' 'MKE'
 'SDF' 'PIT' 'RSW' 'CMH' 'STT' 'STX' 'PBI' 'ELP' 'PSP' 'ICT' 'AMA' 'PNS'
 'CLE' 'XNA' 'MFE' 'RIC' 'HOU' 'OAK' 'JAC' 'EGE' 'PVD' 'BUF' 'ILM' 'SYR'
 'MDT' 'CHS' 'ALB' 'PWM' 'GSO' 'ROC' 'BOI' 'GEG' 'LBB' 'ANC' 'ADQ' 'BET'
 'BRW' 'SCC' 'FAI' 'SIT' 'JNU' 'KTN' 'CDV' 'YAK' 'WRG' 'PSG' 'OME' 'OTZ'
 'ADK' 'BUR' 'LGB' 'BTV' 'HPN' 'SRQ' 'SWF' 'DAB' 'SAV' 'ORH' 'ACK' 'MVY'
 'BQN' 'PSE' 'HYA' 'TLH' 'BH

# Identification of features

In [10]:
# Below are feature from dataset that we decided to keep: 
all_features = ['ORIGIN','CRS_DEP_TIME','MONTH','DAY_OF_MONTH','DAY_OF_WEEK','UNIQUE_CARRIER','DEST','CRS_ARR_TIME','DISTANCE','CRS_ELAPSED_TIME','ARR_DELAY','DEP_DELAY', 'TAXI_OUT', 'TAIL_NUM']

model1_features = ['ORIGIN','CRS_DEP_TIME','MONTH','DAY_OF_MONTH','DAY_OF_WEEK','UNIQUE_CARRIER','DEST','CRS_ARR_TIME','DISTANCE','CRS_ELAPSED_TIME']
model1_label = 'ARR_DELAY'

quantitative_features = []
qualitative_features = []
features_todrop = []

for feature_name in all_features:
    if (df[feature_name].dtype == 'object'):
        qualitative_features.append(feature_name)
        
    else:
        quantitative_features.append(feature_name)

print(f'Quantitative features : {quantitative_features} \n')
print(f'Qualitative features : {qualitative_features} \n')        
        

#Commented out : no drop of features
#for df_column in df.columns:
#    if df_column not in all_features:
#        features_todrop.append(df_column)
#        
#print(f'Features to drop : {features_todrop} \n')


Quantitative features : ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'CRS_ELAPSED_TIME', 'ARR_DELAY', 'DEP_DELAY', 'TAXI_OUT'] 

Qualitative features : ['ORIGIN', 'CRS_DEP_TIME', 'UNIQUE_CARRIER', 'DEST', 'CRS_ARR_TIME', 'TAIL_NUM'] 



# Split train set, test set

In [11]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
if (SAMPLED_DATA == True):
    df_train = df_train.sample(1000).copy(deep=True)
    df = df.loc[df_train.index]

In [13]:
df_train

Unnamed: 0,ORIGIN,CRS_DEP_TIME,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,DEST,CRS_ARR_TIME,DISTANCE,CRS_ELAPSED_TIME,ARR_DELAY,DEP_DELAY,TAXI_OUT,TAIL_NUM
36771,LAX,2230,9,13,2,AA,IAD,0626,2288.0,296.0,-16.0,-6.0,15.0,N3HWAA
1070595,PHX,0945,6,1,3,WN,BUR,1110,369.0,85.0,-9.0,-3.0,11.0,N216WR
4669404,MKE,1220,11,28,1,DL,DTW,1432,237.0,72.0,-8.0,-3.0,12.0,N985DL
5266394,ECP,0605,8,10,3,EV,IAH,0757,572.0,112.0,-9.0,-5.0,18.0,N15986
2209289,ORD,1550,12,14,3,UA,MSP,1730,334.0,100.0,4.0,16.0,17.0,N39423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1570006,DEN,1355,5,7,6,WN,LAX,1520,862.0,145.0,13.0,5.0,16.0,N455WN
2234489,SFO,1725,12,13,2,VX,DAL,2250,1476.0,205.0,-12.0,-5.0,11.0,N625VA
4926484,IAH,1015,11,11,5,UA,SJU,1630,2007.0,255.0,-5.0,1.0,20.0,N67812
4304572,IAH,1950,10,4,2,UA,DFW,2115,224.0,85.0,-15.0,-2.0,18.0,N834UA


In [14]:
#df = df.loc[df_train.index]

# Features encoding

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn import decomposition
from sklearn import preprocessing

from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer

import statistics


'''
Cette fonction fait un 1 hot encoding des features qui sont des catégories
Elle fonctionne pour les 2 cas de figure suivant :
- Les valeurs possibles de la colonne sont une chaîne de caractère (ex : cat1)
- Les valeurs possibles de la colonne sont des chaînes de caractère avec des séparateurs (ex:  cat1|cat2|cat3)
'''
    
def add_categorical_features_1hot(df, categorical_features_totransform):
    #df.drop(labels=categorical_features_totransform, axis=1, inplace=True)
    
    for feature_totransform in categorical_features_totransform:
        print(f'Adding 1hot Feature : {feature_totransform}')
        
        df_transformed = df[feature_totransform].str.get_dummies().add_prefix(feature_totransform +'_')   
        #df.drop(labels=feature_totransform, axis=1, inplace=True)
        del df[feature_totransform]
        
        df = pd.concat([df, df_transformed], axis=1)
        
    return(df)


class HHMM_to_Minutes(BaseEstimator, TransformerMixin):
    def __init__(self, features_toconvert = ['CRS_DEP_TIME', 'CRS_ARR_TIME']):
        self.features_toconvert = features_toconvert
        return None
    
    def fit(self, df):      
        return self
    
    def transform(self, df):       
        for feature_toconvert in self.features_toconvert:
            df_concat = pd.concat([df[feature_toconvert].str.slice(start=0,stop=2, step=1),df[feature_toconvert].str.slice(start=2,stop=4, step=1)], axis=1).astype(int)
            df[feature_toconvert] = (df_concat.iloc[:, [0]] * 60 + df_concat.iloc[:, [1]])[feature_toconvert]
        
        return(df)

        
class CategoricalFeatures1HotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_features_totransform=['ORIGIN', 'UNIQUE_CARRIER', 'DEST']):
        self.categorical_features_totransform = categorical_features_totransform
    
    def fit(self, df, labels=None):      
        return self
    
    def transform(self, df):       
        return(add_categorical_features_1hot(df, self.categorical_features_totransform))

class FeaturesSelector(BaseEstimator, TransformerMixin):
    def __init__(self, features_toselect = None):  # If None : every column is kept, nothing is done
        self.features_toselect = features_toselect
    
    def fit(self, df, labels=None):      
        return self
    
    def transform(self, df):       
        if (self.features_toselect != None):
            filter_cols = [col for col in df if (col.startswith(tuple(self.features_toselect)))]
            return(df[filter_cols])    

        else:
            return(df)
    
preparation_pipeline = Pipeline([
    ('data_converter', HHMM_to_Minutes()),
    ('categoricalfeatures_1hotencoder', CategoricalFeatures1HotEncoder()),
    #('standardscaler', preprocessing.StandardScaler()),
])


prediction_pipeline = Pipeline([
    ('features_selector', FeaturesSelector(features_toselect=['ORIGIN','CRS_DEP_TIME','MONTH','DAY_OF_MONTH','DAY_OF_WEEK','UNIQUE_CARRIER','DEST','CRS_ARR_TIME','DISTANCE','CRS_ELAPSED_TIME'])),
    ('standardscaler', ColumnTransformer([
        ('standardscaler_specific', StandardScaler(), ['CRS_DEP_TIME','MONTH','DAY_OF_MONTH', 'DAY_OF_WEEK', 'CRS_ARR_TIME', 'DISTANCE', 'CRS_ELAPSED_TIME'])
    ], remainder='passthrough')),
    #('predictor', To_Complete(predictor_params =  {'n_neighbors':6, 'algorithm':'ball_tree', 'metric':'minkowski'})),
])


'''
ColumnTransformer([
        ('standardscaler_specific', StandardScaler(), ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'CRS_ELAPSED_TIME', 'ARR_DELAY', 'DEP_DELAY', 'TAXI_OUT'])
    ], remainder='passthrough')
'''

"\nColumnTransformer([\n        ('standardscaler_specific', StandardScaler(), ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'CRS_ELAPSED_TIME', 'ARR_DELAY', 'DEP_DELAY', 'TAXI_OUT'])\n    ], remainder='passthrough')\n"

In [16]:
df_transformed = preparation_pipeline.fit_transform(df_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Adding 1hot Feature : ORIGIN


MemoryError: Unable to allocate array with shape (4438262, 308) and data type int64

In [None]:
df_transformed.shape

In [None]:
type(df_transformed)

In [None]:
df_transformed = prediction_pipeline.fit_transform(df_transformed)

In [None]:
df_transformed.shape

In [None]:
df_transformed

In [None]:
pd.set_option('display.max_columns', 400)

In [None]:
quantitative_features, qualitative_features = identify_features(df, all_features)

# Basic linear regression

In [None]:
df_transformed.shape

In [None]:
df.shape

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(df_transformed, df[model1_label])

In [None]:
from sklearn.metrics import mean_squared_error

df_predictions = lin_reg.predict(df_transformed)
lin_mse = mean_squared_error(df[model1_label], df_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

