# OpenClassrooms PJ5: Online Retail dataset - modelling notebook

In [118]:
%matplotlib inline

#%load_ext autoreload  # Autoreload has a bug : when you modify function in source code and run again, python kernel hangs :(
#%autoreload 2

import datetime as dt

import sys, importlib

# Reload the project module BEFORE star-importing from it.
# importlib.reload() re-executes functions.py, but it does not rebind the names
# that an earlier `from functions import *` copied into this namespace; the
# star-import therefore has to run after the reload for edits to take effect.
import functions
importlib.reload(functions)
from functions import *

import pandas as pd

# Show up to this many rows / columns before pandas truncates a displayed frame
_DISPLAY_LIMIT = 200
pd.set_option('display.max_rows', _DISPLAY_LIMIT)
pd.set_option('display.max_columns', _DISPLAY_LIMIT)

import datetime as dt

import os
import zipfile
import urllib

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import qgrid

import glob

from pandas.plotting import scatter_matrix

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import GridSearchCV

# Directory (relative to the working directory) that holds the input file
DATA_PATH = os.path.join("datasets", "onlineretail", "out")

# CSV file loaded at the start of this notebook
DATA_PATH_FILE_INPUT = os.path.join(DATA_PATH, "OnlineRetail_transformed.csv")


ALL_FEATURES = list()

# Columns handed to the 'features_selector' step of the preparation pipeline
MODEL_FEATURES = [
    'InvoiceNo',
    'InvoiceDate',
    'CustomerID',
    'TotalPrice',
    'DescriptionNormalized',
    'InvoiceMonth',
]

plt.rcParams["figure.figsize"] = [16,9] # Default size of matplotlib figures

import seaborn as sns
# Apply seaborn's default styling to the plots drawn afterwards
sns.set()

#import common_functions

####### Settings for saving and restoring models:
import pickle
####### Settings the user may change as needed:

# NOTE(review): with SAVE_GRID_RESULTS = False a recomputed grid search is presumably
# not written to disk - confirm this combination is intended before a long run.
RECOMPUTE_GRIDSEARCH = True  # CAUTION : computation is several hours long
SAVE_GRID_RESULTS = False # If True : grid results object will be saved to pickle files that have GRIDSEARCH_FILE_PREFIX
LOAD_GRID_RESULTS = False # If True : grid results object will be loaded from pickle files that have GRIDSEARCH_FILE_PREFIX
                          # Grid search results are loaded with full samples (SAMPLED_DATA must be False)

# The triple-quoted block below is a bare string literal: its assignments are NOT
# executed. It only records an alternative flag combination (recompute and save).
'''
RECOMPUTE_GRIDSEARCH = True  # CAUTION : computation is several hours long
SAVE_GRID_RESULTS = True # If True : grid results object will be saved to pickle files that have GRIDSEARCH_FILE_PREFIX
LOAD_GRID_RESULTS = False # If True : grid results object will be loaded from pickle files that have GRIDSEARCH_FILE_PREFIX
'''
#GRIDSEARCH_CSV_FILE = 'grid_search_results.csv'

# Common file-name prefix of the grid-search pickle files mentioned above
GRIDSEARCH_FILE_PREFIX = 'grid_search_results_'

EXECUTE_INTERMEDIATE_MODELS = True # If True: every intermediate model (whose results are manually analyzed in the notebook) will be executed


# Necessary for predictors used in the notebook :
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import PolynomialFeatures

### For progress bar :
from tqdm import tqdm_notebook as tqdm

# Statsmodel : 
import statsmodels.formula.api as smf

import statsmodels.api as sm
from scipy import stats

SAVE_API_MODEL = True # If True : API model will be saved
# Pickle file name for the API model - presumably the save target when SAVE_API_MODEL is True (TODO confirm where it is used)
API_MODEL_PICKLE_FILE = 'API_model_PJ5.pickle'


# Load data

In [119]:
# Read the input CSV through load_data, which is not defined in this notebook
# (presumably provided by the star-import from functions.py - confirm there).
df = load_data(DATA_PATH_FILE_INPUT)

In [120]:
# Sanity check of the loaded frame: columns, non-null counts and dtypes.
# InvoiceDate and InvoiceMonth are reported as object dtype, i.e. not parsed as datetimes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401604 entries, 0 to 401603
Data columns (total 11 columns):
InvoiceNo                401604 non-null object
StockCode                401604 non-null object
Description              401604 non-null object
Quantity                 401604 non-null int64
InvoiceDate              401604 non-null object
UnitPrice                401604 non-null float64
CustomerID               401604 non-null object
Country                  401604 non-null object
TotalPrice               401604 non-null float64
DescriptionNormalized    401604 non-null object
InvoiceMonth             401604 non-null object
dtypes: float64(2), int64(1), object(8)
memory usage: 33.7+ MB


In [121]:
# Split the data with the project helper. 'TotalPrice' is passed as second argument
# (presumably the column the split is based on - TODO confirm in functions.py).
# Note that `df` itself is rebound to the helper's first return value.
df, df_train, df_test = custom_train_test_split_sample(df, 'TotalPrice')

In [122]:
# Renumber the rows of both splits from 0; the previous row labels are kept
# as a regular 'index' column.
df_train = df_train.reset_index()
df_test = df_test.reset_index()

In [123]:
# Check the training split after the index reset (an extra 'index' column is now present)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361443 entries, 0 to 361442
Data columns (total 12 columns):
index                    361443 non-null int64
InvoiceNo                361443 non-null object
StockCode                361443 non-null object
Description              361443 non-null object
Quantity                 361443 non-null int64
InvoiceDate              361443 non-null object
UnitPrice                361443 non-null float64
CustomerID               361443 non-null object
Country                  361443 non-null object
TotalPrice               361443 non-null float64
DescriptionNormalized    361443 non-null object
InvoiceMonth             361443 non-null object
dtypes: float64(2), int64(2), object(8)
memory usage: 33.1+ MB


In [124]:
# Keep independent (deep) snapshots of both splits so that later cells can
# go back to the untransformed data.
df_train_ori, df_test_ori = (frame.copy() for frame in (df_train, df_test))

# Display some data

In [146]:
# Cancellation invoices are the ones whose InvoiceNo starts with 'C'.
# na=True counts rows without a usable InvoiceNo as "not kept", so exactly the rows
# that explicitly do not start with 'C' are retained.
is_cancellation = df['InvoiceNo'].str.startswith('C', na=True)

# Build the filtered frame in one chained, non-mutating expression: resetting the
# index in place on a filtered slice of `df` is what triggers pandas'
# chained-assignment warnings. reset_index() keeps the old labels in an 'index' column.
df_nocancel = df.loc[~is_cancellation].reset_index()

# Total TotalPrice per StockCode over the non-cancelled rows (Series indexed by StockCode)
df_gbproduct = df_nocancel.groupby('StockCode')['TotalPrice'].sum()

In [126]:
# Preview the first two non-cancelled rows (note the 'index' column added by reset_index)
df_nocancel.head(2)

Unnamed: 0,index,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,DescriptionNormalized,InvoiceMonth
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,WHITE HANGING HEART T-LIGHT HOLDER,2010-12-01 00:00:00
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,WHITE METAL LANTERN,2010-12-01 00:00:00


In [127]:
# Row count and dtypes of the non-cancelled subset
df_nocancel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392732 entries, 0 to 392731
Data columns (total 12 columns):
index                    392732 non-null int64
InvoiceNo                392732 non-null object
StockCode                392732 non-null object
Description              392732 non-null object
Quantity                 392732 non-null int64
InvoiceDate              392732 non-null object
UnitPrice                392732 non-null float64
CustomerID               392732 non-null object
Country                  392732 non-null object
TotalPrice               392732 non-null float64
DescriptionNormalized    392732 non-null object
InvoiceMonth             392732 non-null object
dtypes: float64(2), int64(2), object(8)
memory usage: 36.0+ MB


In [128]:
# Parse the textual timestamps with an explicit format. The values look like
# '2010-12-01 08:26:00', so the format has to describe the time part as well:
# a date-only pattern leaves " 08:26:00" unparsed, which strict parsing rejects.
invoice_dates = pd.to_datetime(df_nocancel["InvoiceDate"], format="%Y-%m-%d %H:%M:%S")

In [129]:
# Same conversion without an explicit format: pandas works out the layout itself.
# The result is a datetime64[ns] Series aligned with df_nocancel.
invoice_dates = pd.to_datetime(df_nocancel["InvoiceDate"])

In [130]:
# Age of every invoice line in months relative to a fixed reference date, floored
# at 1, displayed for one sample row.
#
# A "month" is not a fixed duration, and recent pandas versions refuse to convert
# np.timedelta64(1, "M") into one. Use the mean Gregorian month explicitly:
# 365.2425 days / 12 = 30.436875 days = 2,629,746 seconds. This is the value older
# pandas versions substituted for the 'M' unit, so the displayed figure is unchanged.
REFERENCE_DATE = pd.to_datetime('2011-12-09 12:50:00')
AVERAGE_MONTH = pd.Timedelta(seconds=2629746)
SAMPLE_ROW = 123456

months_since_invoice = (REFERENCE_DATE - invoice_dates) / AVERAGE_MONTH
np.maximum(months_since_invoice, 1)[SAMPLE_ROW]

7.000987928111688

In [131]:
# Peek at ten parsed timestamps (rows 2000 to 2009)
invoice_dates[2000:2010]

2000   2010-12-02 09:43:00
2001   2010-12-02 09:43:00
2002   2010-12-02 09:43:00
2003   2010-12-02 09:43:00
2004   2010-12-02 09:43:00
2005   2010-12-02 09:43:00
2006   2010-12-02 09:43:00
2007   2010-12-02 09:43:00
2008   2010-12-02 09:43:00
2009   2010-12-02 09:43:00
Name: InvoiceDate, dtype: datetime64[ns]

In [132]:
# The same rows read from the raw column. .loc slices by label and includes the end
# label (2010), and the values are still plain strings (dtype object).
df_nocancel.loc[2000:2010,'InvoiceDate']

2000    2010-12-02 09:43:00
2001    2010-12-02 09:43:00
2002    2010-12-02 09:43:00
2003    2010-12-02 09:43:00
2004    2010-12-02 09:43:00
2005    2010-12-02 09:43:00
2006    2010-12-02 09:43:00
2007    2010-12-02 09:43:00
2008    2010-12-02 09:43:00
2009    2010-12-02 09:43:00
2010    2010-12-02 09:43:00
Name: InvoiceDate, dtype: object

In [133]:
# Inspect InvoiceMonth for rows 100000 to 100010: first-of-month timestamps stored as strings
df_nocancel.loc[100000:100010,'InvoiceMonth']

100000    2011-04-01 00:00:00
100001    2011-04-01 00:00:00
100002    2011-04-01 00:00:00
100003    2011-04-01 00:00:00
100004    2011-04-01 00:00:00
100005    2011-04-01 00:00:00
100006    2011-04-01 00:00:00
100007    2011-04-01 00:00:00
100008    2011-04-01 00:00:00
100009    2011-04-01 00:00:00
100010    2011-04-01 00:00:00
Name: InvoiceMonth, dtype: object

In [139]:
# Distinct customers that appear on at least one cancellation invoice
# (InvoiceNo starting with 'C'); rows without a usable InvoiceNo are not selected.
cancellation_mask = df['InvoiceNo'].str.startswith('C', na=False)
df.loc[cancellation_mask, 'CustomerID'].unique()

array(['14527', '15311', '17548', ..., '12985', '15951', '16446'],
      dtype=object)

# Top value products (must be saved with the model, and passed to it)

In [155]:
# StockCodes of the 200 products with the largest summed TotalPrice, highest first
top_value_products = df_gbproduct.sort_values(ascending=False).index[:200]

In [156]:
# Display the selected StockCodes (an Index of length 200)
top_value_products

Index(['23843', '22423', '85123A', '85099B', '23166', 'POST', '47566', '84879',
       'M', '23084',
       ...
       '23208', '23300', '22499', '48173C', '23343', '21080', '23076', '22569',
       '22690', '84997A'],
      dtype='object', name='StockCode', length=200)

# Preparation pipeline

In [25]:
# Re-execute functions.py, then re-run the star-import. reload() on its own leaves
# the classes and functions already imported into this notebook bound to their old
# definitions, so edits to functions.py would not reach the cells below.
importlib.reload(sys.modules['functions'])
from functions import *

<module 'functions' from '/home/francois/coding/OC/PJ5/functions.py'>

In [26]:
# Restore both splits from their snapshots. Copy instead of aliasing: if df_train or
# df_test is later modified in place, the snapshots stay intact and this cell can be
# re-run to return to the same starting point.
df_train = df_train_ori.copy(deep=True)
df_test = df_test_ori.copy(deep=True)


In [27]:
# Steps of the preparation pipeline, in execution order.
# TODO: add the log scaling of TotalPrice and the MinMax scaling at the end.
# TODO: do the dimensionality reduction separately for the bag-of-words columns
#       and for the other features.
preparation_steps = [
    ('features_selector', FeaturesSelector(features_toselect=MODEL_FEATURES)),
    ('bow_encoder', BowEncoder()),
    ('agregate_to_client_level', AgregateToClientLevel()),

    # Steps that are currently disabled:
    #('hour_extractor', HHMM_to_HH()),
    #('data_converter', HHMM_to_Minutes()),
    #('categoricalfeatures_1hotencoder', CategoricalFeatures1HotEncoder()),
    #('minmaxscaler', MinMaxScalerMultiple(features_toscale=MODEL_1HOTALL_FEATURES_QUANTITATIVE)),
]

preparation_pipeline = Pipeline(preparation_steps)

In [28]:
# Fit the preparation pipeline on the training split and replace df_train with its output.
# NOTE(review): df_train is overwritten here, so re-running this cell on its own would
# feed the already-transformed frame back into the pipeline; re-run the cell that
# restores df_train from df_train_ori first.
df_train = preparation_pipeline.fit_transform(df_train)

Features selected (in order): Index(['CustomerID', 'DescriptionNormalized', 'InvoiceDate', 'InvoiceMonth',
       'InvoiceNo', 'TotalPrice'],
      dtype='object')
Fit data
categorical_features_totransform == ['DescriptionNormalized']
!! categorical_features_totransform['DescriptionNormalized']
Transform data


In [30]:
# Preview the transformed training data: the selected columns followed by one
# 'DescriptionNormalized_<word>' indicator column per word.
df_train.head(6)

Unnamed: 0,CustomerID,DescriptionNormalized,InvoiceDate,InvoiceMonth,InvoiceNo,TotalPrice,DescriptionNormalized_10,DescriptionNormalized_11,DescriptionNormalized_12,DescriptionNormalized_20,DescriptionNormalized_24,DescriptionNormalized_36,DescriptionNormalized_3d,DescriptionNormalized_50,DescriptionNormalized_60,DescriptionNormalized_72,DescriptionNormalized_acapulco,DescriptionNormalized_acrylic,DescriptionNormalized_airline,DescriptionNormalized_alarm,DescriptionNormalized_alphabet,DescriptionNormalized_am,DescriptionNormalized_and,DescriptionNormalized_angel,DescriptionNormalized_animals,DescriptionNormalized_ant,DescriptionNormalized_antique,DescriptionNormalized_apothecary,DescriptionNormalized_apple,DescriptionNormalized_apples,DescriptionNormalized_apron,DescriptionNormalized_area,DescriptionNormalized_art,DescriptionNormalized_assorted,DescriptionNormalized_asstd,DescriptionNormalized_babushka,DescriptionNormalized_baby,DescriptionNormalized_backpack,DescriptionNormalized_bag,DescriptionNormalized_bakelike,DescriptionNormalized_baking,DescriptionNormalized_ball,DescriptionNormalized_balloon,DescriptionNormalized_balloons,DescriptionNormalized_bank,DescriptionNormalized_baroque,DescriptionNormalized_basket,DescriptionNormalized_bath,DescriptionNormalized_bathroom,DescriptionNormalized_baths,DescriptionNormalized_beaker,DescriptionNormalized_bell,DescriptionNormalized_bells,DescriptionNormalized_bicycle,DescriptionNormalized_bill,DescriptionNormalized_billboard,DescriptionNormalized_bin,DescriptionNormalized_bird,DescriptionNormalized_birdhouse,DescriptionNormalized_birthday,DescriptionNormalized_biscuit,DescriptionNormalized_biscuits,DescriptionNormalized_black,DescriptionNormalized_block,DescriptionNormalized_blocks,DescriptionNormalized_blossom,DescriptionNormalized_blue,DescriptionNormalized_board,DescriptionNormalized_book,DescriptionNormalized_books,DescriptionNormalized_botanical,DescriptionNormalized_bottle,DescriptionNormalized_bowl,DescriptionNormal
ized_bowls,DescriptionNormalized_box,DescriptionNormalized_boxed,DescriptionNormalized_boxes,DescriptionNormalized_bracelet,DescriptionNormalized_bread,DescriptionNormalized_breakfast,DescriptionNormalized_brown,DescriptionNormalized_brush,DescriptionNormalized_bucket,DescriptionNormalized_buffalo,DescriptionNormalized_building,DescriptionNormalized_bundle,DescriptionNormalized_bunny,DescriptionNormalized_bunting,DescriptionNormalized_butterfly,DescriptionNormalized_cabinet,DescriptionNormalized_cake,DescriptionNormalized_cakes,DescriptionNormalized_cakestand,DescriptionNormalized_calendar,DescriptionNormalized_calm,DescriptionNormalized_can,DescriptionNormalized_candle,DescriptionNormalized_candleholder,DescriptionNormalized_candles,DescriptionNormalized_card,...,DescriptionNormalized_star,DescriptionNormalized_stars,DescriptionNormalized_stationery,DescriptionNormalized_stencil,DescriptionNormalized_sticker,DescriptionNormalized_stickers,DescriptionNormalized_sticks,DescriptionNormalized_stocking,DescriptionNormalized_storage,DescriptionNormalized_strawberry,DescriptionNormalized_straws,DescriptionNormalized_string,DescriptionNormalized_stripe,DescriptionNormalized_stripes,DescriptionNormalized_stripey,DescriptionNormalized_strongman,DescriptionNormalized_style,DescriptionNormalized_sugar,DescriptionNormalized_suki,DescriptionNormalized_swallows,DescriptionNormalized_sweet,DescriptionNormalized_sweetheart,DescriptionNormalized_swirly,DescriptionNormalized_sympathy,DescriptionNormalized_table,DescriptionNormalized_tag,DescriptionNormalized_tags,DescriptionNormalized_tails,DescriptionNormalized_tall,DescriptionNormalized_tape,DescriptionNormalized_tea,DescriptionNormalized_teacup,DescriptionNormalized_teapot,DescriptionNormalized_teatime,DescriptionNormalized_the,DescriptionNormalized_thermometer,DescriptionNormalized_tidy,DescriptionNormalized_tier,DescriptionNormalized_tile,DescriptionNormalized_time,DescriptionNormalized_tin,DescriptionNormalized_tins,Description
Normalized_tissue,DescriptionNormalized_tissues,DescriptionNormalized_to,DescriptionNormalized_toadstool,DescriptionNormalized_toadstools,DescriptionNormalized_toilet,DescriptionNormalized_tonic,DescriptionNormalized_top,DescriptionNormalized_tote,DescriptionNormalized_towel,DescriptionNormalized_towels,DescriptionNormalized_toy,DescriptionNormalized_traditional,DescriptionNormalized_travel,DescriptionNormalized_tray,DescriptionNormalized_treasure,DescriptionNormalized_tree,DescriptionNormalized_trellis,DescriptionNormalized_trim,DescriptionNormalized_trinket,DescriptionNormalized_triple,DescriptionNormalized_tube,DescriptionNormalized_tv,DescriptionNormalized_umbrella,DescriptionNormalized_union,DescriptionNormalized_up,DescriptionNormalized_vanilla,DescriptionNormalized_victorian,DescriptionNormalized_village,DescriptionNormalized_vintage,DescriptionNormalized_wall,DescriptionNormalized_wallet,DescriptionNormalized_war,DescriptionNormalized_warmer,DescriptionNormalized_washing,DescriptionNormalized_water,DescriptionNormalized_watering,DescriptionNormalized_welcome,DescriptionNormalized_white,DescriptionNormalized_wick,DescriptionNormalized_wicker,DescriptionNormalized_willie,DescriptionNormalized_wine,DescriptionNormalized_winkie,DescriptionNormalized_wire,DescriptionNormalized_with,DescriptionNormalized_wood,DescriptionNormalized_wooden,DescriptionNormalized_woodland,DescriptionNormalized_word,DescriptionNormalized_world,DescriptionNormalized_wrap,DescriptionNormalized_wreath,DescriptionNormalized_writing,DescriptionNormalized_yellow,DescriptionNormalized_you,DescriptionNormalized_your,DescriptionNormalized_zinc
0,14675,STRAWBERRY LUNCH BOX WITH CUTLERY,2011-11-23 12:58:00,2011-11-01 00:00:00,578255,2.55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,16102,36 DOILIES VINTAGE CHRISTMAS,2011-09-02 11:54:00,2011-09-01 00:00:00,565292,17.4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,17509,RIBBON REEL HEARTS DESIGN,2011-10-10 08:58:00,2011-10-01 00:00:00,570249,16.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,16369,DECORATIVE WICKER HEART SMALL,2011-11-21 16:25:00,2011-11-01 00:00:00,577785,15.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,16442,STRAWBERRY LUNCH BOX WITH CUTLERY,2011-12-08 11:43:00,2011-12-01 00:00:00,581310,2.55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5,15883,TEA FOR ONE POLKADOT,2011-12-07 17:05:00,2011-12-01 00:00:00,581192,8.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
