In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Timedelta
import numpy as np
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation notices —
# consider narrowing it to specific categories.
warnings.simplefilter("ignore")

# Show all columns when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# SMARTPHONE

In [2]:
# Read the smartphone dataset; the path is relative to the notebook's working directory.
smartphones = pd.read_csv(filepath_or_buffer='./Working Code/Datasets/smartphones1.csv')

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Categorical predictors that are expanded into one indicator column per level.
categorical_cols = ['TIPOLOGIA_PRODOTTO', 'OPERATOR', 'Year']

# Predictors used by every smartphone model below.
X = smartphones[[
    'PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
    'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
    'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
    'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
    'DISPLAY_SIZE', 'OPERATOR', 'NUMBER_OF_SIM',
    'CAPACITY', 'GENERATION', 'OPERATING_SYST', 'Month_Redditivity', 'Year',
    'SCONTO_PERC_MEDIO_VOLANTINO', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA',
]]
# Target kept as a single-column 2-D array; later cells rely on that shape
# (they call y.ravel() and read lr.coef_.tolist()[0]).
y = smartphones[['QTA']].values

# Keep the encoder's default sparse output and densify explicitly: the `sparse=`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so
# OneHotEncoder(sparse=False) raises TypeError on current releases.
# NOTE(review): every level is kept (no `drop=`), so each feature's indicator
# columns sum to 1 and their linear-regression coefficients are not unique.
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat aligns on index; reuse X's so rows cannot drift apart
)

# Swap the raw categorical columns for their indicator columns.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(categorical_cols, axis=1)

## BENCHMARK

Our benchmark forecast multiplies each product's average daily historical sales (`QTA_storico / 30`) by the number of days the flyer is online (`DURATA_VOLANTINO_IN_GIORNI`).

In [4]:

from sklearn.metrics import mean_squared_error as mse

# Naive baseline: QTA_storico divided by 30, then multiplied by
# DURATA_VOLANTINO_IN_GIORNI. Rows with a missing target are dropped first.
# NOTE(review): the divisor 30 presumably means QTA_storico is a 30-day total — confirm.
bench = (
    smartphones
    .copy(deep=True)
    .dropna(subset=['QTA'])
    .assign(Previsione=lambda frame: frame['QTA_storico'] / 30 * frame['DURATA_VOLANTINO_IN_GIORNI'])
)

# Root mean squared error of the baseline forecast against the actual quantity.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))

print(f'RMSE = {rmse}')

RMSE = 560.964044318735


## LINEAR REGRESSION

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (the scaler is fitted on the full feature table).
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# The scorer returns negated MSE; negate it back before taking the square root.
lr_scores = cross_val_score(
    estimator=lr,
    X=final_df_scaled,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
lr_rmse_scores = np.sqrt(np.negative(lr_scores))

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [387.98317872 383.66799769 408.35662719 477.69460499 416.47195921
 431.32142846 375.13961058 394.80470503 599.06876356 464.11491227]
Linear Regression Average RMSE: 433.8623787697528


In [6]:
# Refit on the complete scaled feature matrix so the fitted weights can be inspected.
lr.fit(final_df_scaled, y)

column_names = final_df.columns.tolist()
# y is a single-column 2-D array, so coef_ has one row; take that row as a plain list.
coefficients = lr.coef_[0].tolist()

print("\nCoefficients:")
for feature, coef in zip(column_names, coefficients):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: 89.59451333774543
PREZZO_LISTINO: -204.76404583311978
STOCK_PZ: 121.17951797137482
SCONTO_PERC: 41.03912149905196
KPI_1: 62.079898332851904
KPI_2: -28.03368825675091
KPI_3: 100.2718838081358
KPI_4: 74.96525807712786
KPI_5: 0.8536218965956865
QTA_storico: 149.02465746064485
FATTURATO_storico: -3.5388103862884432
DURATA_VOLANTINO_IN_GIORNI: 63.75247839072945
DISPLAY_SIZE: 41.166733326246415
NUMBER_OF_SIM: -11.652266072837818
CAPACITY: 20.08169922763983
GENERATION: -20.487933644459407
OPERATING_SYST: 77.37179883561758
Month_Redditivity: 43.86587004224931
SCONTO_PERC_MEDIO_VOLANTINO: 4.018875798233514
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 28.27184531238472
TIPOLOGIA_PRODOTTO_ND: 150458682409655.72
TIPOLOGIA_PRODOTTO_Principale: 104398302555223.39
TIPOLOGIA_PRODOTTO_Sostitutivo: 131186979120303.69
OPERATOR_SIM FREE: 130931663178879.45
OPERATOR_TIM: 79991080730751.05
OPERATOR_VODAFONE: 83766711241117.4
OPERATOR_WINDTRE: 68030510088703.95
Year_COVID: -18794897594089.34

## RANDOM FOREST

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest as well: with the default random_state=None its random sampling
# changes on every run, so the scores below could not be reproduced.
rfr = RandomForestRegressor(random_state=42)

# Pass the target as 1-D. A column-vector y makes scikit-learn emit a
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# The scorer returns negated MSE; flip the sign before taking the square root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [342.86923389 317.35524288 347.18851015 387.65229762 345.86212656
 400.51550387 322.54816895 360.30887006 470.61115106 390.15908124]
Random Forest Average RMSE: 368.50701862839753


In [7]:
# Seed the forest before the final fit: the estimator was created with the default
# random_state=None, so the importances printed below would change on every run.
rfr.set_params(random_state=42)
# Fit on all rows with a 1-D target. A column-vector y triggers scikit-learn's
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.052204181110103505
PREZZO_LISTINO: 0.02712499756443851
STOCK_PZ: 0.13510755486776177
SCONTO_PERC: 0.04593173597292109
KPI_1: 0.08280704642927933
KPI_2: 0.029009393551002762
KPI_3: 0.04180711707547522
KPI_4: 0.029499944296485805
KPI_5: 0.016226648726635103
QTA_storico: 0.34008472093407943
FATTURATO_storico: 0.02936153097775698
DURATA_VOLANTINO_IN_GIORNI: 0.02944614708248621
DISPLAY_SIZE: 0.02985497851436045
NUMBER_OF_SIM: 0.0013433366562624165
CAPACITY: 0.01219751588265356
GENERATION: 0.0017443744660710747
OPERATING_SYST: 0.0005526826107185999
Month_Redditivity: 0.006027952519446364
SCONTO_PERC_MEDIO_VOLANTINO: 0.032193821340966614
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.01414665388346554
TIPOLOGIA_PRODOTTO_ND: 0.0015581870602613917
TIPOLOGIA_PRODOTTO_Principale: 0.007501158884815568
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.0025596233770920677
OPERATOR_SIM FREE: 0.0036012084765195455
OPERATOR_TIM: 0.002230470979660051
OPERATOR_VODAFONE: 0.0034941204519086193
OPER

## XGBoost

In [8]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Gradient-boosted regressor left at the library's default hyperparameters.
xgb_model = xgb.XGBRegressor()

xgb_scores = cross_val_score(
    estimator=xgb_model,
    X=final_df,
    y=y.ravel(),  # flatten the single-column target to 1-D
    scoring='neg_mean_squared_error',
    cv=kf,
)
# The scorer returns negated MSE; negate it back before taking the square root.
xgb_rmse_scores = np.sqrt(np.negative(xgb_scores))

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [313.69550689 329.0097739  325.95609309 381.63725812 346.26636486
 374.32544375 300.32323234 354.09284919 428.00629042 362.31697241]
XGBoost Average RMSE: 351.5629784970521


In [9]:
# Fresh default-parameter model fitted on all rows; the target is flattened to 1-D.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=final_df, y=y.ravel())

print("Feature Importance:")
# Read the importances once, then pair each with its column name in column order.
importances = xgb_model.feature_importances_
for feature, importance in zip(final_df.columns, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.02413618378341198
PREZZO_LISTINO: 0.020300980657339096
STOCK_PZ: 0.04457062855362892
SCONTO_PERC: 0.025412552058696747
KPI_1: 0.04950347915291786
KPI_2: 0.01902691461145878
KPI_3: 0.01521324273198843
KPI_4: 0.021379491314291954
KPI_5: 0.015769246965646744
QTA_storico: 0.15081554651260376
FATTURATO_storico: 0.011695192195475101
DURATA_VOLANTINO_IN_GIORNI: 0.027948083356022835
DISPLAY_SIZE: 0.03997332230210304
NUMBER_OF_SIM: 0.010641309432685375
CAPACITY: 0.014994444325566292
GENERATION: 0.011707930825650692
OPERATING_SYST: 0.009996605105698109
Month_Redditivity: 0.017152458429336548
SCONTO_PERC_MEDIO_VOLANTINO: 0.017643511295318604
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.013715360313653946
TIPOLOGIA_PRODOTTO_ND: 0.04108757898211479
TIPOLOGIA_PRODOTTO_Principale: 0.029171323403716087
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.016062594950199127
OPERATOR_SIM FREE: 0.033843107521533966
OPERATOR_TIM: 0.020716538652777672
OPERATOR_VODAFONE: 0.03135155141353607
OPERATOR_

# PC

In [5]:
# Read the PC dataset; the path is relative to the notebook's working directory.
pc = pd.read_csv(filepath_or_buffer='./Working Code/Datasets/pc1.csv')

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Categorical predictors that are expanded into one indicator column per level.
categorical_cols = ['TIPOLOGIA_PRODOTTO', 'Year']

# Predictors used by every PC model below.
X = pc[[
    'PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
    'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
    'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
    'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
    'STORAGE_GB', 'RAM_GB', 'DISPLAY_SIZE', 'CONVERTIBLE',
    'SCONTO_PERC_MEDIO_VOLANTINO', 'VERSION_NOT_DEFINED',
    'VERSION_INTEL', 'VERSION_APPLE', 'VERSION_RAD',
    'VERSION_GEF', 'QUALITY_VERSION', 'QUALITY_PROCESSOR',
    'Month_Redditivity', 'Year', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA',
]]
# Target kept as a single-column 2-D array; later cells rely on that shape
# (they call y.ravel() and read lr.coef_.tolist()[0]).
y = pc[['QTA']].values

# Keep the encoder's default sparse output and densify explicitly: the `sparse=`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so
# OneHotEncoder(sparse=False) raises TypeError on current releases.
# NOTE(review): every level is kept (no `drop=`), so each feature's indicator
# columns sum to 1 and their linear-regression coefficients are not unique.
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat aligns on index; reuse X's so rows cannot drift apart
)

# Swap the raw categorical columns for their indicator columns.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(categorical_cols, axis=1)

## BENCHMARK

In [7]:
from sklearn.metrics import mean_squared_error as mse

# Naive baseline: QTA_storico divided by 30, then multiplied by
# DURATA_VOLANTINO_IN_GIORNI. Rows with a missing target are dropped first.
# NOTE(review): the divisor 30 presumably means QTA_storico is a 30-day total — confirm.
bench = (
    pc
    .copy(deep=True)
    .dropna(subset=['QTA'])
    .assign(Previsione=lambda frame: frame['QTA_storico'] / 30 * frame['DURATA_VOLANTINO_IN_GIORNI'])
)

# Root mean squared error of the baseline forecast against the actual quantity.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))

print(f'RMSE = {rmse}')

RMSE = 420.96770029356077


## LINEAR REGRESSION

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (the scaler is fitted on the full feature table).
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# The scorer returns negated MSE; negate it back before taking the square root.
lr_scores = cross_val_score(
    estimator=lr,
    X=final_df_scaled,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
lr_rmse_scores = np.sqrt(np.negative(lr_scores))

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [266.61096966 334.82275409 271.98714692 319.9776972  311.37090606
 241.3886163  300.25994251 257.50240456 260.86793625 272.18837438]
Linear Regression Average RMSE: 283.6976747923588


In [14]:
# Refit on the complete scaled feature matrix so the fitted weights can be inspected.
lr.fit(final_df_scaled, y)

column_names = final_df.columns.tolist()
# y is a single-column 2-D array, so coef_ has one row; take that row as a plain list.
coefficients = lr.coef_[0].tolist()

print("\nCoefficients:")
for feature, coef in zip(column_names, coefficients):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: -104.47904977820562
PREZZO_LISTINO: -4.479317858557824
STOCK_PZ: 36.178620338331726
SCONTO_PERC: 10.123848265972653
KPI_1: 53.62487192961656
KPI_2: -6.760468754108146
KPI_3: 2.2418922700884045
KPI_4: 15.206781169774587
KPI_5: -23.499833980311752
QTA_storico: 46.33357309776275
FATTURATO_storico: -18.37310295212628
DURATA_VOLANTINO_IN_GIORNI: 53.81387519789695
STORAGE_GB: 1.267891827893151
RAM_GB: 21.303720391967083
DISPLAY_SIZE: 104.49417380107266
CONVERTIBLE: -14.652722242546268
SCONTO_PERC_MEDIO_VOLANTINO: 28.240710723917598
VERSION_NOT_DEFINED: -23.46606225389116
VERSION_INTEL: -0.6742668399782262
VERSION_APPLE: 68.46124833902796
VERSION_RAD: 3.0810787866712093
VERSION_GEF: -53.46190829671742
QUALITY_VERSION: 0.12870195817129693
QUALITY_PROCESSOR: 11.004802988255756
Month_Redditivity: 8.985756659447837
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 30.126775724175904
TIPOLOGIA_PRODOTTO_ND: 77.29214888492645
TIPOLOGIA_PRODOTTO_Principale: -41.753903950383176
TIPOLOGIA_P

## RANDOM FOREST

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest as well: with the default random_state=None its random sampling
# changes on every run, so the scores below could not be reproduced.
rfr = RandomForestRegressor(random_state=42)

# Pass the target as 1-D. A column-vector y makes scikit-learn emit a
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# The scorer returns negated MSE; flip the sign before taking the square root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [271.04389127 303.76289418 244.47126654 289.34099406 282.02172039
 218.57602422 278.805499   230.39434126 211.17772197 273.99664187]
Random Forest Average RMSE: 260.35909947529274


In [16]:
# Seed the forest before the final fit: the estimator was created with the default
# random_state=None, so the importances printed below would change on every run.
rfr.set_params(random_state=42)
# Fit on all rows with a 1-D target. A column-vector y triggers scikit-learn's
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.09284596616800192
PREZZO_LISTINO: 0.1556259835304594
STOCK_PZ: 0.08236918369961202
SCONTO_PERC: 0.04682179457737161
KPI_1: 0.08582456735911619
KPI_2: 0.03115337381723927
KPI_3: 0.036560997205427326
KPI_4: 0.04450703772890848
KPI_5: 0.023261955036515493
QTA_storico: 0.03698761084875058
FATTURATO_storico: 0.027870099377745268
DURATA_VOLANTINO_IN_GIORNI: 0.02878186613179083
STORAGE_GB: 0.015233012713569617
RAM_GB: 0.01240692002654614
DISPLAY_SIZE: 0.05256404889297229
CONVERTIBLE: 0.00037341083197829655
SCONTO_PERC_MEDIO_VOLANTINO: 0.10088750947652718
VERSION_NOT_DEFINED: 0.00031945828072396556
VERSION_INTEL: 0.005704984240199796
VERSION_APPLE: 0.0008794525685356478
VERSION_RAD: 0.004115384926118034
VERSION_GEF: 0.003524781597440699
QUALITY_VERSION: 0.012076737558881142
QUALITY_PROCESSOR: 0.004601468545042682
Month_Redditivity: 0.008121001455372128
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.021569278510222122
TIPOLOGIA_PRODOTTO_ND: 0.028106990274596552
TIPOLOGIA

## XGBoost

In [17]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Gradient-boosted regressor left at the library's default hyperparameters.
xgb_model = xgb.XGBRegressor()

xgb_scores = cross_val_score(
    estimator=xgb_model,
    X=final_df,
    y=y.ravel(),  # flatten the single-column target to 1-D
    scoring='neg_mean_squared_error',
    cv=kf,
)
# The scorer returns negated MSE; negate it back before taking the square root.
xgb_rmse_scores = np.sqrt(np.negative(xgb_scores))

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [276.96197546 296.55922859 256.85806344 291.73871207 259.71991994
 211.61272483 277.89664302 221.42522022 220.55286913 271.88395665]
XGBoost Average RMSE: 258.52093133626147


In [18]:
# Fresh default-parameter model fitted on all rows; the target is flattened to 1-D.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=final_df, y=y.ravel())

print("Feature Importance:")
# Read the importances once, then pair each with its column name in column order.
importances = xgb_model.feature_importances_
for feature, importance in zip(final_df.columns, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.01608029566705227
PREZZO_LISTINO: 0.05011957883834839
STOCK_PZ: 0.011292696930468082
SCONTO_PERC: 0.012758077122271061
KPI_1: 0.02795729786157608
KPI_2: 0.013249324634671211
KPI_3: 0.013064149767160416
KPI_4: 0.012272121384739876
KPI_5: 0.011822122149169445
QTA_storico: 0.014677504077553749
FATTURATO_storico: 0.011305294930934906
DURATA_VOLANTINO_IN_GIORNI: 0.015070145949721336
STORAGE_GB: 0.02327827550470829
RAM_GB: 0.04239007458090782
DISPLAY_SIZE: 0.07268953323364258
CONVERTIBLE: 0.024642307311296463
SCONTO_PERC_MEDIO_VOLANTINO: 0.022859696298837662
VERSION_NOT_DEFINED: 0.011583608575165272
VERSION_INTEL: 0.021241283044219017
VERSION_APPLE: 0.04189718887209892
VERSION_RAD: 0.005541618447750807
VERSION_GEF: 0.0038524200208485126
QUALITY_VERSION: 0.01558564230799675
QUALITY_PROCESSOR: 0.02759852632880211
Month_Redditivity: 0.008536459878087044
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.008320294320583344
TIPOLOGIA_PRODOTTO_ND: 0.21995025873184204
TIPOLOGIA_

# WASH

In [19]:
# Read the wash dataset; the path is relative to the notebook's working directory.
wash = pd.read_csv(filepath_or_buffer='./Working Code/Datasets/wash1.csv')

In [20]:
from sklearn.preprocessing import OneHotEncoder

# Categorical predictors that are expanded into one indicator column per level.
categorical_cols = ['TIPOLOGIA_PRODOTTO', 'Year']

# Predictors used by every wash model below.
X = wash[[
    'PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
    'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
    'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
    'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
    'TYPE', 'TOPLOADING', 'LOADING_KG', 'ENERGY_CLASS',
    'DEPTH_CM>48', 'SMART_CONNECT', 'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity', 'Year',
    'SCONTO_PERC_MEDIO_NOME_CAMPAGNA',
]]
# Target kept as a single-column 2-D array; later cells rely on that shape
# (they call y.ravel() and read lr.coef_.tolist()[0]).
y = wash[['QTA']].values

# Keep the encoder's default sparse output and densify explicitly: the `sparse=`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so
# OneHotEncoder(sparse=False) raises TypeError on current releases.
# NOTE(review): every level is kept (no `drop=`), so each feature's indicator
# columns sum to 1 and their linear-regression coefficients are not unique.
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat aligns on index; reuse X's so rows cannot drift apart
)

# Swap the raw categorical columns for their indicator columns.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(categorical_cols, axis=1)

## BENCHMARK

In [21]:
from sklearn.metrics import mean_squared_error as mse

# Naive baseline: QTA_storico divided by 30, then multiplied by
# DURATA_VOLANTINO_IN_GIORNI. Rows with a missing target are dropped first.
# NOTE(review): the divisor 30 presumably means QTA_storico is a 30-day total — confirm.
bench = (
    wash
    .copy(deep=True)
    .dropna(subset=['QTA'])
    .assign(Previsione=lambda frame: frame['QTA_storico'] / 30 * frame['DURATA_VOLANTINO_IN_GIORNI'])
)

# Root mean squared error of the baseline forecast against the actual quantity.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))

print(f'RMSE = {rmse}')

RMSE = 380.2144885995804


## LINEAR REGRESSION

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (the scaler is fitted on the full feature table).
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# The scorer returns negated MSE; negate it back before taking the square root.
lr_scores = cross_val_score(
    estimator=lr,
    X=final_df_scaled,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
lr_rmse_scores = np.sqrt(np.negative(lr_scores))

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [225.92227645 256.49182295 190.30071461 242.13494386 244.08410054
 256.82024865 206.16965371 207.06830401 224.67012361 210.79569932]
Linear Regression Average RMSE: 226.4457887713116


In [24]:
# Refit on the complete scaled feature matrix so the fitted weights can be inspected.
lr.fit(final_df_scaled, y)

column_names = final_df.columns.tolist()
# y is a single-column 2-D array, so coef_ has one row; take that row as a plain list.
coefficients = lr.coef_[0].tolist()

print("\nCoefficients:")
for feature, coef in zip(column_names, coefficients):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: -80.50101651608371
PREZZO_LISTINO: -17.896830070840636
STOCK_PZ: 15.84264150195746
SCONTO_PERC: 80.41498185868315
KPI_1: 18.668369363862272
KPI_2: 13.926087025970821
KPI_3: -11.04645004642272
KPI_4: 4.752161515249825
KPI_5: 14.034271849376612
QTA_storico: 4.465114955730477
FATTURATO_storico: 7.221721111338954
DURATA_VOLANTINO_IN_GIORNI: 94.61804712441827
TYPE: 20.285302243657288
TOPLOADING: -56.62768743737433
LOADING_KG: -28.411136917011618
ENERGY_CLASS: -1.6288579444210574
DEPTH_CM>48: 25.59694477711006
SMART_CONNECT: 4.7326048796128015
SCONTO_PERC_MEDIO_VOLANTINO: -1.5922537133684935
Month_Redditivity: 5.855502945914851
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 27.21312116631884
TIPOLOGIA_PRODOTTO_ND: 7.720401002118092
TIPOLOGIA_PRODOTTO_Principale: -5.559236046113472
TIPOLOGIA_PRODOTTO_Sostitutivo: -15.384033529657195
Year_COVID: -24.346582869479285
Year_POST_COVID: -3.1666292284169426
Year_PRE_COVID: 31.859314045759437


## RANDOM FOREST

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest as well: with the default random_state=None its random sampling
# changes on every run, so the scores below could not be reproduced.
rfr = RandomForestRegressor(random_state=42)

# Pass the target as 1-D. A column-vector y makes scikit-learn emit a
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# The scorer returns negated MSE; flip the sign before taking the square root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [178.78653604 218.4309478  200.66966456 209.88725107 192.45641156
 208.7955757  194.68932061 167.37504778 205.43360797 166.73975151]
Random Forest Average RMSE: 194.3264114588316


In [26]:
# Seed the forest before the final fit: the estimator was created with the default
# random_state=None, so the importances printed below would change on every run.
rfr.set_params(random_state=42)
# Fit on all rows with a 1-D target. A column-vector y triggers scikit-learn's
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.2860511933553144
PREZZO_LISTINO: 0.025655716551598887
STOCK_PZ: 0.03106649261129721
SCONTO_PERC: 0.12854938563004048
KPI_1: 0.02889171386531852
KPI_2: 0.02945379834939563
KPI_3: 0.03413756667073382
KPI_4: 0.06863753800537802
KPI_5: 0.022483693072749913
QTA_storico: 0.01793990499325171
FATTURATO_storico: 0.019762426066062013
DURATA_VOLANTINO_IN_GIORNI: 0.16343466015206376
TYPE: 0.0002983724933863497
TOPLOADING: 0.0003263442459782513
LOADING_KG: 0.010245961847662848
ENERGY_CLASS: 0.010170917358556067
DEPTH_CM>48: 0.0017645297243704208
SMART_CONNECT: 0.005418183671646396
SCONTO_PERC_MEDIO_VOLANTINO: 0.04572879250743726
Month_Redditivity: 0.00868817568395662
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.017460827143186277
TIPOLOGIA_PRODOTTO_ND: 0.0012570390656949679
TIPOLOGIA_PRODOTTO_Principale: 0.0019715463576678183
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.00026169184556446717
Year_COVID: 0.0040909516139983814
Year_POST_COVID: 0.0017642725919058168
Year_PRE_COVID: 0.034

## XGBoost

In [24]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Gradient-boosted regressor left at the library's default hyperparameters.
xgb_model = xgb.XGBRegressor()

xgb_scores = cross_val_score(
    estimator=xgb_model,
    X=final_df,
    y=y.ravel(),  # flatten the single-column target to 1-D
    scoring='neg_mean_squared_error',
    cv=kf,
)
# The scorer returns negated MSE; negate it back before taking the square root.
xgb_rmse_scores = np.sqrt(np.negative(xgb_scores))

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [186.71821747 212.25694785 200.53454398 207.00025948 191.980312
 217.97568829 193.42591026 166.66016883 206.06863423 168.18199812]
XGBoost Average RMSE: 195.08026805190624


In [25]:
# Fresh default-parameter model fitted on all rows; the target is flattened to 1-D.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=final_df, y=y.ravel())

print("Feature Importance:")
# Read the importances once, then pair each with its column name in column order.
importances = xgb_model.feature_importances_
for feature, importance in zip(final_df.columns, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.09155365079641342
PREZZO_LISTINO: 0.012232553213834763
STOCK_PZ: 0.010479087010025978
SCONTO_PERC: 0.07134723663330078
KPI_1: 0.01686110906302929
KPI_2: 0.015372331254184246
KPI_3: 0.01789514534175396
KPI_4: 0.03676408901810646
KPI_5: 0.013056125491857529
QTA_storico: 0.011358445510268211
FATTURATO_storico: 0.010274804197251797
DURATA_VOLANTINO_IN_GIORNI: 0.1643996238708496
TYPE: 0.0
TOPLOADING: 0.0
LOADING_KG: 0.01826060377061367
ENERGY_CLASS: 0.014944680966436863
DEPTH_CM>48: 0.02831684984266758
SMART_CONNECT: 0.012578881345689297
SCONTO_PERC_MEDIO_VOLANTINO: 0.023950373753905296
Month_Redditivity: 0.02479427121579647
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.0285690575838089
TIPOLOGIA_PRODOTTO_ND: 0.03069634921848774
TIPOLOGIA_PRODOTTO_Principale: 0.025041092187166214
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.024008318781852722
Year_COVID: 0.03942021355032921
Year_POST_COVID: 0.015133017674088478
Year_PRE_COVID: 0.24269205331802368


# CORE WEAR

In [27]:
# Read the core-wear dataset; the path is relative to the notebook's working directory.
core_wear = pd.read_csv(filepath_or_buffer='./Working Code/Datasets/core_wear1.csv')

In [28]:
from sklearn.preprocessing import OneHotEncoder

# Categorical predictors that are expanded into one indicator column per level.
categorical_cols = ['TIPOLOGIA_PRODOTTO', 'Year']

# Predictors used by every core-wear model below.
X = core_wear[[
    'PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
    'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
    'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
    'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
    'DISPLAY_QUALITY', 'BLUETOOTH', 'WIFI', 'DISPLAY_SIZE',
    'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity', 'Year', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA',
]]
# Target kept as a single-column 2-D array; later cells rely on that shape
# (they call y.ravel() and read lr.coef_.tolist()[0]).
y = core_wear[['QTA']].values

# Keep the encoder's default sparse output and densify explicitly: the `sparse=`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so
# OneHotEncoder(sparse=False) raises TypeError on current releases.
# NOTE(review): every level is kept (no `drop=`), so each feature's indicator
# columns sum to 1 and their linear-regression coefficients are not unique.
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat aligns on index; reuse X's so rows cannot drift apart
)

# Swap the raw categorical columns for their indicator columns.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(categorical_cols, axis=1)

## BENCHMARK

In [29]:
from sklearn.metrics import mean_squared_error as mse

# Naive baseline: QTA_storico divided by 30, then multiplied by
# DURATA_VOLANTINO_IN_GIORNI. Rows with a missing target are dropped first.
# NOTE(review): the divisor 30 presumably means QTA_storico is a 30-day total — confirm.
bench = (
    core_wear
    .copy(deep=True)
    .dropna(subset=['QTA'])
    .assign(Previsione=lambda frame: frame['QTA_storico'] / 30 * frame['DURATA_VOLANTINO_IN_GIORNI'])
)

# Root mean squared error of the baseline forecast against the actual quantity.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))

print(f'RMSE = {rmse}')

RMSE = 301.5367598519154


## LINEAR REGRESSION

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (the scaler is fitted on the full feature table).
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# The scorer returns negated MSE; negate it back before taking the square root.
lr_scores = cross_val_score(
    estimator=lr,
    X=final_df_scaled,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
lr_rmse_scores = np.sqrt(np.negative(lr_scores))

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [179.48523689 224.55724371 309.90870447 354.11716744 292.5629471
 161.64326951 156.29192419 281.56669054 166.03254691 203.11843897]
Linear Regression Average RMSE: 232.9284169726456


In [33]:
# Refit on the complete scaled feature matrix so the fitted weights can be inspected.
lr.fit(final_df_scaled, y)

column_names = final_df.columns.tolist()
# y is a single-column 2-D array, so coef_ has one row; take that row as a plain list.
coefficients = lr.coef_[0].tolist()

print("\nCoefficients:")
for feature, coef in zip(column_names, coefficients):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: 91.16859819359848
PREZZO_LISTINO: -115.5257704804783
STOCK_PZ: 166.27203105224575
SCONTO_PERC: 27.944477774928355
KPI_1: 48.872946319297235
KPI_2: 24.637891744722687
KPI_3: 36.72807127339222
KPI_4: 7.385951713878828
KPI_5: -4.8504143074372
QTA_storico: 104.0724671679581
FATTURATO_storico: -21.446095710826178
DURATA_VOLANTINO_IN_GIORNI: 16.72220081232369
DISPLAY_QUALITY: 8.85948074482248
BLUETOOTH: -5.617153761966526
WIFI: -1.7944909026994162
DISPLAY_SIZE: -30.329443876801534
SCONTO_PERC_MEDIO_VOLANTINO: 17.316658060592264
Month_Redditivity: 18.19409895017826
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: -1.0033824744148665
TIPOLOGIA_PRODOTTO_ND: -1.8503311478178701
TIPOLOGIA_PRODOTTO_Principale: 5.7687010752498455
TIPOLOGIA_PRODOTTO_Sostitutivo: -2.9643284988406884
Year_COVID: 18.099013058062365
Year_POST_COVID: -16.038019039581087
Year_PRE_COVID: -1.3457157409089886


## RANDOM FOREST

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest as well: with the default random_state=None its random sampling
# changes on every run, so the scores below could not be reproduced.
rfr = RandomForestRegressor(random_state=42)

# Pass the target as 1-D. A column-vector y makes scikit-learn emit a
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# The scorer returns negated MSE; flip the sign before taking the square root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [188.30514724 166.52102108 184.94045274 300.32267717 130.43725968
 141.39389658 163.07847913 297.48095849 126.10139584 167.14311045]
Random Forest Average RMSE: 186.57243983893457


In [32]:
# Seed the forest before the final fit: the estimator was created with the default
# random_state=None, so the importances printed below would change on every run.
rfr.set_params(random_state=42)
# Fit on all rows with a 1-D target. A column-vector y triggers scikit-learn's
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.293791963057688
PREZZO_LISTINO: 0.02241494458738513
STOCK_PZ: 0.23256347676116285
SCONTO_PERC: 0.027536647947121896
KPI_1: 0.039723123041817535
KPI_2: 0.02582313586696859
KPI_3: 0.02321235493572802
KPI_4: 0.012495735468445287
KPI_5: 0.010652848020223166
QTA_storico: 0.18744794307213353
FATTURATO_storico: 0.015215041925786524
DURATA_VOLANTINO_IN_GIORNI: 0.01862830993615674
DISPLAY_QUALITY: 0.0022927547432609915
BLUETOOTH: 0.00043426266172210955
WIFI: 0.0003963785301479295
DISPLAY_SIZE: 0.018592256447938903
SCONTO_PERC_MEDIO_VOLANTINO: 0.01805517643334572
Month_Redditivity: 0.03308694583796522
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.005711859909446425
TIPOLOGIA_PRODOTTO_ND: 0.0010020279179977411
TIPOLOGIA_PRODOTTO_Principale: 0.0019368314734037554
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.0014004568974378421
Year_COVID: 0.004483096795479018
Year_POST_COVID: 0.0024720909139449338
Year_PRE_COVID: 0.000630336817292157


## XGBoost

In [33]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Gradient-boosted regressor left at the library's default hyperparameters.
xgb_model = xgb.XGBRegressor()

xgb_scores = cross_val_score(
    estimator=xgb_model,
    X=final_df,
    y=y.ravel(),  # flatten the single-column target to 1-D
    scoring='neg_mean_squared_error',
    cv=kf,
)
# The scorer returns negated MSE; negate it back before taking the square root.
xgb_rmse_scores = np.sqrt(np.negative(xgb_scores))

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [195.28222125 176.19558642 164.81641247 297.89467711 224.28430842
 207.82385524 217.11970823 330.20614284 126.34614173 162.11843497]
XGBoost Average RMSE: 210.20874886726574


In [34]:
# Fresh default-parameter model fitted on all rows; the target is flattened to 1-D.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=final_df, y=y.ravel())

print("Feature Importance:")
# Read the importances once, then pair each with its column name in column order.
importances = xgb_model.feature_importances_
for feature, importance in zip(final_df.columns, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.22588419914245605
PREZZO_LISTINO: 0.03144478797912598
STOCK_PZ: 0.0493842251598835
SCONTO_PERC: 0.011248852126300335
KPI_1: 0.07655955106019974
KPI_2: 0.02050107717514038
KPI_3: 0.013392888940870762
KPI_4: 0.011406655423343182
KPI_5: 0.008884632028639317
QTA_storico: 0.10651448369026184
FATTURATO_storico: 0.00823717750608921
DURATA_VOLANTINO_IN_GIORNI: 0.025511471554636955
DISPLAY_QUALITY: 0.031461138278245926
BLUETOOTH: 0.004714273847639561
WIFI: 0.004258707631379366
DISPLAY_SIZE: 0.02103389799594879
SCONTO_PERC_MEDIO_VOLANTINO: 0.012009812518954277
Month_Redditivity: 0.0072911945171654224
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.007538649719208479
TIPOLOGIA_PRODOTTO_ND: 0.1409490555524826
TIPOLOGIA_PRODOTTO_Principale: 0.011985477060079575
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.016649674624204636
Year_COVID: 0.055879440158605576
Year_POST_COVID: 0.09612061083316803
Year_PRE_COVID: 0.001138034276664257


# TV

In [35]:
# Read the TV dataset; the path is relative to the notebook's working directory.
tv = pd.read_csv(filepath_or_buffer='./Working Code/Datasets/tv1.csv')

In [36]:
from sklearn.preprocessing import OneHotEncoder

# Categorical predictors that are expanded into one indicator column per level.
categorical_cols = ['Year']

# Predictors used by every TV model below.
X = tv[[
    'PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
    'SCONTO_PERC', 'KPI_1',
    'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
    'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
    'DISPLAY_SIZE', 'HD_QUALITY', 'OLED', 'SMART',
    'MFM', 'WCG', 'MINILED', 'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity',
    'Year', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA',
]]
# Target kept as a single-column 2-D array; later cells rely on that shape
# (they call y.ravel() and read lr.coef_.tolist()[0]).
y = tv[['QTA']].values

# Keep the encoder's default sparse output and densify explicitly: the `sparse=`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so
# OneHotEncoder(sparse=False) raises TypeError on current releases.
# NOTE(review): every level is kept (no `drop=`), so the indicator columns
# sum to 1 and their linear-regression coefficients are not unique.
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat aligns on index; reuse X's so rows cannot drift apart
)

# Swap the raw categorical column for its indicator columns.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(categorical_cols, axis=1)

## BENCHMARK

In [37]:
from sklearn.metrics import mean_squared_error as mse

# Naive baseline: QTA_storico divided by 30, then multiplied by
# DURATA_VOLANTINO_IN_GIORNI. Rows with a missing target are dropped first.
# NOTE(review): the divisor 30 presumably means QTA_storico is a 30-day total — confirm.
bench = (
    tv
    .copy(deep=True)
    .dropna(subset=['QTA'])
    .assign(Previsione=lambda frame: frame['QTA_storico'] / 30 * frame['DURATA_VOLANTINO_IN_GIORNI'])
)

# Root mean squared error of the baseline forecast against the actual quantity.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))

print(f'RMSE = {rmse}')

RMSE = 420.9253442726455


## LINEAR REGRESSION

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (the scaler is fitted on the full feature table).
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# The scorer returns negated MSE; negate it back before taking the square root.
lr_scores = cross_val_score(
    estimator=lr,
    X=final_df_scaled,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
lr_rmse_scores = np.sqrt(np.negative(lr_scores))

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [307.31547116 306.8680023  269.55159616 319.02917918 234.68757962
 304.70563182 290.68911597 336.83398852 240.61246639 518.78660299]
Linear Regression Average RMSE: 312.90796341032626


In [39]:
# Refit on the complete scaled feature matrix so the fitted weights can be inspected.
lr.fit(final_df_scaled, y)

column_names = final_df.columns.tolist()
# y is a single-column 2-D array, so coef_ has one row; take that row as a plain list.
coefficients = lr.coef_[0].tolist()

print("\nCoefficients:")
for feature, coef in zip(column_names, coefficients):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: -3.5634011153718905
PREZZO_LISTINO: -18.59453342156977
STOCK_PZ: 129.84730874995876
SCONTO_PERC: 46.52953630914751
KPI_1: 42.10195073997686
KPI_2: -16.307318611139127
KPI_3: 5.620192532468837
KPI_4: 2.6305128595209846
KPI_5: 18.72083013801395
QTA_storico: 103.54025185306214
FATTURATO_storico: -23.815394201060766
DURATA_VOLANTINO_IN_GIORNI: 66.30386319138856
DISPLAY_SIZE: -41.70162564202628
HD_QUALITY: -84.12826583361807
OLED: -3.4887982590227398
SMART: 40.84785710482938
MFM: 1.5732616470771323
WCG: -28.00584017843344
MINILED: -0.2513358868938059
SCONTO_PERC_MEDIO_VOLANTINO: -16.58130021782537
Month_Redditivity: 24.876851681073727
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 38.851759490874045
Year_COVID: 13.809996897538761
Year_POST_COVID: -8.248107103398615
Year_PRE_COVID: -6.087296432791706


## RANDOM FOREST

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest as well: with the default random_state=None its random sampling
# changes on every run, so the scores below could not be reproduced.
rfr = RandomForestRegressor(random_state=42)

# Pass the target as 1-D. A column-vector y makes scikit-learn emit a
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# The scorer returns negated MSE; flip the sign before taking the square root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [298.65287172 279.17002124 247.06873644 311.62527511 215.96946097
 292.25202221 284.94859523 384.50963063 229.27328422 484.58649448]
Random Forest Average RMSE: 302.80563922530956


In [41]:
# Seed the forest before the final fit: the estimator was created with the default
# random_state=None, so the importances printed below would change on every run.
rfr.set_params(random_state=42)
# Fit on all rows with a 1-D target. A column-vector y triggers scikit-learn's
# DataConversionWarning, and the XGBoost cells already pass y.ravel().
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.15908573635720621
PREZZO_LISTINO: 0.03665521766454854
STOCK_PZ: 0.2065740926993592
SCONTO_PERC: 0.07216485315349153
KPI_1: 0.05855043230621396
KPI_2: 0.03059566018120907
KPI_3: 0.039803399864958476
KPI_4: 0.0199546557907136
KPI_5: 0.02238592496789392
QTA_storico: 0.16961234117460006
FATTURATO_storico: 0.04203919128750274
DURATA_VOLANTINO_IN_GIORNI: 0.024754066189573584
DISPLAY_SIZE: 0.016533019941281996
HD_QUALITY: 0.005890542706348907
OLED: 0.0001379007795597618
SMART: 0.001521079893828259
MFM: 0.00036955862626032986
WCG: 0.001831289960280168
MINILED: 3.54218417818228e-05
SCONTO_PERC_MEDIO_VOLANTINO: 0.04602949313500707
Month_Redditivity: 0.007738729600347159
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.020448683948759476
Year_COVID: 0.00430982126822284
Year_POST_COVID: 0.005641998901709187
Year_PRE_COVID: 0.0073368877593421805


## XGBoost

In [42]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Gradient-boosted regressor left at the library's default hyperparameters.
xgb_model = xgb.XGBRegressor()

xgb_scores = cross_val_score(
    estimator=xgb_model,
    X=final_df,
    y=y.ravel(),  # flatten the single-column target to 1-D
    scoring='neg_mean_squared_error',
    cv=kf,
)
# The scorer returns negated MSE; negate it back before taking the square root.
xgb_rmse_scores = np.sqrt(np.negative(xgb_scores))

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [291.37209429 280.83900969 230.17308965 325.63163335 216.61790789
 338.76382447 285.41237851 370.34967798 218.87293422 454.8391452 ]
XGBoost Average RMSE: 301.2871695251239


In [43]:
# Fresh default-parameter model fitted on all rows; the target is flattened to 1-D.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=final_df, y=y.ravel())

print("Feature Importance:")
# Read the importances once, then pair each with its column name in column order.
importances = xgb_model.feature_importances_
for feature, importance in zip(final_df.columns, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.1017080768942833
PREZZO_LISTINO: 0.02155967615544796
STOCK_PZ: 0.06783775240182877
SCONTO_PERC: 0.05595667287707329
KPI_1: 0.04516518488526344
KPI_2: 0.046180881559848785
KPI_3: 0.028865812346339226
KPI_4: 0.012091819196939468
KPI_5: 0.02003631927073002
QTA_storico: 0.08438415825366974
FATTURATO_storico: 0.039382435381412506
DURATA_VOLANTINO_IN_GIORNI: 0.047860756516456604
DISPLAY_SIZE: 0.026080893352627754
HD_QUALITY: 0.02061224915087223
OLED: 0.003964218311011791
SMART: 0.04107169434428215
MFM: 0.005983822979032993
WCG: 0.004907253663986921
MINILED: 0.0019471992272883654
SCONTO_PERC_MEDIO_VOLANTINO: 0.03187764063477516
Month_Redditivity: 0.039803776890039444
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.03370236977934837
Year_COVID: 0.062110960483551025
Year_POST_COVID: 0.06626830250024796
Year_PRE_COVID: 0.0906400978565216
