In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Timedelta
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and data-conversion warnings
# emitted by the modelling cells further down.
warnings.filterwarnings(action="ignore")

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# SMARTPHONE

In [2]:
# Load the smartphone dataset and derive the calendar year of DATA_INIZIO.
smartphones = pd.read_csv('./Working Code/Datasets/smartphones1.csv')
smartphones['DATA_INIZIO'] = pd.to_datetime(smartphones['DATA_INIZIO'])
smartphones['Year'] = smartphones['DATA_INIZIO'].dt.year

# Exclude year 2020. drop=True discards the old index directly instead of
# first materialising it as an 'index' column and deleting it in place.
# The fresh 0..n-1 index matters: the feature cell concatenates this data
# column-wise with a newly built one-hot frame.
smartphones = smartphones[smartphones['Year'] != 2020].reset_index(drop=True)

In [3]:
# Predictors for the smartphone models. TIPOLOGIA_PRODOTTO and OPERATOR are
# one-hot encoded below; every other column is passed through unchanged.
X = smartphones[['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
                 'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
                 'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
                 'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
                 'DISPLAY_SIZE', 'OPERATOR', 'NUMBER_OF_SIM',
                 'CAPACITY', 'GENERATION', 'OPERATING_SYST', 'Month_Redditivity',
                 'SCONTO_PERC_MEDIO_VOLANTINO', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA']]

# Target kept as an (n, 1) column vector: the coefficient cell indexes
# lr.coef_ as a 2-D array, which relies on this shape.
y = smartphones[['QTA']].values

from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Taking the default (sparse) output and densifying it with
# .toarray() gives the same matrix on every release.
ohe = OneHotEncoder()
oheTransform = ohe.fit_transform(X[['TIPOLOGIA_PRODOTTO', 'OPERATOR']]).toarray()
# Reuse X's index so the column-wise concat below aligns row by row.
oheTransform = pd.DataFrame(oheTransform, columns=ohe.get_feature_names_out(), index=X.index)

# NOTE(review): every category is kept (no `drop`), so the dummies of each
# variable sum to 1 in every row and are collinear with a regression
# intercept. The saved linear-regression output shows coefficients of order
# 1e12-1e14 for exactly these columns; consider drop='first' if those
# coefficients need to be interpretable.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(['TIPOLOGIA_PRODOTTO', 'OPERATOR'], axis = 1)

## BENCHMARK

In [4]:
from sklearn.metrics import mean_squared_error as mse

# Baseline forecast: QTA_storico divided by 30, multiplied by the flyer
# duration in days, evaluated on rows that have a QTA value.
# NOTE(review): presumably QTA_storico is a 30-day quantity - confirm.
bench = smartphones.dropna(subset=['QTA']).copy(deep=True)
bench['Previsione'] = (bench['QTA_storico'] / 30) * bench['DURATA_VOLANTINO_IN_GIORNI']

# Root of the mean squared error between actual and baseline quantities.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))
print(f'RMSE = {rmse}')

RMSE = 541.5361289135686


## LINEAR REGRESSION

In [5]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardised copy of the whole feature matrix. It is no longer used for
# cross-validation, but the coefficient cell fits `lr` on it.
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# Cross-validate a scaler + regressor pipeline on the unscaled features so
# the scaler is fitted on each training split only. Scaling the full matrix
# beforehand lets the held-out rows contribute to the mean/std used for
# preprocessing.
lr_cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
lr_scores = cross_val_score(lr_cv_pipeline, final_df, y, cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
lr_rmse_scores = np.sqrt(-lr_scores)

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [399.30463261 346.01846887 459.78963058 429.17489509 367.73041507
 404.13396227 468.794116   495.48481639 390.22263758 413.30378771]
Linear Regression Average RMSE: 417.3957362165429


In [6]:
# Refit the linear model on the full standardised matrix to inspect its
# coefficients.
lr.fit(final_df_scaled, y)

column_names = list(final_df.columns)
# y is 2-D, so coef_ holds one row per target column; take the only row.
single_target_coefs = lr.coef_[0].tolist()

print("\nCoefficients:")
for col_name, weight in zip(column_names, single_target_coefs):
    print(f"{col_name}: {weight}")


Coefficients:
PREZZO_PROMO: 5.699541817809937
PREZZO_LISTINO: -143.632549811455
STOCK_PZ: 119.6034824423928
SCONTO_PERC: 30.064083623235916
KPI_1: 56.15785069044351
KPI_2: -34.77930251001062
KPI_3: 103.71025690493967
KPI_4: 61.1829212565718
KPI_5: 2.1286885489866134
QTA_storico: 148.7580268497485
FATTURATO_storico: -22.58382743469818
DURATA_VOLANTINO_IN_GIORNI: 64.17891224673913
DISPLAY_SIZE: 76.04472890882558
NUMBER_OF_SIM: 3.8864102279123354
CAPACITY: 14.584544943253508
GENERATION: -1.187181002185497
OPERATING_SYST: 100.91077738902237
Month_Redditivity: 11.54606300139728
SCONTO_PERC_MEDIO_VOLANTINO: -23.021068036076198
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 44.52655091867933
TIPOLOGIA_PRODOTTO_ND: 142437655188947.72
TIPOLOGIA_PRODOTTO_Principale: 102703014634913.19
TIPOLOGIA_PRODOTTO_Sostitutivo: 127294839211484.78
OPERATOR_SIM FREE: -7734699012003.718
OPERATOR_TIM: -4902691671936.123
OPERATOR_VODAFONE: -4721766651496.661
OPERATOR_WINDTRE: -4072240593371.4214


## RANDOM FOREST

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest: without random_state the scores change on every run even
# though the fold split itself is fixed.
rfr = RandomForestRegressor(random_state=42)

# y is an (n, 1) column vector; flatten it so the forest receives a 1-D
# target (same convention as the XGBoost cells). Passing the column vector
# raises a DataConversionWarning that the global warning filter hides.
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [340.04600646 328.73209675 370.56911456 379.43232263 306.65376017
 359.15148308 390.22680105 411.22336201 368.92804444 346.31158298]
Random Forest Average RMSE: 360.1274574148954


In [8]:
# Fit the forest on every row to read its feature importances.
# y is an (n, 1) column vector; flatten it so the estimator receives a 1-D
# target, matching the XGBoost cells.
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.05424684775368942
PREZZO_LISTINO: 0.03423861810464885
STOCK_PZ: 0.16325192611086556
SCONTO_PERC: 0.049082747532182044
KPI_1: 0.055917995254037504
KPI_2: 0.031891545873539075
KPI_3: 0.04606279139679284
KPI_4: 0.028730138064133985
KPI_5: 0.01854016520961762
QTA_storico: 0.3127612981156704
FATTURATO_storico: 0.029556565602794755
DURATA_VOLANTINO_IN_GIORNI: 0.039996102792718764
DISPLAY_SIZE: 0.03670581334645954
NUMBER_OF_SIM: 0.0015971670886612596
CAPACITY: 0.01086608367463311
GENERATION: 0.0019134627716746013
OPERATING_SYST: 0.0006153975549616861
Month_Redditivity: 0.005849322300450878
SCONTO_PERC_MEDIO_VOLANTINO: 0.03140072825990671
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.01472032771764468
TIPOLOGIA_PRODOTTO_ND: 0.0031490059751239875
TIPOLOGIA_PRODOTTO_Principale: 0.010926984142317386
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.0027927165415685107
OPERATOR_SIM FREE: 0.004729663105919104
OPERATOR_TIM: 0.003336119203382926
OPERATOR_VODAFONE: 0.003688915563261139
OPERAT

## XGBoost

In [9]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Same 10-fold shuffled split as the other models.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Default-parameter XGBoost regressor, scored on negated MSE with a
# flattened (1-D) target.
xgb_model = xgb.XGBRegressor()
xgb_scores = cross_val_score(
    xgb_model,
    final_df,
    y.ravel(),
    cv=kf,
    scoring='neg_mean_squared_error',
)
# Flip the sign of the negated MSE values, then take the root.
xgb_rmse_scores = np.sqrt(-xgb_scores)

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [342.52157819 314.63734424 341.50783431 371.05608936 288.76909502
 329.90104892 342.45124379 406.8388123  334.11303998 320.28596057]
XGBoost Average RMSE: 339.208204666886


In [10]:
# Train a fresh default-parameter XGBoost regressor on every row and list
# the importance it assigns to each feature column.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(final_df, y.ravel())

print("Feature Importance:")
for col_name, score in zip(final_df.columns, xgb_model.feature_importances_):
    print(f"{col_name}: {score}")

Feature Importance:
PREZZO_PROMO: 0.032499752938747406
PREZZO_LISTINO: 0.03104330413043499
STOCK_PZ: 0.05625305697321892
SCONTO_PERC: 0.032209694385528564
KPI_1: 0.04434063658118248
KPI_2: 0.026875706389546394
KPI_3: 0.02311650663614273
KPI_4: 0.032319072633981705
KPI_5: 0.015949439257383347
QTA_storico: 0.15576879680156708
FATTURATO_storico: 0.014598820358514786
DURATA_VOLANTINO_IN_GIORNI: 0.04014192521572113
DISPLAY_SIZE: 0.04848195239901543
NUMBER_OF_SIM: 0.010876871645450592
CAPACITY: 0.017412003129720688
GENERATION: 0.029084276407957077
OPERATING_SYST: 0.015887971967458725
Month_Redditivity: 0.030888812616467476
SCONTO_PERC_MEDIO_VOLANTINO: 0.022525975480675697
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.01855411008000374
TIPOLOGIA_PRODOTTO_ND: 0.023656675592064857
TIPOLOGIA_PRODOTTO_Principale: 0.046026166528463364
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.01696990802884102
OPERATOR_SIM FREE: 0.06590153276920319
OPERATOR_TIM: 0.03257433697581291
OPERATOR_VODAFONE: 0.0687551200389862
OPERATOR_WIND

# PC

In [11]:
# Load the PC dataset and derive the calendar year of DATA_INIZIO.
pc = pd.read_csv('./Working Code/Datasets/pc1.csv')
pc['DATA_INIZIO'] = pd.to_datetime(pc['DATA_INIZIO'])
pc['Year'] = pc['DATA_INIZIO'].dt.year

# Exclude year 2020. drop=True discards the old index directly instead of
# first materialising it as an 'index' column and deleting it in place.
# The fresh 0..n-1 index matters: the feature cell concatenates this data
# column-wise with a newly built one-hot frame.
pc = pc[pc['Year'] != 2020].reset_index(drop=True)

In [12]:
# Predictors for the PC models. TIPOLOGIA_PRODOTTO and Year are one-hot
# encoded below; every other column is passed through unchanged.
X = pc[['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
        'TIPOLOGIA_PRODOTTO','SCONTO_PERC', 'KPI_1',
        'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
        'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
        'STORAGE_GB', 'RAM_GB', 'DISPLAY_SIZE', 'CONVERTIBLE',
        'SCONTO_PERC_MEDIO_VOLANTINO', 'VERSION_NOT_DEFINED',
        'VERSION_INTEL', 'VERSION_APPLE', 'VERSION_RAD',
        'VERSION_GEF', 'QUALITY_VERSION', 'QUALITY_PROCESSOR',
        'Month_Redditivity', 'Year', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA']]
# Target kept as an (n, 1) column vector: the coefficient cell indexes
# lr.coef_ as a 2-D array, which relies on this shape.
y = pc[['QTA']].values

from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Taking the default (sparse) output and densifying it with
# .toarray() gives the same matrix on every release.
ohe = OneHotEncoder()
oheTransform = ohe.fit_transform(X[['TIPOLOGIA_PRODOTTO', 'Year']]).toarray()
# Reuse X's index so the column-wise concat below aligns row by row.
oheTransform = pd.DataFrame(oheTransform, columns=ohe.get_feature_names_out(), index=X.index)

# NOTE(review): every category is kept (no `drop`), so the dummies of each
# variable sum to 1 in every row and are collinear with a regression
# intercept; consider drop='first' if the linear coefficients need to be
# interpretable.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(['TIPOLOGIA_PRODOTTO', 'Year'], axis = 1)

## BENCHMARK

In [13]:
from sklearn.metrics import mean_squared_error as mse

# Baseline forecast: QTA_storico divided by 30, multiplied by the flyer
# duration in days, evaluated on rows that have a QTA value.
# NOTE(review): presumably QTA_storico is a 30-day quantity - confirm.
bench = pc.dropna(subset=['QTA']).copy(deep=True)
bench['Previsione'] = (bench['QTA_storico'] / 30) * bench['DURATA_VOLANTINO_IN_GIORNI']

# Root of the mean squared error between actual and baseline quantities.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))
print(f'RMSE = {rmse}')

RMSE = 406.77545296404577


## LINEAR REGRESSION

In [14]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardised copy of the whole feature matrix. It is no longer used for
# cross-validation, but the coefficient cell fits `lr` on it.
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# Cross-validate a scaler + regressor pipeline on the unscaled features so
# the scaler is fitted on each training split only. Scaling the full matrix
# beforehand lets the held-out rows contribute to the mean/std used for
# preprocessing.
lr_cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
lr_scores = cross_val_score(lr_cv_pipeline, final_df, y, cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
lr_rmse_scores = np.sqrt(-lr_scores)

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [274.66703814 210.21204842 333.46857714 263.71950428 286.8618647
 222.51948775 257.81397043 370.19475489 242.29561888 241.55601659]
Linear Regression Average RMSE: 270.3308881209832


In [15]:
# Refit the linear model on the full standardised matrix to inspect its
# coefficients.
lr.fit(final_df_scaled, y)

column_names = list(final_df.columns)
# y is 2-D, so coef_ holds one row per target column; take the only row.
single_target_coefs = lr.coef_[0].tolist()

print("\nCoefficients:")
for col_name, weight in zip(column_names, single_target_coefs):
    print(f"{col_name}: {weight}")


Coefficients:
PREZZO_PROMO: -123.39979781838744
PREZZO_LISTINO: 10.68425047668801
STOCK_PZ: 43.253463154086816
SCONTO_PERC: 12.496209054097847
KPI_1: 45.2904932206431
KPI_2: -15.421839610167423
KPI_3: -6.645169450958147
KPI_4: 22.048831510880245
KPI_5: -24.778339787324683
QTA_storico: 25.824654515833764
FATTURATO_storico: -0.6815856679290959
DURATA_VOLANTINO_IN_GIORNI: 70.93012564153211
STORAGE_GB: 7.777630193838639
RAM_GB: 10.698387297747153
DISPLAY_SIZE: 97.00343521810387
CONVERTIBLE: -13.84038677105983
SCONTO_PERC_MEDIO_VOLANTINO: 66.93984798949786
VERSION_NOT_DEFINED: -20.839566071496943
VERSION_INTEL: -10.031906518005774
VERSION_APPLE: 79.23755920995868
VERSION_RAD: -2.782915264257386
VERSION_GEF: -51.1771540754773
QUALITY_VERSION: -0.2969231162857892
QUALITY_PROCESSOR: 17.659584816340217
Month_Redditivity: 9.62884842983655
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 9.123984555890846
TIPOLOGIA_PRODOTTO_ND: 93.91188354061153
TIPOLOGIA_PRODOTTO_Principale: -50.339211539341406
TIPOLOGIA_PRODO

## RANDOM FOREST

In [16]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest: without random_state the scores change on every run even
# though the fold split itself is fixed.
rfr = RandomForestRegressor(random_state=42)

# y is an (n, 1) column vector; flatten it so the forest receives a 1-D
# target (same convention as the XGBoost cells). Passing the column vector
# raises a DataConversionWarning that the global warning filter hides.
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [249.90776722 187.4337974  243.25474683 223.76319227 249.82394546
 227.56226877 243.16922906 330.00074318 236.07140435 228.1009554 ]
Random Forest Average RMSE: 241.908804995021


In [17]:
# Fit the forest on every row to read its feature importances.
# y is an (n, 1) column vector; flatten it so the estimator receives a 1-D
# target, matching the XGBoost cells.
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.1706217701097797
PREZZO_LISTINO: 0.09299061810841083
STOCK_PZ: 0.08738848556466981
SCONTO_PERC: 0.05202423596971098
KPI_1: 0.06096623395644825
KPI_2: 0.019034299644153152
KPI_3: 0.03452277086311535
KPI_4: 0.036891875370043355
KPI_5: 0.020529316128276767
QTA_storico: 0.0356922695820713
FATTURATO_storico: 0.027315544985396206
DURATA_VOLANTINO_IN_GIORNI: 0.02851911014913904
STORAGE_GB: 0.03616935098556028
RAM_GB: 0.010748078163930466
DISPLAY_SIZE: 0.035898069021851525
CONVERTIBLE: 0.0007625053926391781
SCONTO_PERC_MEDIO_VOLANTINO: 0.11005906317485183
VERSION_NOT_DEFINED: 0.0003476382234584586
VERSION_INTEL: 0.002981548664194282
VERSION_APPLE: 0.0006318584791145912
VERSION_RAD: 0.004731591745037395
VERSION_GEF: 0.004522640809289173
QUALITY_VERSION: 0.015058850158862387
QUALITY_PROCESSOR: 0.004446489452746464
Month_Redditivity: 0.01316764204714495
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.019070133001593798
TIPOLOGIA_PRODOTTO_ND: 0.022264297706780293
TIPOLOGIA_P

## XGBoost

In [18]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Same 10-fold shuffled split as the other models.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Default-parameter XGBoost regressor, scored on negated MSE with a
# flattened (1-D) target.
xgb_model = xgb.XGBRegressor()
xgb_scores = cross_val_score(
    xgb_model,
    final_df,
    y.ravel(),
    cv=kf,
    scoring='neg_mean_squared_error',
)
# Flip the sign of the negated MSE values, then take the root.
xgb_rmse_scores = np.sqrt(-xgb_scores)

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [260.74863198 195.03356171 238.66237291 240.05538791 256.97066156
 207.09042498 215.97451237 333.8415826  227.76539949 229.0922001 ]
XGBoost Average RMSE: 240.52347356077917


In [19]:
# Train a fresh default-parameter XGBoost regressor on every row and list
# the importance it assigns to each feature column.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(final_df, y.ravel())

print("Feature Importance:")
for col_name, score in zip(final_df.columns, xgb_model.feature_importances_):
    print(f"{col_name}: {score}")

Feature Importance:
PREZZO_PROMO: 0.023024704307317734
PREZZO_LISTINO: 0.049192339181900024
STOCK_PZ: 0.01083252765238285
SCONTO_PERC: 0.016061196103692055
KPI_1: 0.021009644493460655
KPI_2: 0.01237213984131813
KPI_3: 0.01303675677627325
KPI_4: 0.013712843880057335
KPI_5: 0.006830899976193905
QTA_storico: 0.012572884559631348
FATTURATO_storico: 0.01265389658510685
DURATA_VOLANTINO_IN_GIORNI: 0.014728573150932789
STORAGE_GB: 0.04109904542565346
RAM_GB: 0.017708228901028633
DISPLAY_SIZE: 0.07224392890930176
CONVERTIBLE: 0.010369821451604366
SCONTO_PERC_MEDIO_VOLANTINO: 0.02791629731655121
VERSION_NOT_DEFINED: 0.01088472455739975
VERSION_INTEL: 0.01579396240413189
VERSION_APPLE: 0.020218074321746826
VERSION_RAD: 0.01694084331393242
VERSION_GEF: 0.005778848193585873
QUALITY_VERSION: 0.03492075204849243
QUALITY_PROCESSOR: 0.01989382691681385
Month_Redditivity: 0.023401441052556038
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.005345299374312162
TIPOLOGIA_PRODOTTO_ND: 0.16411669552326202
TIPOLOGIA_PROD

# WASH

In [20]:
# Load the washing-machine dataset and derive the calendar year of DATA_INIZIO.
wash = pd.read_csv('./Working Code/Datasets/wash1.csv')
wash['DATA_INIZIO'] = pd.to_datetime(wash['DATA_INIZIO'])
wash['Year'] = wash['DATA_INIZIO'].dt.year

# Exclude year 2020. drop=True discards the old index directly instead of
# first materialising it as an 'index' column and deleting it in place.
# The fresh 0..n-1 index matters: the feature cell concatenates this data
# column-wise with a newly built one-hot frame.
wash = wash[wash['Year'] != 2020].reset_index(drop=True)

In [21]:
# Predictors for the washing-machine models. TIPOLOGIA_PRODOTTO and Year are
# one-hot encoded below; every other column is passed through unchanged.
X = wash[['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
        'TIPOLOGIA_PRODOTTO','SCONTO_PERC', 'KPI_1',
        'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
        'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
        'TYPE', 'TOPLOADING', 'LOADING_KG', 'ENERGY_CLASS',
        'DEPTH_CM>48', 'SMART_CONNECT', 'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity', 'Year',
        'SCONTO_PERC_MEDIO_NOME_CAMPAGNA']]
# Target kept as an (n, 1) column vector: the coefficient cell indexes
# lr.coef_ as a 2-D array, which relies on this shape.
y = wash[['QTA']].values

from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Taking the default (sparse) output and densifying it with
# .toarray() gives the same matrix on every release.
ohe = OneHotEncoder()
oheTransform = ohe.fit_transform(X[['TIPOLOGIA_PRODOTTO', 'Year']]).toarray()
# Reuse X's index so the column-wise concat below aligns row by row.
oheTransform = pd.DataFrame(oheTransform, columns=ohe.get_feature_names_out(), index=X.index)

# NOTE(review): every category is kept (no `drop`), so the dummies of each
# variable sum to 1 in every row and are collinear with a regression
# intercept; consider drop='first' if the linear coefficients need to be
# interpretable.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(['TIPOLOGIA_PRODOTTO', 'Year'], axis = 1)

## BENCHMARK

In [22]:
from sklearn.metrics import mean_squared_error as mse

# Baseline forecast: QTA_storico divided by 30, multiplied by the flyer
# duration in days, evaluated on rows that have a QTA value.
# NOTE(review): presumably QTA_storico is a 30-day quantity - confirm.
bench = wash.dropna(subset=['QTA']).copy(deep=True)
bench['Previsione'] = (bench['QTA_storico'] / 30) * bench['DURATA_VOLANTINO_IN_GIORNI']

# Root of the mean squared error between actual and baseline quantities.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))
print(f'RMSE = {rmse}')

RMSE = 374.44030989644216


## LINEAR REGRESSION

In [23]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardised copy of the whole feature matrix. It is no longer used for
# cross-validation, but the coefficient cell fits `lr` on it.
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# Cross-validate a scaler + regressor pipeline on the unscaled features so
# the scaler is fitted on each training split only. Scaling the full matrix
# beforehand lets the held-out rows contribute to the mean/std used for
# preprocessing.
lr_cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
lr_scores = cross_val_score(lr_cv_pipeline, final_df, y, cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
lr_rmse_scores = np.sqrt(-lr_scores)

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [212.54947152 283.36791038 219.71456837 206.51636029 169.06654341
 210.81887281 209.98256001 164.72641186 250.35866263 200.2190312 ]
Linear Regression Average RMSE: 212.73203924758485


In [24]:
# Refit the linear model on the full standardised matrix to inspect its
# coefficients.
lr.fit(final_df_scaled, y)

column_names = list(final_df.columns)
# y is 2-D, so coef_ holds one row per target column; take the only row.
single_target_coefs = lr.coef_[0].tolist()

print("\nCoefficients:")
for col_name, weight in zip(column_names, single_target_coefs):
    print(f"{col_name}: {weight}")


Coefficients:
PREZZO_PROMO: -85.02528829782874
PREZZO_LISTINO: -12.060949908717497
STOCK_PZ: 15.762985025122392
SCONTO_PERC: 79.90525503926041
KPI_1: 19.328585557642633
KPI_2: 9.026341875040263
KPI_3: -13.80014779111255
KPI_4: 2.7273340123657297
KPI_5: 17.442246820426032
QTA_storico: 4.964240195194885
FATTURATO_storico: 10.419269498952435
DURATA_VOLANTINO_IN_GIORNI: 91.46685724891088
TYPE: 25.19511631064143
TOPLOADING: -59.620282537553464
LOADING_KG: -32.91040443840494
ENERGY_CLASS: 1.1300660435302599
DEPTH_CM>48: 28.986044309117798
SMART_CONNECT: 6.327467548230932
SCONTO_PERC_MEDIO_VOLANTINO: 5.9055950190157285
Month_Redditivity: -10.607262735371396
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 21.144082587539586
TIPOLOGIA_PRODOTTO_ND: 5.449380413567215
TIPOLOGIA_PRODOTTO_Principale: -3.018243354337912
TIPOLOGIA_PRODOTTO_Sostitutivo: -16.25576754602029
Year_2018: 38.2897744089877
Year_2019: 14.9447332395084
Year_2021: -22.79446611969669
Year_2022: -8.215612490719474
Year_2023: -9.872754220877482


## RANDOM FOREST

In [25]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest: without random_state the scores change on every run even
# though the fold split itself is fixed.
rfr = RandomForestRegressor(random_state=42)

# y is an (n, 1) column vector; flatten it so the forest receives a 1-D
# target (same convention as the XGBoost cells). Passing the column vector
# raises a DataConversionWarning that the global warning filter hides.
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [211.40295879 231.12055545 186.04629565 179.24714679 165.58157449
 182.01184137 175.99058208 137.26334805 222.13651097 175.74967731]
Random Forest Average RMSE: 186.65504909310536


In [26]:
# Fit the forest on every row to read its feature importances.
# y is an (n, 1) column vector; flatten it so the estimator receives a 1-D
# target, matching the XGBoost cells.
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.3215077968233402
PREZZO_LISTINO: 0.03581538771377743
STOCK_PZ: 0.03530136396127201
SCONTO_PERC: 0.111471528179402
KPI_1: 0.02439188010183893
KPI_2: 0.0320296040014657
KPI_3: 0.03186054586738708
KPI_4: 0.05985556799401717
KPI_5: 0.01998638649149565
QTA_storico: 0.020128469243244632
FATTURATO_storico: 0.0194350166790906
DURATA_VOLANTINO_IN_GIORNI: 0.1284129302235503
TYPE: 0.00018816135959100148
TOPLOADING: 0.0016812522343460435
LOADING_KG: 0.01622947089436754
ENERGY_CLASS: 0.036985982892623316
DEPTH_CM>48: 0.0013920844605196509
SMART_CONNECT: 0.006575031608843544
SCONTO_PERC_MEDIO_VOLANTINO: 0.03167329492222231
Month_Redditivity: 0.014070653379514231
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.01820218279746129
TIPOLOGIA_PRODOTTO_ND: 0.0011140461637069625
TIPOLOGIA_PRODOTTO_Principale: 0.0016737858496054634
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.0004267309937333292
Year_2018: 0.017738092528797256
Year_2019: 0.0035825786427450866
Year_2021: 0.004112059213209886
Year_

## XGBoost

In [27]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Same 10-fold shuffled split as the other models.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Default-parameter XGBoost regressor, scored on negated MSE with a
# flattened (1-D) target.
xgb_model = xgb.XGBRegressor()
xgb_scores = cross_val_score(
    xgb_model,
    final_df,
    y.ravel(),
    cv=kf,
    scoring='neg_mean_squared_error',
)
# Flip the sign of the negated MSE values, then take the root.
xgb_rmse_scores = np.sqrt(-xgb_scores)

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [188.04661625 240.09950646 187.24531936 174.24995909 165.07393546
 180.25716743 171.56434675 161.46995817 184.38189266 166.95907059]
XGBoost Average RMSE: 181.93477722442702


In [28]:
# Train a fresh default-parameter XGBoost regressor on every row and list
# the importance it assigns to each feature column.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(final_df, y.ravel())

print("Feature Importance:")
for col_name, score in zip(final_df.columns, xgb_model.feature_importances_):
    print(f"{col_name}: {score}")

Feature Importance:
PREZZO_PROMO: 0.07973040640354156
PREZZO_LISTINO: 0.015998519957065582
STOCK_PZ: 0.008429051376879215
SCONTO_PERC: 0.046168792992830276
KPI_1: 0.012669000774621964
KPI_2: 0.009366994723677635
KPI_3: 0.012493573129177094
KPI_4: 0.02263393998146057
KPI_5: 0.009812884032726288
QTA_storico: 0.012135456316173077
FATTURATO_storico: 0.008450315333902836
DURATA_VOLANTINO_IN_GIORNI: 0.10825470089912415
TYPE: 0.000523527676705271
TOPLOADING: 0.10810583084821701
LOADING_KG: 0.014831963926553726
ENERGY_CLASS: 0.08609554916620255
DEPTH_CM>48: 0.024483684450387955
SMART_CONNECT: 0.024477439001202583
SCONTO_PERC_MEDIO_VOLANTINO: 0.010701475664973259
Month_Redditivity: 0.07567855715751648
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.018364695832133293
TIPOLOGIA_PRODOTTO_ND: 0.007295545190572739
TIPOLOGIA_PRODOTTO_Principale: 0.015408295206725597
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.03589335456490517
Year_2018: 0.1455061435699463
Year_2019: 0.028043435886502266
Year_2021: 0.023271625861525536
Ye

# CORE WEAR

In [29]:
# Load the core-wear dataset and derive the calendar year of DATA_INIZIO.
core_wear = pd.read_csv('./Working Code/Datasets/core_wear1.csv')
core_wear['DATA_INIZIO'] = pd.to_datetime(core_wear['DATA_INIZIO'])
core_wear['Year'] = core_wear['DATA_INIZIO'].dt.year

# Exclude year 2020. drop=True discards the old index directly instead of
# first materialising it as an 'index' column and deleting it in place.
# The fresh 0..n-1 index matters: the feature cell concatenates this data
# column-wise with a newly built one-hot frame.
core_wear = core_wear[core_wear['Year'] != 2020].reset_index(drop=True)

In [30]:
# Predictors for the core-wear models. TIPOLOGIA_PRODOTTO and Year are
# one-hot encoded below; every other column is passed through unchanged.
X = core_wear[['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
        'TIPOLOGIA_PRODOTTO','SCONTO_PERC', 'KPI_1',
        'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
        'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
        'DISPLAY_QUALITY', 'BLUETOOTH', 'WIFI', 'DISPLAY_SIZE',
        'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity', 'Year', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA']]
# Target kept as an (n, 1) column vector: the coefficient cell indexes
# lr.coef_ as a 2-D array, which relies on this shape.
y = core_wear[['QTA']].values

from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Taking the default (sparse) output and densifying it with
# .toarray() gives the same matrix on every release.
ohe = OneHotEncoder()
oheTransform = ohe.fit_transform(X[['TIPOLOGIA_PRODOTTO', 'Year']]).toarray()
# Reuse X's index so the column-wise concat below aligns row by row.
oheTransform = pd.DataFrame(oheTransform, columns=ohe.get_feature_names_out(), index=X.index)

# NOTE(review): every category is kept (no `drop`), so the dummies of each
# variable sum to 1 in every row and are collinear with a regression
# intercept. The saved linear-regression output shows coefficients of order
# 1e13 for exactly these columns; consider drop='first' if those
# coefficients need to be interpretable.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(['TIPOLOGIA_PRODOTTO', 'Year'], axis = 1)

## BENCHMARK

In [31]:
from sklearn.metrics import mean_squared_error as mse

# Baseline forecast: QTA_storico divided by 30, multiplied by the flyer
# duration in days, evaluated on rows that have a QTA value.
# NOTE(review): presumably QTA_storico is a 30-day quantity - confirm.
bench = core_wear.dropna(subset=['QTA']).copy(deep=True)
bench['Previsione'] = (bench['QTA_storico'] / 30) * bench['DURATA_VOLANTINO_IN_GIORNI']

# Root of the mean squared error between actual and baseline quantities.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))
print(f'RMSE = {rmse}')

RMSE = 299.85758113598865


## LINEAR REGRESSION

In [32]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardised copy of the whole feature matrix. It is no longer used for
# cross-validation, but the coefficient cell fits `lr` on it.
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# Cross-validate a scaler + regressor pipeline on the unscaled features so
# the scaler is fitted on each training split only. Scaling the full matrix
# beforehand lets the held-out rows contribute to the mean/std used for
# preprocessing.
lr_cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
lr_scores = cross_val_score(lr_cv_pipeline, final_df, y, cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
lr_rmse_scores = np.sqrt(-lr_scores)

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [286.18155151 157.02504165 191.68184059 329.38280846 198.99742458
 198.952401   152.31912906 168.8486142  306.32869187 214.73634579]
Linear Regression Average RMSE: 220.44538486949764


In [33]:
# Refit the linear model on the full standardised matrix to inspect its
# coefficients.
lr.fit(final_df_scaled, y)

column_names = list(final_df.columns)
# y is 2-D, so coef_ holds one row per target column; take the only row.
single_target_coefs = lr.coef_[0].tolist()

print("\nCoefficients:")
for col_name, weight in zip(column_names, single_target_coefs):
    print(f"{col_name}: {weight}")


Coefficients:
PREZZO_PROMO: 98.76942075548955
PREZZO_LISTINO: -120.25578965319983
STOCK_PZ: 172.84355406670508
SCONTO_PERC: 31.674855487402684
KPI_1: 53.45898917636532
KPI_2: 30.501884647262248
KPI_3: 33.84223429559056
KPI_4: 3.568766656487847
KPI_5: -5.021659765250094
QTA_storico: 106.91214678102695
FATTURATO_storico: -17.61006143503543
DURATA_VOLANTINO_IN_GIORNI: 22.936144208873966
DISPLAY_QUALITY: 8.450273393112631
BLUETOOTH: -5.071175256340083
WIFI: -4.869931019605111
DISPLAY_SIZE: -33.54502867951699
SCONTO_PERC_MEDIO_VOLANTINO: 17.955564453137924
Month_Redditivity: 4.907844145792728
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: -3.5131688051390264
TIPOLOGIA_PRODOTTO_ND: -43535851570465.05
TIPOLOGIA_PRODOTTO_Principale: -35676650104562.27
TIPOLOGIA_PRODOTTO_Sostitutivo: -40765676292911.47
Year_2018: -38451276107868.98
Year_2019: -50095131963803.484
Year_2021: -74254283560557.83
Year_2022: -82949759937764.72
Year_2023: -74075575230608.8


## RANDOM FOREST

In [34]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest: without random_state the scores change on every run even
# though the fold split itself is fixed.
rfr = RandomForestRegressor(random_state=42)

# y is an (n, 1) column vector; flatten it so the forest receives a 1-D
# target (same convention as the XGBoost cells). Passing the column vector
# raises a DataConversionWarning that the global warning filter hides.
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [195.07541693 132.79187775 164.22345511 411.14029576 198.31436502
 152.18492202 110.36987209 137.04648358 178.9486731  175.53519491]
Random Forest Average RMSE: 185.56305562792596


In [35]:
# Fit the forest on every row to read its feature importances.
# y is an (n, 1) column vector; flatten it so the estimator receives a 1-D
# target, matching the XGBoost cells.
rfr.fit(final_df, y.ravel())

print("Feature Importance:")
for feature, importance in zip(final_df.columns, rfr.feature_importances_):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.31066664232010216
PREZZO_LISTINO: 0.02752283252820325
STOCK_PZ: 0.2276110821421233
SCONTO_PERC: 0.03377915436555159
KPI_1: 0.03497119279752135
KPI_2: 0.0216684491531623
KPI_3: 0.02699292228308076
KPI_4: 0.008877022936601338
KPI_5: 0.008219348349458675
QTA_storico: 0.1849766702838756
FATTURATO_storico: 0.01412578910823588
DURATA_VOLANTINO_IN_GIORNI: 0.022969333245935277
DISPLAY_QUALITY: 0.0015800258627066222
BLUETOOTH: 0.0004608266230238464
WIFI: 0.0003831870840644018
DISPLAY_SIZE: 0.014688626910138265
SCONTO_PERC_MEDIO_VOLANTINO: 0.0175250270161127
Month_Redditivity: 0.0213635840556108
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.0061902801483887816
TIPOLOGIA_PRODOTTO_ND: 0.001461834734450495
TIPOLOGIA_PRODOTTO_Principale: 0.0030653954489799308
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.0011821031087579064
Year_2018: 9.659982559716466e-05
Year_2019: 0.0003718572780801043
Year_2021: 0.005642967848932432
Year_2022: 0.0013450428860344798
Year_2023: 0.002262201655270533


## XGBoost

In [36]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# Same 10-fold shuffled split as the other models.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Default-parameter XGBoost regressor, scored on negated MSE with a
# flattened (1-D) target.
xgb_model = xgb.XGBRegressor()
xgb_scores = cross_val_score(
    xgb_model,
    final_df,
    y.ravel(),
    cv=kf,
    scoring='neg_mean_squared_error',
)
# Flip the sign of the negated MSE values, then take the root.
xgb_rmse_scores = np.sqrt(-xgb_scores)

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [181.26279544 123.94969863 161.58015532 412.02222921 198.50525477
 256.92790251  96.57978877 109.05872309 183.02550201 164.49379137]
XGBoost Average RMSE: 188.74058411095092


In [37]:
# Train a fresh default-parameter XGBoost regressor on every row and list
# the importance it assigns to each feature column.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(final_df, y.ravel())

print("Feature Importance:")
for col_name, score in zip(final_df.columns, xgb_model.feature_importances_):
    print(f"{col_name}: {score}")

Feature Importance:
PREZZO_PROMO: 0.2601602375507355
PREZZO_LISTINO: 0.020450409501791
STOCK_PZ: 0.06115761026740074
SCONTO_PERC: 0.01252376101911068
KPI_1: 0.04126065596938133
KPI_2: 0.026684671640396118
KPI_3: 0.008171679452061653
KPI_4: 0.011483368463814259
KPI_5: 0.00778677174821496
QTA_storico: 0.10698121041059494
FATTURATO_storico: 0.006918222643435001
DURATA_VOLANTINO_IN_GIORNI: 0.029554707929491997
DISPLAY_QUALITY: 0.018456213176250458
BLUETOOTH: 0.0029159041587263346
WIFI: 0.002270021243020892
DISPLAY_SIZE: 0.01585221104323864
SCONTO_PERC_MEDIO_VOLANTINO: 0.015196160413324833
Month_Redditivity: 0.02552313543856144
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.004435498733073473
TIPOLOGIA_PRODOTTO_ND: 0.22508850693702698
TIPOLOGIA_PRODOTTO_Principale: 0.005218462087213993
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.01920890249311924
Year_2018: 0.0002370686415815726
Year_2019: 0.006637048441916704
Year_2021: 0.049451082944869995
Year_2022: 0.0010244565783068538
Year_2023: 0.015352035872638226


# TV

In [38]:
# Load the TV dataset and derive the calendar year of DATA_INIZIO.
tv = pd.read_csv('./Working Code/Datasets/tv1.csv')
tv['DATA_INIZIO'] = pd.to_datetime(tv['DATA_INIZIO'])
tv['Year'] = tv['DATA_INIZIO'].dt.year

# Exclude year 2020. drop=True discards the old index directly instead of
# first materialising it as an 'index' column and deleting it in place.
# The fresh 0..n-1 index matters: the feature cell concatenates this data
# column-wise with a newly built one-hot frame.
tv = tv[tv['Year'] != 2020].reset_index(drop=True)

In [39]:
# Predictors for the TV models. Year is one-hot encoded below; every other
# column is passed through unchanged.
X = tv[['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
        'SCONTO_PERC', 'KPI_1',
        'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
        'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
        'DISPLAY_SIZE', 'HD_QUALITY', 'OLED', 'SMART',
        'MFM', 'WCG', 'MINILED', 'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity',
        'Year', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA']]
# Target kept as an (n, 1) column vector: the coefficient cell indexes
# lr.coef_ as a 2-D array, which relies on this shape.
y = tv[['QTA']].values

from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Taking the default (sparse) output and densifying it with
# .toarray() gives the same matrix on every release.
ohe = OneHotEncoder()
oheTransform = ohe.fit_transform(X[['Year']]).toarray()
# Reuse X's index so the column-wise concat below aligns row by row.
oheTransform = pd.DataFrame(oheTransform, columns=ohe.get_feature_names_out(), index=X.index)

# NOTE(review): every Year category is kept (no `drop`), so the dummies sum
# to 1 in every row and are collinear with a regression intercept; consider
# drop='first' if the linear coefficients need to be interpretable.
final_df = pd.concat([X, oheTransform], axis=1)
final_df = final_df.drop(['Year'], axis = 1)

## BENCHMARK

In [40]:
from sklearn.metrics import mean_squared_error as mse

# Baseline forecast: QTA_storico divided by 30, multiplied by the flyer
# duration in days, evaluated on rows that have a QTA value.
# NOTE(review): presumably QTA_storico is a 30-day quantity - confirm.
bench = tv.dropna(subset=['QTA']).copy(deep=True)
bench['Previsione'] = (bench['QTA_storico'] / 30) * bench['DURATA_VOLANTINO_IN_GIORNI']

# Root of the mean squared error between actual and baseline quantities.
rmse = np.sqrt(mse(bench['QTA'], bench['Previsione']))
print(f'RMSE = {rmse}')

RMSE = 399.47158405944776


## LINEAR REGRESSION

In [41]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardised copy of the whole feature matrix. It is no longer used for
# cross-validation, but the coefficient cell fits `lr` on it.
scaler = StandardScaler()
final_df_scaled = scaler.fit_transform(final_df)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LinearRegression()

# Cross-validate a scaler + regressor pipeline on the unscaled features so
# the scaler is fitted on each training split only. Scaling the full matrix
# beforehand lets the held-out rows contribute to the mean/std used for
# preprocessing.
lr_cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
lr_scores = cross_val_score(lr_cv_pipeline, final_df, y, cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; flip the sign before taking the root.
lr_rmse_scores = np.sqrt(-lr_scores)

print(f"Linear Regression RMSE scores per fold: {lr_rmse_scores}")
print(f"Linear Regression Average RMSE: {lr_rmse_scores.mean()}")

Linear Regression RMSE scores per fold: [323.19336451 303.4929569  290.20487487 278.39455518 302.6866927
 255.77998823 511.38986648 263.63394978 236.59929532 231.34397537]
Linear Regression Average RMSE: 299.67195193289814


In [42]:
# Fit the linear model on the full scaled matrix and list one coefficient per feature.
lr.fit(final_df_scaled, y)

column_names = list(final_df.columns)
# coef_ is indexed as 2-D here (one row per target column of y); take its first row.
weights = lr.coef_[0].tolist()

print("\nCoefficients:")
for name, weight in zip(column_names, weights):
    print(f"{name}: {weight}")


Coefficients:
PREZZO_PROMO: 13.461367386992
PREZZO_LISTINO: -44.91865518041408
STOCK_PZ: 84.8258116630344
SCONTO_PERC: 51.521336080735125
KPI_1: 25.85859016291787
KPI_2: -9.963507614180248
KPI_3: 23.599443926610114
KPI_4: -2.2649740071414093
KPI_5: 25.2318926995451
QTA_storico: 80.75212810112525
FATTURATO_storico: -24.988800415182077
DURATA_VOLANTINO_IN_GIORNI: 75.26224338203362
DISPLAY_SIZE: -49.685395266951694
HD_QUALITY: -95.44720563821251
OLED: -5.903102106975784
SMART: 38.6570945015274
MFM: -4.429267106091771
WCG: -24.738847318594967
MINILED: -2.5602444291931747
SCONTO_PERC_MEDIO_VOLANTINO: 22.735144577536396
Month_Redditivity: 33.43352084557082
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 32.421504864508854
Year_2018: 13.92113265049297
Year_2019: -15.033746589628688
Year_2021: 33.83521689554803
Year_2022: 10.932122455674271
Year_2023: -48.7363282235248


## RANDOM FOREST

In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# 10-fold cross-validated RMSE of a random forest on the unscaled features.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Seed the forest as well as the splitter: without random_state every run grows
# different trees, so the reported scores could not be reproduced.
rfr = RandomForestRegressor(random_state=42)

# y.ravel() passes a 1-D target, matching the XGBoost cell and avoiding
# scikit-learn's column-vector conversion warning.
rfr_scores = cross_val_score(rfr, final_df, y.ravel(), cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values, so flip the sign before taking the square root.
rfr_rmse_scores = np.sqrt(-rfr_scores)

print(f"Random Forest RMSE scores per fold: {rfr_rmse_scores}")
print(f"Random Forest Average RMSE: {rfr_rmse_scores.mean()}")

Random Forest RMSE scores per fold: [334.15114318 303.59344077 308.92988125 267.0012453  297.44900724
 262.11265741 465.43973154 223.46950925 241.7487937  235.63828323]
Random Forest Average RMSE: 293.953369288072


In [44]:
# Refit the forest on the full dataset and list the importance of each feature.
rfr.fit(final_df, y)

print("Feature Importance:")
importances = rfr.feature_importances_
# Importances are positional: entry i belongs to the i-th column of final_df.
for position, column in enumerate(final_df.columns):
    print(f"{column}: {importances[position]}")

Feature Importance:
PREZZO_PROMO: 0.20615782672206545
PREZZO_LISTINO: 0.04624224379539963
STOCK_PZ: 0.11674017744571674
SCONTO_PERC: 0.06578978319793716
KPI_1: 0.04103109099628173
KPI_2: 0.030679218888065362
KPI_3: 0.046644665086384964
KPI_4: 0.023669268632608743
KPI_5: 0.026801472589663845
QTA_storico: 0.1720575695702539
FATTURATO_storico: 0.049204926158081905
DURATA_VOLANTINO_IN_GIORNI: 0.026236307339956206
DISPLAY_SIZE: 0.014348274305982438
HD_QUALITY: 0.005680962823390373
OLED: 0.00010753670461626869
SMART: 0.0017941985702209925
MFM: 0.0006533379908260178
WCG: 0.0017291232676180878
MINILED: 4.85796898606418e-05
SCONTO_PERC_MEDIO_VOLANTINO: 0.06376400058840286
Month_Redditivity: 0.00947478555599921
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.021350043942431042
Year_2018: 0.003344725310835771
Year_2019: 0.0033893855376835526
Year_2021: 0.007205252412137268
Year_2022: 0.010532957179222641
Year_2023: 0.005322285698357097


## XGBoost

In [45]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score

# 10-fold cross-validated RMSE of an XGBoost regressor with default settings.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
xgb_model = xgb.XGBRegressor()

xgb_scores = cross_val_score(
    estimator=xgb_model,
    X=final_df,
    y=y.ravel(),  # flatten the 2-D target to 1-D
    cv=kf,
    scoring='neg_mean_squared_error',
)
# Scores are negated MSE values, so negate them back before the square root.
xgb_rmse_scores = np.sqrt(np.negative(xgb_scores))

print(f"XGBoost RMSE scores per fold: {xgb_rmse_scores}")
print(f"XGBoost Average RMSE: {xgb_rmse_scores.mean()}")

XGBoost RMSE scores per fold: [327.54174752 276.20433432 318.03415787 269.32311311 267.39492737
 281.98693439 467.6297806  235.41796955 248.16342783 239.64891818]
XGBoost Average RMSE: 293.13453107499697


In [46]:
# Train a fresh default XGBoost regressor on the full dataset and list the
# importance of each feature.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(final_df, y.ravel())

print("Feature Importance:")
importances = xgb_model.feature_importances_
# Importances are positional: entry i belongs to the i-th column of final_df.
for position, column in enumerate(final_df.columns):
    print(f"{column}: {importances[position]}")

Feature Importance:
PREZZO_PROMO: 0.08433965593576431
PREZZO_LISTINO: 0.014649502001702785
STOCK_PZ: 0.026334092020988464
SCONTO_PERC: 0.029544295743107796
KPI_1: 0.010828118771314621
KPI_2: 0.02727380022406578
KPI_3: 0.03201192989945412
KPI_4: 0.009447255171835423
KPI_5: 0.014794709160923958
QTA_storico: 0.07458850741386414
FATTURATO_storico: 0.013800864107906818
DURATA_VOLANTINO_IN_GIORNI: 0.03268468379974365
DISPLAY_SIZE: 0.02062912844121456
HD_QUALITY: 0.027758410200476646
OLED: 0.0007654921500943601
SMART: 0.04148885980248451
MFM: 0.1664208471775055
WCG: 0.003943315707147121
MINILED: 0.00043265652493573725
SCONTO_PERC_MEDIO_VOLANTINO: 0.04337825998663902
Month_Redditivity: 0.024125369265675545
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.03452567383646965
Year_2018: 0.013891573995351791
Year_2019: 0.020845696330070496
Year_2021: 0.06358927488327026
Year_2022: 0.10309073328971863
Year_2023: 0.06481732428073883
