In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Timedelta
import numpy as np
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas SettingWithCopyWarning and scikit-learn
# deprecation / failed-fit warnings raised by the cells below.
warnings.simplefilter("ignore")

# Show every column when a DataFrame is displayed (no "..." truncation).
pd.options.display.max_columns = None

# SMARTPHONE

In [66]:
# Load the smartphone dataset and parse the DATA_INIZIO column as a datetime.
smartphones = pd.read_csv('./Working Code/Datasets/smartphones1.csv')
smartphones['DATA_INIZIO'] = pd.to_datetime(smartphones['DATA_INIZIO'])
smartphones['Year'] = smartphones['DATA_INIZIO'].dt.year

# Drop the 2020 rows and renumber the index 0..n-1 in one step.
# reset_index(drop=True) discards the old index directly instead of materialising
# it as an 'index' column and deleting it in place afterwards.
smartphones = smartphones[smartphones['Year'] != 2020].reset_index(drop=True)

In [67]:
from sklearn.preprocessing import OneHotEncoder

# Model inputs. DATA_INIZIO is carried along only so rows can be split by date later.
feature_cols = ['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
                'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
                'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
                'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
                'DISPLAY_SIZE', 'OPERATOR', 'NUMBER_OF_SIM',
                'CAPACITY', 'GENERATION', 'OPERATING_SYST', 'Month_Redditivity',
                'SCONTO_PERC_MEDIO_VOLANTINO', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA', 'DATA_INIZIO']
X = smartphones[feature_cols]

# Target column plus the date used for the train/test split.
y = smartphones[['QTA', 'DATA_INIZIO']]

# One-hot encode the categorical columns. The encoder keeps its default (sparse)
# output and is densified explicitly: the `sparse=` keyword was renamed to
# `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so passing either
# spelling ties the notebook to a version range.
categorical_cols = ['TIPOLOGIA_PRODOTTO', 'OPERATOR']
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat(axis=1) aligns on the index, so share X's index explicitly
)

# Append the indicator columns and remove the raw categorical ones.
final_df = pd.concat([X, oheTransform], axis=1).drop(columns=categorical_cols)

In [68]:
# Hold-out period: rows whose DATA_INIZIO falls in August, September or October 2023
# form the test set; everything else is used for training.
TEST_MONTHS = [8, 9, 10]
TEST_YEAR = 2023

X_dates = final_df['DATA_INIZIO']
y_dates = y['DATA_INIZIO']
X_is_test = X_dates.dt.month.isin(TEST_MONTHS) & (X_dates.dt.year == TEST_YEAR)
y_is_test = y_dates.dt.month.isin(TEST_MONTHS) & (y_dates.dt.year == TEST_YEAR)

# .drop() returns a new frame, so nothing is mutated in place on a slice of
# final_df / y (the pattern that triggers pandas' SettingWithCopyWarning).
X_train = final_df[~X_is_test].drop(columns='DATA_INIZIO')
X_test = final_df[X_is_test].drop(columns='DATA_INIZIO')

y_train = y[~y_is_test].drop(columns='DATA_INIZIO')
y_test = y[y_is_test].drop(columns='DATA_INIZIO')



## BENCHMARK

In [69]:
# Imported under its real name: the alias `mse` used previously is rebound to a
# float by the model cells further down (`mse = mean_squared_error(...)`).
from sklearn.metrics import mean_squared_error

# Naive baseline: historical quantity per day, multiplied by the flyer duration.
HISTORY_WINDOW_DAYS = 30  # presumably QTA_storico covers a ~30-day window — TODO confirm

bench = X_test.copy(deep=True)
bench['Previsione'] = bench['QTA_storico'] / HISTORY_WINDOW_DAYS
bench['Previsione'] *= bench['DURATA_VOLANTINO_IN_GIORNI']

rmse = np.sqrt(mean_squared_error(y_test['QTA'], bench['Previsione']))

# The model cells below work on plain NumPy targets; these stay 2-D, shape (n, 1),
# because y_train / y_test are single-column DataFrames at this point.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print(f'RMSE = {rmse}')

RMSE = 314.9338690616392


## LINEAR REGRESSION

In [70]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Keep the feature names: scaling below replaces the DataFrames with bare arrays.
column_names = list(X_train.columns)

# Standardise with statistics learned on the training rows only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Ordinary least squares (despite its name, `classifier` holds a regression model).
classifier = LinearRegression().fit(X_train, y_train)

# Negative predictions are replaced by zero.
raw_pred = classifier.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'RMSE = {rmse}')

RMSE = 306.9776763954178


In [57]:
# Coefficients of the fitted linear model, one per (standardised) input column.
# np.ravel flattens coef_ whether the model was fitted on a column-vector target
# (coef_ shape (1, n_features)) or on a 1-D target (shape (n_features,)).
coefficients = np.ravel(classifier.coef_).tolist()

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(coefficients):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model has "
        f"{len(coefficients)} coefficients; re-run the cell that fits the model."
    )

print("\nCoefficients:")
for feature, coef in zip(column_names, coefficients):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: 17.534558317709934
PREZZO_LISTINO: -158.7584875941524
STOCK_PZ: 117.81219375234733
SCONTO_PERC: 34.323918912757875
KPI_1: 58.906910109075696
KPI_2: -34.5523159948223
KPI_3: 109.24351348194458
KPI_4: 63.29688428520864
KPI_5: 1.2136439095072222
QTA_storico: 152.11634822222175
FATTURATO_storico: -28.603137121481453
DURATA_VOLANTINO_IN_GIORNI: 63.18163542377227
DISPLAY_SIZE: 80.54001923353225
NUMBER_OF_SIM: 0.07980573805180541
CAPACITY: 16.663197451565722
GENERATION: 3.772174381750905
OPERATING_SYST: 104.15737366189298
Month_Redditivity: 15.159451141879194
SCONTO_PERC_MEDIO_VOLANTINO: -21.468844621595412
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 40.8499112827952
TIPOLOGIA_PRODOTTO_ND: -10.121943731106093
TIPOLOGIA_PRODOTTO_Principale: 46.27263681145886
TIPOLOGIA_PRODOTTO_Sostitutivo: -25.092120845551037
OPERATOR_SIM FREE: -11.954221577610245
OPERATOR_TIM: 8.001468513237976
OPERATOR_VODAFONE: 2.1498371869197506
OPERATOR_WINDTRE: 10.711581496254192


## RANDOM FOREST

In [58]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error  # imported here so the cell does not rely on an earlier one
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800],
    'max_features': ['sqrt'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=69),
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   cv=3,
                                   scoring='neg_mean_squared_error',
                                   random_state=42,
                                   # Candidate fits are independent; run them in parallel,
                                   # as the XGBoost search in this notebook already does.
                                   n_jobs=-1)

# The forest expects a 1-D target. Flattening the (n, 1) array avoids the
# column-vector DataConversionWarning without changing the fitted model.
random_search.fit(X_train, np.ravel(y_train))

print("Hyperparameters:")
print(random_search.best_params_)

# Evaluate the refitted best candidate on the hold-out rows; negative
# predictions are replaced by zero.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')


Hyperparameters:
{'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}
RMSE con i migliori iperparametri = 327.0856605338226


In [71]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Configuration pinned by hand so the forest can be refit without repeating the search.
hyperparameters = {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}

rfr = RandomForestRegressor(random_state=69, **hyperparameters)
# The forest expects a 1-D target. Flattening the (n, 1) array avoids the
# column-vector DataConversionWarning without changing the fitted model.
rfr.fit(X_train, np.ravel(y_train))

# Negative predictions are replaced by zero.
y_pred = rfr.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

# A bare mean_absolute_error(...) call used to sit here; its result was discarded
# (it was not the last expression of the cell), so the no-op was removed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE = {rmse}')

RMSE = 327.0856605338226


In [60]:
# Feature importances of the fitted random forest, listed next to the column names.
importances = rfr.feature_importances_

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(importances):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model reports "
        f"{len(importances)} importances; re-run the cells that define them."
    )

print("Feature Importance:")
for feature, importance in zip(column_names, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.0682482333986906
PREZZO_LISTINO: 0.06049223760459818
STOCK_PZ: 0.14414385156091836
SCONTO_PERC: 0.05077180581787915
KPI_1: 0.05010833635289502
KPI_2: 0.04000710026599493
KPI_3: 0.08517534211305161
KPI_4: 0.03278810215676554
KPI_5: 0.02984694629977493
QTA_storico: 0.15007972697360702
FATTURATO_storico: 0.08638124142453178
DURATA_VOLANTINO_IN_GIORNI: 0.031405076239882646
DISPLAY_SIZE: 0.03527395944427165
NUMBER_OF_SIM: 0.0026322829020369656
CAPACITY: 0.015197082213772492
GENERATION: 0.004561971311100102
OPERATING_SYST: 0.0014228259988885894
Month_Redditivity: 0.011490479583186868
SCONTO_PERC_MEDIO_VOLANTINO: 0.03567568855513191
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.019230135931100754
TIPOLOGIA_PRODOTTO_ND: 0.006368935162313195
TIPOLOGIA_PRODOTTO_Principale: 0.015657218269028658
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.005208435815358113
OPERATOR_SIM FREE: 0.006820208032711593
OPERATOR_TIM: 0.0038949949301724175
OPERATOR_VODAFONE: 0.0030520817004791485
OPERATOR_W

## XGBOOST

In [61]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Search space for the gradient-boosted trees.
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=list(range(3, 11)),
    min_child_weight=[1, 2, 3, 4, 5],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3, 0.4],
)

# 100 sampled candidates, 3-fold CV, ranked by (negated) mean squared error.
search_settings = dict(
    n_iter=100,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(xgb.XGBRegressor(random_state=69), param_grid, **search_settings)
random_search.fit(X_train, y_train)

print("Migliori iperparametri:")
print(random_search.best_params_)

# Score the refitted best candidate on the hold-out rows; negative predictions
# are replaced by zero.
best_model = random_search.best_estimator_
raw_pred = best_model.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

Migliori iperparametri:
{'subsample': 1.0, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.6}
RMSE con i migliori iperparametri = 341.2504337269346


In [72]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Configuration pinned by hand so the model can be refit without repeating the search.
hyperparameters = dict(
    subsample=1.0,
    n_estimators=400,
    min_child_weight=1,
    max_depth=6,
    learning_rate=0.05,
    gamma=0.2,
    colsample_bytree=0.6,
)

xgbr = xgb.XGBRegressor(random_state=69, **hyperparameters)
xgbr.fit(X_train, y_train)

# Negative predictions are replaced by zero before scoring.
raw_pred = xgbr.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE = {rmse}')


RMSE = 341.2504337269346


In [63]:

# Feature importances of the fitted XGBoost model, listed next to the column names.
importances = xgbr.feature_importances_

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(importances):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model reports "
        f"{len(importances)} importances; re-run the cells that define them."
    )

print("Feature Importance:")
for feature, importance in zip(column_names, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.03458721190690994
PREZZO_LISTINO: 0.0357833132147789
STOCK_PZ: 0.07435755431652069
SCONTO_PERC: 0.03712259978055954
KPI_1: 0.0437902994453907
KPI_2: 0.027001013979315758
KPI_3: 0.028121542185544968
KPI_4: 0.026619447395205498
KPI_5: 0.022879458963871002
QTA_storico: 0.13976448774337769
FATTURATO_storico: 0.02078085206449032
DURATA_VOLANTINO_IN_GIORNI: 0.03696943074464798
DISPLAY_SIZE: 0.05150375887751579
NUMBER_OF_SIM: 0.012760818004608154
CAPACITY: 0.023385334759950638
GENERATION: 0.025875283405184746
OPERATING_SYST: 0.013070090673863888
Month_Redditivity: 0.028848515823483467
SCONTO_PERC_MEDIO_VOLANTINO: 0.023975513875484467
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.01915685646235943
TIPOLOGIA_PRODOTTO_ND: 0.05496830865740776
TIPOLOGIA_PRODOTTO_Principale: 0.05338415503501892
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.0346207395195961
OPERATOR_SIM FREE: 0.03378095477819443
OPERATOR_TIM: 0.023993076756596565
OPERATOR_VODAFONE: 0.03483058512210846
OPERATOR_WINDTRE: 

# PC

In [73]:
# Load the PC dataset and parse the DATA_INIZIO column as a datetime.
pc = pd.read_csv('./Working Code/Datasets/pc1.csv')
pc['DATA_INIZIO'] = pd.to_datetime(pc['DATA_INIZIO'])
pc['Year'] = pc['DATA_INIZIO'].dt.year

# Drop the 2020 rows and renumber the index 0..n-1 in one step.
# reset_index(drop=True) discards the old index directly instead of materialising
# it as an 'index' column and deleting it in place afterwards.
pc = pc[pc['Year'] != 2020].reset_index(drop=True)

In [74]:
from sklearn.preprocessing import OneHotEncoder

# Model inputs. DATA_INIZIO is carried along only so rows can be split by date later.
feature_cols = ['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
                'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
                'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
                'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
                'STORAGE_GB', 'RAM_GB', 'DISPLAY_SIZE', 'CONVERTIBLE',
                'SCONTO_PERC_MEDIO_VOLANTINO', 'VERSION_NOT_DEFINED',
                'VERSION_INTEL', 'VERSION_APPLE', 'VERSION_RAD',
                'VERSION_GEF', 'QUALITY_VERSION', 'QUALITY_PROCESSOR',
                'Month_Redditivity', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA', 'DATA_INIZIO']
X = pc[feature_cols]

# Target column plus the date used for the train/test split.
y = pc[['QTA', 'DATA_INIZIO']]

# One-hot encode the categorical column. The encoder keeps its default (sparse)
# output and is densified explicitly: the `sparse=` keyword was renamed to
# `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so passing either
# spelling ties the notebook to a version range.
categorical_cols = ['TIPOLOGIA_PRODOTTO']
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat(axis=1) aligns on the index, so share X's index explicitly
)

# Append the indicator columns and remove the raw categorical one.
final_df = pd.concat([X, oheTransform], axis=1).drop(columns=categorical_cols)

In [75]:
# Hold-out period: rows whose DATA_INIZIO falls in August, September or October 2023
# form the test set; everything else is used for training.
TEST_MONTHS = [8, 9, 10]
TEST_YEAR = 2023

X_dates = final_df['DATA_INIZIO']
y_dates = y['DATA_INIZIO']
X_is_test = X_dates.dt.month.isin(TEST_MONTHS) & (X_dates.dt.year == TEST_YEAR)
y_is_test = y_dates.dt.month.isin(TEST_MONTHS) & (y_dates.dt.year == TEST_YEAR)

# .drop() returns a new frame, so nothing is mutated in place on a slice of
# final_df / y (the pattern that triggers pandas' SettingWithCopyWarning).
X_train = final_df[~X_is_test].drop(columns='DATA_INIZIO')
X_test = final_df[X_is_test].drop(columns='DATA_INIZIO')

y_train = y[~y_is_test].drop(columns='DATA_INIZIO')
y_test = y[y_is_test].drop(columns='DATA_INIZIO')



## BENCHMARK

In [76]:
# Imported under its real name: the alias `mse` used previously is rebound to a
# float by the model cells further down (`mse = mean_squared_error(...)`).
from sklearn.metrics import mean_squared_error

# Naive baseline: historical quantity per day, multiplied by the flyer duration.
HISTORY_WINDOW_DAYS = 30  # presumably QTA_storico covers a ~30-day window — TODO confirm

bench = X_test.copy(deep=True)
bench['Previsione'] = bench['QTA_storico'] / HISTORY_WINDOW_DAYS
bench['Previsione'] *= bench['DURATA_VOLANTINO_IN_GIORNI']

rmse = np.sqrt(mean_squared_error(y_test['QTA'], bench['Previsione']))

# The model cells below work on plain NumPy targets; these stay 2-D, shape (n, 1),
# because y_train / y_test are single-column DataFrames at this point.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print(f'RMSE = {rmse}')

RMSE = 257.31527639463883


## LINEAR REGRESSION

In [77]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Keep the feature names: scaling below replaces the DataFrames with bare arrays.
column_names = list(X_train.columns)

# Standardise with statistics learned on the training rows only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Ordinary least squares (despite its name, `classifier` holds a regression model).
classifier = LinearRegression().fit(X_train, y_train)

# Negative predictions are replaced by zero.
raw_pred = classifier.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'RMSE = {rmse}')

RMSE = 178.03943167198634


In [78]:
# Coefficients of the fitted linear model, one per (standardised) input column.
# np.ravel flattens coef_ whether the model was fitted on a column-vector target
# (coef_ shape (1, n_features)) or on a 1-D target (shape (n_features,)).
coefficients = np.ravel(classifier.coef_).tolist()

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(coefficients):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model has "
        f"{len(coefficients)} coefficients; re-run the cell that fits the model."
    )

print("\nCoefficients:")
for feature, coef in zip(column_names, coefficients):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: -138.7694415003767
PREZZO_LISTINO: 23.06108478940054
STOCK_PZ: 41.13632302226836
SCONTO_PERC: 14.700580046381019
KPI_1: 55.12115928637761
KPI_2: -24.458657882430362
KPI_3: -4.413866992206581
KPI_4: 22.655119472933187
KPI_5: -19.78091555261102
QTA_storico: 21.17436319211781
FATTURATO_storico: 2.2637614834903017
DURATA_VOLANTINO_IN_GIORNI: 55.52169970789217
STORAGE_GB: 5.254122869917298
RAM_GB: 13.71477918543268
DISPLAY_SIZE: 109.86179156027421
CONVERTIBLE: -14.279831503720741
SCONTO_PERC_MEDIO_VOLANTINO: 35.11049748043353
VERSION_NOT_DEFINED: -23.782773755016287
VERSION_INTEL: -10.158619399487707
VERSION_APPLE: 89.70372032272562
VERSION_RAD: -5.260305820844044
VERSION_GEF: -54.835564631409234
QUALITY_VERSION: 3.0246782903470866
QUALITY_PROCESSOR: 14.763504733216585
Month_Redditivity: 11.740296883413542
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 11.340719472530553
TIPOLOGIA_PRODOTTO_ND: 67.93550317177618
TIPOLOGIA_PRODOTTO_Principale: -33.19858440135238
TIPOLOGIA_PRODO

## RANDOM FOREST

In [80]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error  # imported here so the cell does not rely on an earlier one
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    # 1.0 (use all features) is what 'auto' meant for a forest regressor; the
    # string was deprecated in scikit-learn 1.1 and removed in 1.3, where candidates
    # using it fail to fit and are scored NaN.
    'max_features': [1.0, 'sqrt'],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=69),
                                   param_distributions=param_grid,
                                   n_iter=10,
                                   cv=5,
                                   scoring='neg_mean_squared_error',
                                   random_state=42,
                                   # Candidate fits are independent; run them in parallel.
                                   n_jobs=-1)

# The forest expects a 1-D target. Flattening the (n, 1) array avoids the
# column-vector DataConversionWarning without changing the fitted model.
random_search.fit(X_train, np.ravel(y_train))

print("Hyperparameters:")
print(random_search.best_params_)

# Evaluate the refitted best candidate on the hold-out rows; negative
# predictions are replaced by zero.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

Hyperparameters:
{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': False}
RMSE con i migliori iperparametri = 164.87563518281718


In [79]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Configuration pinned by hand so the forest can be refit without repeating the search.
hyperparameters = {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': False}

rfr = RandomForestRegressor(random_state=69, **hyperparameters)
# The forest expects a 1-D target. Flattening the (n, 1) array avoids the
# column-vector DataConversionWarning without changing the fitted model.
rfr.fit(X_train, np.ravel(y_train))

# Negative predictions are replaced by zero.
y_pred = rfr.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

# A bare mean_absolute_error(...) call used to sit here; its result was discarded
# (it was not the last expression of the cell), so the no-op was removed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE = {rmse}')

RMSE = 164.87563518281718


In [25]:
# Feature importances of the fitted random forest, listed next to the column names.
importances = rfr.feature_importances_

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(importances):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model reports "
        f"{len(importances)} importances; re-run the cells that define them."
    )

print("Feature Importance:")
for feature, importance in zip(column_names, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.03434348106384277
PREZZO_LISTINO: 0.036376625299453735
STOCK_PZ: 0.0765182226896286
SCONTO_PERC: 0.03906714543700218
KPI_1: 0.04586119204759598
KPI_2: 0.02639801800251007
KPI_3: 0.027691323310136795
KPI_4: 0.029260994866490364
KPI_5: 0.02309444360435009
QTA_storico: 0.14724865555763245
FATTURATO_storico: 0.02183818817138672
DURATA_VOLANTINO_IN_GIORNI: 0.03699907287955284
STORAGE_GB: 0.05073227360844612
RAM_GB: 0.00991913489997387
DISPLAY_SIZE: 0.023333264514803886
CONVERTIBLE: 0.03511432185769081
SCONTO_PERC_MEDIO_VOLANTINO: 0.01669209450483322
VERSION_NOT_DEFINED: 0.02501477487385273
VERSION_INTEL: 0.023957757279276848
VERSION_APPLE: 0.017712589353322983
VERSION_RAD: 0.03458438068628311
VERSION_GEF: 0.05390232801437378
QUALITY_VERSION: 0.03423963859677315
QUALITY_PROCESSOR: 0.03709931671619415
Month_Redditivity: 0.02064560167491436
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.035571519285440445
TIPOLOGIA_PRODOTTO_ND: 0.036783616989851


## XGBOOST

In [82]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Search space for the gradient-boosted trees.
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=list(range(3, 11)),
    min_child_weight=[1, 2, 3, 4, 5],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3, 0.4],
)

# 100 sampled candidates, 3-fold CV, ranked by (negated) mean squared error.
search_settings = dict(
    n_iter=100,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search = RandomizedSearchCV(xgb.XGBRegressor(random_state=69), param_grid, **search_settings)
random_search.fit(X_train, y_train)

print("Migliori iperparametri:")
print(random_search.best_params_)

# Score the refitted best candidate on the hold-out rows; negative predictions
# are replaced by zero.
best_model = random_search.best_estimator_
raw_pred = best_model.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

Migliori iperparametri:
{'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 2, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0.4, 'colsample_bytree': 1.0}
RMSE con i migliori iperparametri = 163.6359501820956


In [81]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Configuration pinned by hand so the model can be refit without repeating the search.
# NOTE(review): these values differ from the best_params_ shown in the saved output
# of the search cell (subsample 1.0, n_estimators 500, min_child_weight 2,
# max_depth 4, gamma 0.4) — confirm which set is intended.
hyperparameters = dict(
    subsample=0.9,
    n_estimators=400,
    min_child_weight=1,
    max_depth=5,
    learning_rate=0.01,
    gamma=0.2,
    colsample_bytree=1.0,
)

xgbr = xgb.XGBRegressor(random_state=69, **hyperparameters)
xgbr.fit(X_train, y_train)

# Negative predictions are replaced by zero before scoring.
raw_pred = xgbr.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE = {rmse}')

RMSE = 162.3577202925557


In [83]:
# Feature importances of the fitted XGBoost model, listed next to the column names.
importances = xgbr.feature_importances_

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(importances):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model reports "
        f"{len(importances)} importances; re-run the cells that define them."
    )

print("Feature Importance:")
for feature, importance in zip(column_names, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.0737595185637474
PREZZO_LISTINO: 0.03400077670812607
STOCK_PZ: 0.027254972606897354
SCONTO_PERC: 0.027176346629858017
KPI_1: 0.04933350533246994
KPI_2: 0.02869297005236149
KPI_3: 0.022554220631718636
KPI_4: 0.025020718574523926
KPI_5: 0.01722922921180725
QTA_storico: 0.020131496712565422
FATTURATO_storico: 0.02730761654675007
DURATA_VOLANTINO_IN_GIORNI: 0.018234428018331528
STORAGE_GB: 0.05538512393832207
RAM_GB: 0.031536225229501724
DISPLAY_SIZE: 0.07936739921569824
CONVERTIBLE: 0.009405390359461308
SCONTO_PERC_MEDIO_VOLANTINO: 0.048543088138103485
VERSION_NOT_DEFINED: 0.01128406636416912
VERSION_INTEL: 0.027129756286740303
VERSION_APPLE: 0.018814295530319214
VERSION_RAD: 0.025835566222667694
VERSION_GEF: 0.010476211085915565
QUALITY_VERSION: 0.04785388708114624
QUALITY_PROCESSOR: 0.020692160353064537
Month_Redditivity: 0.0317317470908165
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.019315728917717934
TIPOLOGIA_PRODOTTO_ND: 0.07054855674505234
TIPOLOGIA_PRODO

# WASH

In [84]:
# Load the washing-machine dataset and parse the DATA_INIZIO column as a datetime.
wash = pd.read_csv('./Working Code/Datasets/wash1.csv')
wash['DATA_INIZIO'] = pd.to_datetime(wash['DATA_INIZIO'])
wash['Year'] = wash['DATA_INIZIO'].dt.year

# Drop the 2020 rows and renumber the index 0..n-1 in one step.
# reset_index(drop=True) discards the old index directly instead of materialising
# it as an 'index' column and deleting it in place afterwards.
wash = wash[wash['Year'] != 2020].reset_index(drop=True)

In [85]:
from sklearn.preprocessing import OneHotEncoder

# Model inputs. DATA_INIZIO is carried along only so rows can be split by date later.
feature_cols = ['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
                'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
                'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
                'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
                'TYPE', 'TOPLOADING', 'LOADING_KG', 'ENERGY_CLASS',
                'DEPTH_CM>48', 'SMART_CONNECT', 'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity',
                'SCONTO_PERC_MEDIO_NOME_CAMPAGNA', 'DATA_INIZIO']
X = wash[feature_cols]

# Target column plus the date used for the train/test split.
y = wash[['QTA', 'DATA_INIZIO']]

# One-hot encode the categorical column. The encoder keeps its default (sparse)
# output and is densified explicitly: the `sparse=` keyword was renamed to
# `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so passing either
# spelling ties the notebook to a version range.
categorical_cols = ['TIPOLOGIA_PRODOTTO']
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat(axis=1) aligns on the index, so share X's index explicitly
)

# Append the indicator columns and remove the raw categorical one.
final_df = pd.concat([X, oheTransform], axis=1).drop(columns=categorical_cols)

In [86]:
# Hold-out period: rows whose DATA_INIZIO falls in August, September or October 2023
# form the test set; everything else is used for training.
TEST_MONTHS = [8, 9, 10]
TEST_YEAR = 2023

X_dates = final_df['DATA_INIZIO']
y_dates = y['DATA_INIZIO']
X_is_test = X_dates.dt.month.isin(TEST_MONTHS) & (X_dates.dt.year == TEST_YEAR)
y_is_test = y_dates.dt.month.isin(TEST_MONTHS) & (y_dates.dt.year == TEST_YEAR)

# .drop() returns a new frame, so nothing is mutated in place on a slice of
# final_df / y (the pattern that triggers pandas' SettingWithCopyWarning).
X_train = final_df[~X_is_test].drop(columns='DATA_INIZIO')
X_test = final_df[X_is_test].drop(columns='DATA_INIZIO')

y_train = y[~y_is_test].drop(columns='DATA_INIZIO')
y_test = y[y_is_test].drop(columns='DATA_INIZIO')



## BENCHMARK

In [87]:
# Imported under its real name: the alias `mse` used previously is rebound to a
# float by the model cells further down (`mse = mean_squared_error(...)`).
from sklearn.metrics import mean_squared_error

# Naive baseline: historical quantity per day, multiplied by the flyer duration.
HISTORY_WINDOW_DAYS = 30  # presumably QTA_storico covers a ~30-day window — TODO confirm

bench = X_test.copy(deep=True)
bench['Previsione'] = bench['QTA_storico'] / HISTORY_WINDOW_DAYS
bench['Previsione'] *= bench['DURATA_VOLANTINO_IN_GIORNI']

rmse = np.sqrt(mean_squared_error(y_test['QTA'], bench['Previsione']))

# The model cells below work on plain NumPy targets; these stay 2-D, shape (n, 1),
# because y_train / y_test are single-column DataFrames at this point.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print(f'RMSE = {rmse}')

RMSE = 244.3826713055839


## LINEAR REGRESSION

In [88]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Keep the feature names: scaling below replaces the DataFrames with bare arrays.
column_names = list(X_train.columns)

# Standardise with statistics learned on the training rows only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Ordinary least squares (despite its name, `classifier` holds a regression model).
classifier = LinearRegression().fit(X_train, y_train)

# Negative predictions are replaced by zero.
raw_pred = classifier.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'RMSE = {rmse}')

RMSE = 160.3081728814228


In [89]:
# Coefficients of the fitted linear model, one per (standardised) input column.
# np.ravel flattens coef_ whether the model was fitted on a column-vector target
# (coef_ shape (1, n_features)) or on a 1-D target (shape (n_features,)).
coefficients = np.ravel(classifier.coef_).tolist()

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(coefficients):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model has "
        f"{len(coefficients)} coefficients; re-run the cell that fits the model."
    )

print("\nCoefficients:")
for feature, coef in zip(column_names, coefficients):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: -93.72977041379716
PREZZO_LISTINO: -13.920315447515343
STOCK_PZ: 7.702751440102822
SCONTO_PERC: 89.18928645286177
KPI_1: 21.944170165718226
KPI_2: 13.098000544983488
KPI_3: -16.2874156964759
KPI_4: -19.441410526173428
KPI_5: 8.565216780023087
QTA_storico: -9.019755657333652
FATTURATO_storico: 24.937779137969404
DURATA_VOLANTINO_IN_GIORNI: 102.83175847055267
TYPE: 30.087505765199236
TOPLOADING: -80.99738402161508
LOADING_KG: -60.686913287988105
ENERGY_CLASS: 20.179099426718864
DEPTH_CM>48: 48.28270370839417
SMART_CONNECT: 3.9503308311919128
SCONTO_PERC_MEDIO_VOLANTINO: -10.7026241603419
Month_Redditivity: -5.944550905768345
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 31.475619408672806
TIPOLOGIA_PRODOTTO_ND: 8.865678461805693
TIPOLOGIA_PRODOTTO_Principale: -5.827109133219141
TIPOLOGIA_PRODOTTO_Sostitutivo: -19.029980327177448


## RANDOM FOREST

In [91]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error  # imported here so the cell does not rely on an earlier one
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    # 1.0 (use all features) is what 'auto' meant for a forest regressor; the
    # string was deprecated in scikit-learn 1.1 and removed in 1.3, where candidates
    # using it fail to fit and are scored NaN.
    'max_features': [1.0, 'sqrt'],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=69),
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   cv=3,
                                   scoring='neg_mean_squared_error',
                                   random_state=42,
                                   # Candidate fits are independent; run them in parallel.
                                   n_jobs=-1)

# The forest expects a 1-D target. Flattening the (n, 1) array avoids the
# column-vector DataConversionWarning without changing the fitted model.
random_search.fit(X_train, np.ravel(y_train))

print("Hyperparameters:")
print(random_search.best_params_)

# Evaluate the refitted best candidate on the hold-out rows; negative
# predictions are replaced by zero.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

Hyperparameters:
{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
RMSE con i migliori iperparametri = 145.91339451585202


In [92]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Configuration pinned by hand so the forest can be refit without repeating the search.
hyperparameters = {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}

rfr = RandomForestRegressor(random_state=69, **hyperparameters)
# The forest expects a 1-D target. Flattening the (n, 1) array avoids the
# column-vector DataConversionWarning without changing the fitted model.
rfr.fit(X_train, np.ravel(y_train))

# Negative predictions are replaced by zero.
y_pred = rfr.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

# A bare mean_absolute_error(...) call used to sit here; its result was discarded
# (it was not the last expression of the cell), so the no-op was removed.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE = {rmse}')

RMSE = 145.91339451585202


In [93]:
# Feature importances of the fitted random forest, listed next to the column names.
importances = rfr.feature_importances_

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(importances):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model reports "
        f"{len(importances)} importances; re-run the cells that define them."
    )

print("Feature Importance:")
for feature, importance in zip(column_names, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.0737595185637474
PREZZO_LISTINO: 0.03400077670812607
STOCK_PZ: 0.027254972606897354
SCONTO_PERC: 0.027176346629858017
KPI_1: 0.04933350533246994
KPI_2: 0.02869297005236149
KPI_3: 0.022554220631718636
KPI_4: 0.025020718574523926
KPI_5: 0.01722922921180725
QTA_storico: 0.020131496712565422
FATTURATO_storico: 0.02730761654675007
DURATA_VOLANTINO_IN_GIORNI: 0.018234428018331528
TYPE: 0.05538512393832207
TOPLOADING: 0.031536225229501724
LOADING_KG: 0.07936739921569824
ENERGY_CLASS: 0.009405390359461308
DEPTH_CM>48: 0.048543088138103485
SMART_CONNECT: 0.01128406636416912
SCONTO_PERC_MEDIO_VOLANTINO: 0.027129756286740303
Month_Redditivity: 0.018814295530319214
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.025835566222667694
TIPOLOGIA_PRODOTTO_ND: 0.010476211085915565
TIPOLOGIA_PRODOTTO_Principale: 0.04785388708114624
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.020692160353064537


## XGBOOST

In [95]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Search space for the gradient-boosted trees.
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=list(range(3, 11)),
    min_child_weight=[1, 2, 3, 4, 5],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3, 0.4],
)

# 100 sampled candidates, 3-fold CV, ranked by (negated) mean squared error.
search_settings = dict(
    n_iter=100,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search = RandomizedSearchCV(xgb.XGBRegressor(random_state=69), param_grid, **search_settings)
random_search.fit(X_train, y_train)

print("Migliori iperparametri:")
print(random_search.best_params_)

# Score the refitted best candidate on the hold-out rows; negative predictions
# are replaced by zero.
best_model = random_search.best_estimator_
raw_pred = best_model.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

Migliori iperparametri:
{'subsample': 0.7, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.7}
RMSE con i migliori iperparametri = 159.87009724993806


In [96]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Configuration pinned by hand so the model can be refit without repeating the search.
hyperparameters = dict(
    subsample=0.7,
    n_estimators=200,
    min_child_weight=5,
    max_depth=7,
    learning_rate=0.05,
    gamma=0.3,
    colsample_bytree=0.7,
)

xgbr = xgb.XGBRegressor(random_state=69, **hyperparameters)
xgbr.fit(X_train, y_train)

# Negative predictions are replaced by zero before scoring.
raw_pred = xgbr.predict(X_test)
y_pred = np.where(raw_pred < 0, 0, raw_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE = {rmse}')

RMSE = 159.87009724993806


In [97]:
# Feature importances of the fitted XGBoost model, listed next to the column names.
importances = xgbr.feature_importances_

# zip() stops at the shorter input, so a column_names list left over from another
# dataset's section would silently mislabel and truncate the listing.
if len(column_names) != len(importances):
    raise ValueError(
        f"column_names has {len(column_names)} entries but the model reports "
        f"{len(importances)} importances; re-run the cells that define them."
    )

print("Feature Importance:")
for feature, importance in zip(column_names, importances):
    print(f"{feature}: {importance}")

Feature Importance:
PREZZO_PROMO: 0.11485069990158081
PREZZO_LISTINO: 0.05403902381658554
STOCK_PZ: 0.021450472995638847
SCONTO_PERC: 0.05353478714823723
KPI_1: 0.024790069088339806
KPI_2: 0.022039873525500298
KPI_3: 0.027293410152196884
KPI_4: 0.0365188866853714
KPI_5: 0.020258944481611252
QTA_storico: 0.030047409236431122
FATTURATO_storico: 0.018488574773073196
DURATA_VOLANTINO_IN_GIORNI: 0.08289302885532379
TYPE: 0.04803519695997238
TOPLOADING: 0.13952435553073883
LOADING_KG: 0.0278888288885355
ENERGY_CLASS: 0.06072785705327988
DEPTH_CM>48: 0.043386854231357574
SMART_CONNECT: 0.0262801144272089
SCONTO_PERC_MEDIO_VOLANTINO: 0.021745959296822548
Month_Redditivity: 0.043600015342235565
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.028933659195899963
TIPOLOGIA_PRODOTTO_ND: 0.020796457305550575
TIPOLOGIA_PRODOTTO_Principale: 0.019015561789274216
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.01385989598929882


# CORE WEAR

In [98]:
# Load the core-wear dataset and parse the DATA_INIZIO column as a datetime.
core_wear = pd.read_csv('./Working Code/Datasets/core_wear1.csv')
core_wear['DATA_INIZIO'] = pd.to_datetime(core_wear['DATA_INIZIO'])
core_wear['Year'] = core_wear['DATA_INIZIO'].dt.year

# Drop the 2020 rows and renumber the index 0..n-1 in one step.
# reset_index(drop=True) discards the old index directly instead of materialising
# it as an 'index' column and deleting it in place afterwards.
core_wear = core_wear[core_wear['Year'] != 2020].reset_index(drop=True)

In [99]:
from sklearn.preprocessing import OneHotEncoder

# Model inputs. DATA_INIZIO is carried along only so rows can be split by date later.
feature_cols = ['PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
                'TIPOLOGIA_PRODOTTO', 'SCONTO_PERC', 'KPI_1',
                'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5', 'QTA_storico',
                'FATTURATO_storico', 'DURATA_VOLANTINO_IN_GIORNI',
                'DISPLAY_QUALITY', 'BLUETOOTH', 'WIFI', 'DISPLAY_SIZE',
                'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity', 'SCONTO_PERC_MEDIO_NOME_CAMPAGNA', 'DATA_INIZIO']
X = core_wear[feature_cols]

# Target column plus the date used for the train/test split.
y = core_wear[['QTA', 'DATA_INIZIO']]

# One-hot encode the categorical column. The encoder keeps its default (sparse)
# output and is densified explicitly: the `sparse=` keyword was renamed to
# `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so passing either
# spelling ties the notebook to a version range.
categorical_cols = ['TIPOLOGIA_PRODOTTO']
ohe = OneHotEncoder()
oheTransform = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=X.index,  # concat(axis=1) aligns on the index, so share X's index explicitly
)

# Append the indicator columns and remove the raw categorical one.
final_df = pd.concat([X, oheTransform], axis=1).drop(columns=categorical_cols)

In [100]:
# The test set is August, September and October 2023; every other row is
# used for training.
feature_dates = final_df['DATA_INIZIO']
is_test_row = feature_dates.dt.month.isin([8, 9, 10]) & (feature_dates.dt.year == 2023)

# drop(columns=...) returns new frames instead of mutating filtered slices.
X_train = final_df.loc[~is_test_row].drop(columns='DATA_INIZIO')
X_test = final_df.loc[is_test_row].drop(columns='DATA_INIZIO')

# The target frame is filtered with a mask built from its own date column.
target_dates = y['DATA_INIZIO']
is_test_target = target_dates.dt.month.isin([8, 9, 10]) & (target_dates.dt.year == 2023)

y_train = y.loc[~is_test_target].drop(columns='DATA_INIZIO')
y_test = y.loc[is_test_target].drop(columns='DATA_INIZIO')

# Features and targets must stay row-aligned for every later cell.
assert len(X_train) == len(y_train), "train features/targets differ in length"
assert len(X_test) == len(y_test), "test features/targets differ in length"



## BENCHMARK

In [101]:
from sklearn.metrics import mean_squared_error as mse

# Naive baseline: QTA_storico / 30 multiplied by the flyer duration in days.
# NOTE(review): the /30 presumably treats QTA_storico as a 30-day total — confirm.
bench = X_test.copy(deep=True)
bench['Previsione'] = bench['QTA_storico']/30
bench['Previsione'] *= bench['DURATA_VOLANTINO_IN_GIORNI']

# Convert the targets to NumPy arrays first and score against the array.
# The cell previously indexed y_test['QTA'] and then replaced y_test with an
# ndarray, so running it a second time failed on that lookup.
# The arrays stay 2-D (single column), which the coefficient cell relies on.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

rmse = np.sqrt(mse(y_test.ravel(), bench['Previsione']))

print(f'RMSE = {rmse}')

RMSE = 150.37084341840824


## LINEAR REGRESSION

In [102]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Keep the column labels: scaling replaces the frames with plain arrays.
column_names = list(X_train.columns)

# Standardise both splits with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the linear model and floor negative predictions at zero.
classifier = LinearRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

# Test-set error metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'RMSE = {rmse}')

RMSE = 141.55032766962748


In [103]:
# Print one fitted coefficient per feature.
# np.ravel flattens coef_ whether it is 2-D (target passed as a single-column
# array) or 1-D, where indexing .tolist()[0] would return a bare float and
# break the zip().
print("\nCoefficients:")
for feature, coef in zip(column_names, np.ravel(classifier.coef_).tolist()):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: 118.04198694529569
PREZZO_LISTINO: -140.15572590771066
STOCK_PZ: 179.51077123442937
SCONTO_PERC: 35.792333791146866
KPI_1: 65.86165638691479
KPI_2: 32.39755848304105
KPI_3: 32.08800319720749
KPI_4: -3.1772423050651004
KPI_5: -4.151983517869958
QTA_storico: 105.38124914904586
FATTURATO_storico: -15.83582312815802
DURATA_VOLANTINO_IN_GIORNI: 16.838004220327498
DISPLAY_QUALITY: 11.026185479268793
BLUETOOTH: -2.9137679113442454
WIFI: -7.177363599349922
DISPLAY_SIZE: -37.221290978166586
SCONTO_PERC_MEDIO_VOLANTINO: 22.790023823630744
Month_Redditivity: -3.0447137636309023
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: -1.0732269628375464
TIPOLOGIA_PRODOTTO_ND: 12.060272494701326
TIPOLOGIA_PRODOTTO_Principale: -1.4564049824664562
TIPOLOGIA_PRODOTTO_Sostitutivo: -11.903155226467284


## RANDOM FOREST

In [104]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    # 1.0 means "use every feature at each split", which is what 'auto' meant
    # for a regressor. The 'auto' string is rejected by scikit-learn >= 1.3,
    # where those candidates would fail to fit.
    'max_features': [1.0, 'sqrt'],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# 100 sampled configurations, each scored with 3-fold CV on negative MSE.
random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=69),
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   cv=3,
                                   scoring='neg_mean_squared_error',
                                   random_state=42)

random_search.fit(X_train, y_train)

print("Hyperparameters:")
print(random_search.best_params_)

# Evaluate the refitted best estimator on the held-out months;
# negative predictions are floored at zero.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

Hyperparameters:
{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
RMSE con i migliori iperparametri = 131.195608146421


In [106]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Settings pinned from the randomized-search output recorded above.
hyperparameters = dict(
    n_estimators=300,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=40,
    bootstrap=False,
)

# Refit a forest with the pinned settings and predict the held-out months,
# flooring negative predictions at zero.
rfr = RandomForestRegressor(random_state=69, **hyperparameters)
rfr.fit(X_train, y_train)
y_pred = np.where(rfr.predict(X_test) < 0, 0, rfr.predict(X_test)) if False else rfr.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

# NOTE(review): the MAE is computed here but its value is neither stored nor shown.
mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE = {rmse}')

RMSE = 131.195608146421


In [107]:
# Show the random-forest importance score next to each feature name.
# Names and scores are matched by position.
print("Feature Importance:")
forest_scores = zip(column_names, rfr.feature_importances_)
for name, score in forest_scores:
    print(f"{name}: {score}")

Feature Importance:
PREZZO_PROMO: 0.16191831158855718
PREZZO_LISTINO: 0.12659043170303239
STOCK_PZ: 0.1639400933835142
SCONTO_PERC: 0.04793546871294562
KPI_1: 0.03690320131514168
KPI_2: 0.029139602122520283
KPI_3: 0.05240288049165252
KPI_4: 0.01261214692528774
KPI_5: 0.013210989562682079
QTA_storico: 0.12125312772747923
FATTURATO_storico: 0.04929710874775367
DURATA_VOLANTINO_IN_GIORNI: 0.01981224982305805
DISPLAY_QUALITY: 0.0036405960654144307
BLUETOOTH: 0.00021465579914120103
WIFI: 0.005853720745754182
DISPLAY_SIZE: 0.08209751330326953
SCONTO_PERC_MEDIO_VOLANTINO: 0.018853052613259758
Month_Redditivity: 0.023453317469701612
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.015041363848197369
TIPOLOGIA_PRODOTTO_ND: 0.00589910806240657
TIPOLOGIA_PRODOTTO_Principale: 0.00673744973554746
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.003193610253683366


## XGBOOST

In [108]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Candidate values for the XGBoost randomized search.
param_grid = {
    'n_estimators': list(range(100, 501, 100)),
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': list(range(3, 11)),
    'min_child_weight': list(range(1, 6)),
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
}

# 20 sampled configurations, each scored with 5-fold CV on negative MSE.
random_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(random_state=69),
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Migliori iperparametri:")
print(random_search.best_params_)

# Score the refitted best estimator on the held-out months;
# negative predictions are floored at zero.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

Migliori iperparametri:
{'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.9}
RMSE con i migliori iperparametri = 127.01051568818703


In [109]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Settings pinned from the randomized-search output recorded above.
hyperparameters = dict(
    subsample=0.6,
    n_estimators=500,
    min_child_weight=3,
    max_depth=9,
    learning_rate=0.05,
    gamma=0,
    colsample_bytree=0.9,
)

# Refit with the pinned settings and predict the held-out months,
# flooring negative predictions at zero.
xgbr = xgb.XGBRegressor(random_state=69, **hyperparameters)
xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE = {rmse}')

RMSE = 127.01051568818703


In [110]:
# Show the XGBoost importance score next to each feature name.
# Names and scores are matched by position.
print("Feature Importance:")
booster_scores = zip(column_names, xgbr.feature_importances_)
for name, score in booster_scores:
    print(f"{name}: {score}")

Feature Importance:
PREZZO_PROMO: 0.2206956297159195
PREZZO_LISTINO: 0.11870881915092468
STOCK_PZ: 0.08111002296209335
SCONTO_PERC: 0.018021322786808014
KPI_1: 0.06802632659673691
KPI_2: 0.031353287398815155
KPI_3: 0.018974702805280685
KPI_4: 0.018615098670125008
KPI_5: 0.014361381530761719
QTA_storico: 0.07723483443260193
FATTURATO_storico: 0.014256485737860203
DURATA_VOLANTINO_IN_GIORNI: 0.053632453083992004
DISPLAY_QUALITY: 0.010156884789466858
BLUETOOTH: 0.004326631315052509
WIFI: 0.009992126375436783
DISPLAY_SIZE: 0.038872603327035904
SCONTO_PERC_MEDIO_VOLANTINO: 0.017180096358060837
Month_Redditivity: 0.0330706387758255
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.016413243487477303
TIPOLOGIA_PRODOTTO_ND: 0.08865683525800705
TIPOLOGIA_PRODOTTO_Principale: 0.02845672145485878
TIPOLOGIA_PRODOTTO_Sostitutivo: 0.017883753404021263


# TV

In [111]:
# Load the TV dataset, parse the start date and keep every year but 2020.
tv = pd.read_csv('./Working Code/Datasets/tv1.csv')
tv['DATA_INIZIO'] = pd.to_datetime(tv['DATA_INIZIO'])
tv['Year'] = tv['DATA_INIZIO'].dt.year
# drop=True discards the old row labels directly, so no temporary 'index'
# column has to be created and then removed in place.
tv = tv[tv['Year'] != 2020].reset_index(drop=True)

In [112]:
# Feature columns for the TV models. DATA_INIZIO is carried along only so the
# rows can be split by date afterwards.
X = tv.loc[:, [
    'PREZZO_PROMO', 'PREZZO_LISTINO', 'STOCK_PZ',
    'SCONTO_PERC',
    'KPI_1', 'KPI_2', 'KPI_3', 'KPI_4', 'KPI_5',
    'QTA_storico', 'FATTURATO_storico',
    'DURATA_VOLANTINO_IN_GIORNI',
    'DISPLAY_SIZE', 'HD_QUALITY', 'OLED', 'SMART',
    'MFM', 'WCG', 'MINILED',
    'SCONTO_PERC_MEDIO_VOLANTINO', 'Month_Redditivity',
    'SCONTO_PERC_MEDIO_NOME_CAMPAGNA',
    'DATA_INIZIO',
]]
# Target column (QTA) together with the date used for the split.
y = tv.loc[:, ['QTA', 'DATA_INIZIO']]

# Nothing is one-hot encoded in this cell, so the feature frame is used as is.
final_df = X

In [113]:
# The test set is August, September and October 2023; every other row is
# used for training.
feature_dates = final_df['DATA_INIZIO']
is_test_row = feature_dates.dt.month.isin([8, 9, 10]) & (feature_dates.dt.year == 2023)

# drop(columns=...) returns new frames instead of mutating filtered slices.
X_train = final_df.loc[~is_test_row].drop(columns='DATA_INIZIO')
X_test = final_df.loc[is_test_row].drop(columns='DATA_INIZIO')

# The target frame is filtered with a mask built from its own date column.
target_dates = y['DATA_INIZIO']
is_test_target = target_dates.dt.month.isin([8, 9, 10]) & (target_dates.dt.year == 2023)

y_train = y.loc[~is_test_target].drop(columns='DATA_INIZIO')
y_test = y.loc[is_test_target].drop(columns='DATA_INIZIO')

# Features and targets must stay row-aligned for every later cell.
assert len(X_train) == len(y_train), "train features/targets differ in length"
assert len(X_test) == len(y_test), "test features/targets differ in length"



## BENCHMARK

In [114]:
from sklearn.metrics import mean_squared_error as mse

# Naive baseline: QTA_storico / 30 multiplied by the flyer duration in days.
# NOTE(review): the /30 presumably treats QTA_storico as a 30-day total — confirm.
bench = X_test.copy(deep=True)
bench['Previsione'] = bench['QTA_storico']/30
bench['Previsione'] *= bench['DURATA_VOLANTINO_IN_GIORNI']

# Convert the targets to NumPy arrays first and score against the array.
# The cell previously indexed y_test['QTA'] and then replaced y_test with an
# ndarray, so running it a second time failed on that lookup.
# The arrays stay 2-D (single column), which the coefficient cell relies on.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

rmse = np.sqrt(mse(y_test.ravel(), bench['Previsione']))

print(f'RMSE = {rmse}')

RMSE = 176.05099673189127


## LINEAR REGRESSION

In [115]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Keep the column labels: scaling replaces the frames with plain arrays.
column_names = list(X_train.columns)

# Standardise both splits with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the linear model and floor negative predictions at zero.
classifier = LinearRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

# Test-set error metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'RMSE = {rmse}')

RMSE = 221.59542912478605


In [None]:
# Print one fitted coefficient per feature.
# np.ravel flattens coef_ whether it is 2-D (target passed as a single-column
# array) or 1-D, where indexing .tolist()[0] would return a bare float and
# break the zip().
print("\nCoefficients:")
for feature, coef in zip(column_names, np.ravel(classifier.coef_).tolist()):
    print(f"{feature}: {coef}")


Coefficients:
PREZZO_PROMO: -54.97200550000509
PREZZO_LISTINO: 43.6910213487729
STOCK_PZ: 106.08775583736939
SCONTO_PERC: 5.380081840777786
KPI_1: 26.28705765415378
KPI_2: -23.64399387275635
KPI_3: -0.1937888254122382
KPI_4: 15.869674959644327
KPI_5: 52.00910545119117
QTA_storico: 85.73805603357593
FATTURATO_storico: 0.12751744047357036
DURATA_VOLANTINO_IN_GIORNI: 43.9338691007949
DISPLAY_SIZE: -23.24928248891727
HD_QUALITY: -77.7439544531675
OLED: -0.3607525653709332
SMART: 13.374124261005202
MFM: -9.24027300358304
WCG: -30.35886991498585
MINILED: 5.49215014518227
SCONTO_PERC_MEDIO_VOLANTINO: -1.2624199161045275
Month_Redditivity: 27.869248577086363
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 38.97369038569735


## RANDOM FOREST

In [117]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    # 1.0 means "use every feature at each split", which is what 'auto' meant
    # for a regressor. The 'auto' string is rejected by scikit-learn >= 1.3,
    # where those candidates would fail to fit.
    'max_features': [1.0, 'sqrt'],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# 100 sampled configurations, each scored with 3-fold CV on negative MSE.
random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=69),
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   cv=3,
                                   scoring='neg_mean_squared_error',
                                   random_state=42)

random_search.fit(X_train, y_train)

print("Hyperparameters:")
print(random_search.best_params_)

# Evaluate the refitted best estimator on the held-out months;
# negative predictions are floored at zero.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

Hyperparameters:
{'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}
RMSE con i migliori iperparametri = 202.8039203004248


In [118]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Settings pinned from the randomized-search output recorded above.
hyperparameters = dict(
    n_estimators=500,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=100,
    bootstrap=False,
)

# Refit a forest with the pinned settings and predict the held-out months,
# flooring negative predictions at zero.
rfr = RandomForestRegressor(random_state=69, **hyperparameters)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

# NOTE(review): the MAE is computed here but its value is neither stored nor shown.
mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE = {rmse}')

RMSE = 202.8039203004248


In [119]:
# Show the random-forest importance score next to each feature name.
# Names and scores are matched by position.
print("Feature Importance:")
forest_scores = zip(column_names, rfr.feature_importances_)
for name, score in forest_scores:
    print(f"{name}: {score}")

Feature Importance:
PREZZO_PROMO: 0.11473594016031426
PREZZO_LISTINO: 0.0895401901919833
STOCK_PZ: 0.0935117868426444
SCONTO_PERC: 0.06839071053002645
KPI_1: 0.048866548163420946
KPI_2: 0.0383082607677952
KPI_3: 0.06844460414906146
KPI_4: 0.04565854037818285
KPI_5: 0.04103478716784798
QTA_storico: 0.1063163795908548
FATTURATO_storico: 0.06690909276340098
DURATA_VOLANTINO_IN_GIORNI: 0.028745234379494836
DISPLAY_SIZE: 0.04497895834898972
HD_QUALITY: 0.030062143619115304
OLED: 0.0014001652117161412
SMART: 0.006165214505908794
MFM: 0.001275014045003325
WCG: 0.007698848887849501
MINILED: 0.0002084100029456676
SCONTO_PERC_MEDIO_VOLANTINO: 0.05792544847927301
Month_Redditivity: 0.014486076930736594
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.02533764488343455


## XGBOOST

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Candidate values for the XGBoost randomized search.
param_grid = {
    'n_estimators': list(range(100, 501, 100)),
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': list(range(3, 11)),
    'min_child_weight': list(range(1, 6)),
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
}

# 1000 sampled configurations x 3 folds, run in parallel on all cores.
random_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(random_state=69),
    param_distributions=param_grid,
    n_iter=1000,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

print("Migliori iperparametri:")
print(random_search.best_params_)

# Score the refitted best estimator on the held-out months;
# negative predictions are floored at zero.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE con i migliori iperparametri = {rmse}')

In [143]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Pinned XGBoost settings.
# NOTE(review): the search cell above has no recorded output, so the origin
# of these values cannot be checked from the notebook — confirm.
hyperparameters = dict(
    subsample=0.6,
    n_estimators=300,
    min_child_weight=2,
    max_depth=5,
    learning_rate=0.01,
    gamma=0,
    colsample_bytree=0.6,
)

# Fit with the pinned settings and predict the held-out months,
# flooring negative predictions at zero.
xgbr = xgb.XGBRegressor(random_state=69, **hyperparameters)
xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE = {rmse}')

RMSE = 174.58032381245732


In [144]:
# Show the XGBoost importance score next to each feature name.
# Names and scores are matched by position.
print("Feature Importance:")
booster_scores = zip(column_names, xgbr.feature_importances_)
for name, score in booster_scores:
    print(f"{name}: {score}")

Feature Importance:
PREZZO_PROMO: 0.08307284116744995
PREZZO_LISTINO: 0.04726877063512802
STOCK_PZ: 0.05090203136205673
SCONTO_PERC: 0.04794443026185036
KPI_1: 0.038954317569732666
KPI_2: 0.030493415892124176
KPI_3: 0.048473045229911804
KPI_4: 0.02254926785826683
KPI_5: 0.02982579730451107
QTA_storico: 0.07260873168706894
FATTURATO_storico: 0.04504093527793884
DURATA_VOLANTINO_IN_GIORNI: 0.022079210728406906
DISPLAY_SIZE: 0.11065320670604706
HD_QUALITY: 0.13343355059623718
OLED: 0.052585311233997345
SMART: 0.02962680347263813
MFM: 0.009874059818685055
WCG: 0.01753782480955124
MINILED: 0.010058106854557991
SCONTO_PERC_MEDIO_VOLANTINO: 0.050159960985183716
Month_Redditivity: 0.02180098183453083
SCONTO_PERC_MEDIO_NOME_CAMPAGNA: 0.025057435035705566
