In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

In [8]:
def data_preprocess(data):
    """Clean a raw product frame and return the result as a new DataFrame.

    Steps, in order:
      * drop the 'index' and 'id_product' columns;
      * fill missing 'Product Life cycel status' values with 'Active';
      * turn 'Reference proxy' into an int by parsing everything after the
        first 10 characters of each value;
      * fill every remaining missing value, in any column, with the string '0';
      * convert 'Month 1', 'Month 2' and 'Month 3' to int after removing
        space characters (e.g. '1 234' -> 1234).

    The input frame is left untouched, so the function can safely be called
    again on the same object (for example when a notebook cell is re-run).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all the columns named above.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (a different object from the input).

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    ValueError
        If a 'Reference proxy' suffix or a month value cannot be parsed by int().
    """
    # drop() without inplace returns a new frame, so every assignment below
    # lands on that new object and the caller's frame keeps all its columns.
    data = data.drop(columns=['index', 'id_product'])

    data['Product Life cycel status'] = data['Product Life cycel status'].fillna('Active')

    data['Reference proxy'] = [int(s[10:]) for s in data['Reference proxy']]

    # Missing-data handling: every remaining NaN (month columns and any other
    # column) becomes the string '0', which the int conversion below turns into 0.
    data = data.fillna('0')

    for month in ('Month 1', 'Month 2', 'Month 3'):
        # str() first so values pandas already parsed as integers are handled too.
        data[month] = [int(str(s).replace(' ', '')) for s in data[month].values]

    return data

In [9]:
# Load the training table. The path is relative to the notebook's working
# directory and uses the same 'datasets_hi4' folder as the X_test.csv read in
# the prediction cell, so the notebook does not depend on one user's profile.
data_train = pd.read_csv('datasets_hi4/train-data.csv', sep=';')

# 'Month 4' is the regression target: take it out of the feature frame and
# parse it to int after removing space characters. str() keeps the parsing
# consistent with the month columns handled in data_preprocess().
y = np.array([int(str(s).replace(' ', '')) for s in data_train.pop('Month 4')])

data = data_preprocess(data_train)

In [10]:
# Split the preprocessed frame into the numeric columns (used as-is) and the
# remaining columns (ordinal-encoded below). Nothing is dropped in place, so
# `data` stays intact and this cell can be re-run without a KeyError.
numeric_cols = ['Reference proxy', 'Month 1', 'Month 2', 'Month 3']
X_values = data[numeric_cols].values
X_Encodes = data.drop(columns=numeric_cols).values

# Label encoding
# Categories that were not present at fit time are encoded as -1 instead of
# raising, so enc.transform() keeps working on frames with unseen values.
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_Encodes = enc.fit_transform(X_Encodes)

X = np.concatenate((X_Encodes, X_values), axis=1)

# Scaling the features
scaler_X = MinMaxScaler()
X = scaler_X.fit_transform(X)

# Scaling the target
# The scaled target gets its own name: overwriting `y` would make a re-run of
# this cell fit scaler_y on already-scaled values, after which
# scaler_y.inverse_transform() would no longer return original units.
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(np.asarray(y).reshape(-1, 1))

# NOTE(review): the encoder and both scalers are fitted on all rows before the
# split, so validation rows take part in fitting them.
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y_scaled, test_size=0.2, random_state=0)

#### LinearRegression

In [14]:
# Fit the linear baseline on the training split (target flattened to 1-D).
lr = LinearRegression().fit(X_train, y_train.ravel())

# Predict on validation set for LinearRegression
y_val_pred_lr = lr.predict(X_val)

# Metrics, computed on the target as passed to fit() (i.e. y_val as-is)
mae_lr, mse_lr, r2_lr = (
    metric(y_val, y_val_pred_lr)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(
    "LinearRegression Metrics:",
    "Mean Absolute Error (MAE): " + str(mae_lr),
    "Mean Squared Error (MSE): " + str(mse_lr),
    "R^2 Score: " + str(r2_lr),
    sep="\n",
)

LinearRegression Metrics:
Mean Absolute Error (MAE): 0.00011403726328207897
Mean Squared Error (MSE): 6.785461655746554e-06
R^2 Score: 0.43968854057255813


#### Gradient Boosting Regressor

In [None]:
# Gradient Boosting Regressor: build the estimator and fit it on the training
# split in one expression (fit() returns the fitted estimator itself).
gbr = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    random_state=0,
).fit(X_train, y_train.ravel())

In [None]:
# Validation-set predictions from the fitted Gradient Boosting Regressor
y_val_pred_gbr = gbr.predict(X_val)

# Metrics, computed on the target as passed to fit() (i.e. y_val as-is)
mae_gbr, mse_gbr, r2_gbr = (
    metric(y_val, y_val_pred_gbr)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(
    "Gradient Boosting Regressor Metrics:",
    "Mean Absolute Error (MAE): " + str(mae_gbr),
    "Mean Squared Error (MSE): " + str(mse_gbr),
    "R^2 Score: " + str(r2_gbr),
    sep="\n",
)

#### Prediction

In [None]:
# --- Preparing input X_test ---
data = pd.read_csv('datasets_hi4/X_test.csv', sep=';')
# Saved before preprocessing: the frame returned by data_preprocess() no longer
# has the 'index' column, and the submission file needs it.
index = data['index'].values
data = data_preprocess(data)

# Same numeric / encoded column split as used for the training features.
numeric_cols = ['Reference proxy', 'Month 1', 'Month 2', 'Month 3']
X_values = data[numeric_cols].values
X_Encodes = data.drop(columns=numeric_cols).values

# Label encoding (reuses the encoder fitted on the training data)
X_Encodes = enc.transform(X_Encodes)
X_test = np.concatenate((X_Encodes, X_values), axis=1)

# Scaling the features (reuses the scaler fitted on the training data)
X_test = scaler_X.transform(X_test)

# --- Prediction ---
y_test_pred_gbr = gbr.predict(X_test)
# Scaling back the target. scaler_y was fitted on a single-column array, so the
# predictions are passed as one column (n_samples, 1) and flattened afterwards.
y_test_pred_gbr = scaler_y.inverse_transform(y_test_pred_gbr.reshape(-1, 1)).reshape(-1)

# --- Make submission ---
print('Generating submission.csv file...')

# The file is written with '%d', which would truncate floats toward zero;
# round to the nearest integer first so e.g. 3.9 is written as 4, not 3.
predictions = np.rint(y_test_pred_gbr).astype(np.int64)

# Timestamped file name; ':' is replaced because it is not allowed in Windows
# file names.
submission_path = 'submission_' + str(np.datetime64('now')).replace(':', '-') + '.csv'

# Write the submission file
np.savetxt(
    submission_path,
    np.rec.fromarrays([index, predictions]),
    fmt=['%d', '%d'],
    delimiter=';',
    header='index; Month 4',
    comments='',
)

# check the csv: read back the file that was just written (no shell command and
# no hard-coded file name) and show its first rows
pd.read_csv(submission_path, sep=';').head()
