# Bibliotecas utilizadas

In [3]:
#manipular os dados 
import pandas as pd
import numpy as np

#preprocessamento
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

# separação entre train e test
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

#modelos
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

#metrica
from sklearn.metrics import root_mean_squared_error

# Importando os dados

In [4]:
df_train = pd.read_csv("train.csv")

# Recaptulando os dados 
De treinamento e vendo o de teste

In [5]:
df_train

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,,,368
550064,1006035,P00375436,F,26-35,1,C,3,0,20,,,371
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,,,137
550066,1006038,P00375436,F,55+,1,C,2,0,20,,,365


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [7]:
df_train['Product_Category_1'].value_counts().sort_index()

Product_Category_1
1     140378
2      23864
3      20213
4      11753
5     150933
6      20466
7       3721
8     113925
9        410
10      5125
11     24287
12      3947
13      5549
14      1523
15      6290
16      9828
17       578
18      3125
19      1603
20      2550
Name: count, dtype: int64

Percebemos que os dados do teste não apresentam os outliers que encontramos no de treinamento (categoria 19 e 20) que vimos quando fizemos a EDA.
Porem apresenta a mesma questão de dados faltantes no Product_Category_2 e 3.

# Tratamento

Iremos começar o tratamento, buscando lidar com os dados faltantes, Outliers e tratar os dados categorigos. 

In [8]:
df_train.drop(df_train[(df_train.Product_Category_1 == 19) | (df_train.Product_Category_1 == 20)].index,inplace=True)

In [9]:
df_train['Product_Category_1'].value_counts().sort_index()


Product_Category_1
1     140378
2      23864
3      20213
4      11753
5     150933
6      20466
7       3721
8     113925
9        410
10      5125
11     24287
12      3947
13      5549
14      1523
15      6290
16      9828
17       578
18      3125
Name: count, dtype: int64

Veremos a quantidade e a porcentagem de valores nulos perante os dados de treinamento e de teste das colunas: Product_category_2 e 3.

In [10]:
def nulos_e_porcentagem(df):
    total = df.iloc[:, 9:11].copy().isnull().sum()
    porcentagem = df.iloc[:, 9:11].isnull().sum()/df.iloc[:, 9:11].isnull().count().sort_values(ascending=False)
    dados_nulos = pd.concat([total, porcentagem], axis=1, sort=False, keys=['total', 'porcentagem'])
    dados_nulos = dados_nulos[dados_nulos['porcentagem']!=0]*100
    return dados_nulos

In [11]:
train = nulos_e_porcentagem(df_train)
train

Unnamed: 0,total,porcentagem
Product_Category_2,16948500,31.046042
Product_Category_3,37909400,69.441946


Notasse que a porcentagem perante o dados de treinamento e teste, apresentam uma valor aproximado.
E percebe-se que O product_Category_3 tem uma quantia majoritaria de valores nulos, e a product_Category tem uma quantia consideravel, logo que sabemos que os produtos 
podem ter mutiplas categorias, então as tiraremos

In [12]:
df_train.drop(['Product_Category_3','Product_Category_2', 'User_ID'], axis=1, inplace=True)

Lidaremos com a coluna <age> que se encontra como object os substituindos pela media.

In [13]:
media_idade = {'0-17':17, '18-25':21, '26-35':30, '36-45':40, '46-50':48, '51-55':53, '55+':60}
df_train["Age"] = df_train["Age"].apply(lambda x: media_idade[x])

In [14]:
df_train['Stay_In_Current_City_Years'] = df_train['Stay_In_Current_City_Years'].replace('4+',5,).astype(int)

Adicionando coluna referente a quantidade de produtos e retirando o Product_id

In [15]:
df_train['Product_Count'] = df_train.Product_ID.groupby(df_train.Product_ID).transform('count')

In [16]:
df_train.drop(['Product_ID'],axis=1, inplace=True)

Separando em trainamento e test para minimizar chances de data leakage, e utilizando pipeline para StandartScaler e OneHotEncoder 

In [17]:
X = df_train.drop(['Purchase'],axis=1)
y = df_train['Purchase']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((382140, 8), (163775, 8), (382140,), (163775,))

In [20]:
X_train 

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Count
118649,M,21,4,B,5,0,5,816
251664,M,30,0,C,1,1,1,466
295426,M,21,4,C,2,0,8,1249
269977,M,40,7,C,5,0,3,594
48995,M,53,11,C,3,1,1,1281
...,...,...,...,...,...,...,...,...
110268,M,48,17,C,2,0,5,73
259178,M,30,14,C,2,1,5,370
365838,F,21,1,B,1,0,5,652
131932,F,21,4,B,2,0,8,461


In [21]:
numeric_columns = df_train.select_dtypes(exclude='object').drop(columns=['Purchase'], errors='ignore').columns
numeric_columns

Index(['Age', 'Occupation', 'Stay_In_Current_City_Years', 'Marital_Status',
       'Product_Category_1', 'Product_Count'],
      dtype='object')

In [None]:
#Para os dados numericos 
numeric_transform = Pipeline(steps=[
    ('scaler', StandardScaler())
])

#Para os dados categoricos
categoric_transform = Pipeline(steps=[
    ('hot_encoding', OneHotEncoder(handle_unknown='ignore'))
])

#Para transformar as colunas a partir dos passoas acima
preprocessing = ColumnTransformer(transformers=[
    ('num',numeric_transform, numeric_columns),
    ('cat',categoric_transform, ['Gender', 'City_Category'])    
])

# Utilizando 
X_train_transformed = preprocessing.fit_transform(X_train)
X_test_transformed = preprocessing.transform(X_test)

# Modelagem


In [24]:
y_grand = GradientBoostingRegressor().fit(X_train_transformed, y_train)
y_linear = LinearRegression().fit(X_train_transformed, y_train)

In [None]:
modelo_xbg =  XGBRegressor()

In [35]:
modelo_xbg.fit(X_train_transformed, y_train)

In [36]:
y_xgbost_predict = modelo_xbg.predict(X_test_transformed)

In [28]:
y_grand_predict = y_grand.predict(X_test_transformed)
y_linear_predict = y_linear.predict(X_test_transformed)

In [37]:
print(root_mean_squared_error(y_grand_predict, y_test)), print(root_mean_squared_error(y_linear_predict, y_test)), print(root_mean_squared_error(y_xgbost_predict, y_test))

2868.4028518064347
4540.989098278296
2705.6498155353343


(None, None, None)

# Verificando os parametros

In [None]:
#selecionando parametros 
params = {
    'n_estimators' : [100,200,300,400], 
    'max_depth' : [7,9],
    'min_child_weight': [7,9], 
    'gamma': [0.1, 0.3]
}

modelo = XGBRegressor(objective ='reg:squarederror', n_jobs=4)

# Definindo k
kfold = KFold(3, shuffle=True, random_state = 42)

# Testando a combinação de parâmetros
grid = RandomizedSearchCV(modelo, params, n_iter=30, cv=kfold, scoring='neg_mean_squared_error', n_jobs=-1)
grid_result = grid.fit(X_train_transformed, y_train)

# Print do resultado
print("Grid scores on development set:")
means = grid.cv_results_['mean_test_score'].round(5)
stds = grid.cv_results_['std_test_score'].round(5)

for mean, std, params in zip((means), stds, grid.cv_results_['params']):
    print(f'mean:{mean},std:{std},params:{params}')
print()
print(f'Melhor parâmetro:{grid.best_params_}, Score:{grid.best_score_}')

'params = {\n    \'n_estimators\' : [100,200,300,400], \n    \'max_depth\' : [7,9],\n    \'min_child_weight\': [7,9], \n    \'gamma\': [0.1, 0.3]\n}\n\nmodelo = XGBRegressor(objective =\'reg:squarederror\', n_jobs=4)\n\n# Definindo k\nkfold = KFold(3, shuffle=True, random_state = 42)\n\n# Testando a combinação de parâmetros\ngrid = RandomizedSearchCV(modelo, params, n_iter=30, cv=kfold, scoring=\'neg_mean_squared_error\', n_jobs=-1)\ngrid_result = grid.fit(X_train_transformed, y_train)\n\n# Print do resultado\nprint("Grid scores on development set:")\nmeans = grid.cv_results_[\'mean_test_score\'].round(5)\nstds = grid.cv_results_[\'std_test_score\'].round(5)\n\nfor mean, std, params in zip((means), stds, grid.cv_results_[\'params\']):\n    print(f\'mean:{mean},std:{std},params:{params}\')\nprint()\nprint(f\'Melhor parâmetro:{grid.best_params_}, Score:{grid.best_score_}\') '

# Modelo final

In [38]:
modelo_xbg =  XGBRegressor( objective ='reg:squarederror',n_estimators= 200, min_child_weight= 9, max_depth= 7, gamma= 0.3)
modelo_xbg.fit(X_train_transformed, y_train)

In [39]:
y_xgbost_predict = modelo_xbg.predict(X_test_transformed)
print(root_mean_squared_error(y_xgbost_predict, y_test))

2653.1015380529184
