# Bibliotecas utilizadas

In [39]:
import pandas as pd

#preprocessamento
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

# seleção do modelo
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV


#modelos
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

#metrica
from sklearn.metrics import root_mean_squared_error

# Importando os dados

In [2]:
df_train = pd.read_csv("train.csv")

# Recaptulando os dados 
De treinamento e vendo o de teste

In [3]:
df_train

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,,,368
550064,1006035,P00375436,F,26-35,1,C,3,0,20,,,371
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,,,137
550066,1006038,P00375436,F,55+,1,C,2,0,20,,,365


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [5]:
df_train['Product_Category_1'].value_counts().sort_index()

Product_Category_1
1     140378
2      23864
3      20213
4      11753
5     150933
6      20466
7       3721
8     113925
9        410
10      5125
11     24287
12      3947
13      5549
14      1523
15      6290
16      9828
17       578
18      3125
19      1603
20      2550
Name: count, dtype: int64

Percebemos que os dados do teste não apresentam os outliers que encontramos no de treinamento (categoria 19 e 20) que vimos quando fizemos a EDA.
Porem apresenta a mesma questão de dados faltantes no Product_Category_2 e 3.

# Tratamento

Iremos começar o tratamento, buscando lidar com os dados faltantes, Outliers e tratar os dados categorigos. 

In [6]:
df_train.drop(df_train[(df_train.Product_Category_1 == 19) | (df_train.Product_Category_1 == 20)].index,inplace=True)

In [7]:
df_train['Product_Category_1'].value_counts().sort_index()


Product_Category_1
1     140378
2      23864
3      20213
4      11753
5     150933
6      20466
7       3721
8     113925
9        410
10      5125
11     24287
12      3947
13      5549
14      1523
15      6290
16      9828
17       578
18      3125
Name: count, dtype: int64

Veremos a quantidade e a porcentagem de valores nulos perante os dados de treinamento e de teste das colunas: Product_category_2 e 3.

In [8]:
def nulos_e_porcentagem(df):
    total = df.iloc[:, 9:11].copy().isnull().sum()
    porcentagem = df.iloc[:, 9:11].isnull().sum()/df.iloc[:, 9:11].isnull().count().sort_values(ascending=False)
    dados_nulos = pd.concat([total, porcentagem], axis=1, sort=False, keys=['total', 'porcentagem'])
    dados_nulos = dados_nulos[dados_nulos['porcentagem']!=0]*100
    return dados_nulos

In [9]:
train = nulos_e_porcentagem(df_train)
train

Unnamed: 0,total,porcentagem
Product_Category_2,16948500,31.046042
Product_Category_3,37909400,69.441946


Notasse que a porcentagem perante o dados de treinamento e teste, apresentam uma valor aproximado.
E percebe-se que O product_Category_3 tem uma quantia majoritaria de valores nulos, logo que sabemos que os produtos 
podem ter mutiplas categorias, e a coluna 

In [10]:
df_train.drop(['Product_Category_3','Product_Category_2', 'User_ID'], axis=1, inplace=True)

Lidaremos com a coluna <age> que se encontra como object os substituindos pela media.

In [11]:
media_idade = {'0-17':17, '18-25':21, '26-35':30, '36-45':40, '46-50':48, '51-55':53, '55+':60}
df_train["Age"] = df_train["Age"].apply(lambda x: media_idade[x])

In [12]:
df_train['Stay_In_Current_City_Years'] = df_train['Stay_In_Current_City_Years'].replace('4+',5,).astype(int)

Agora iremos 

In [13]:
df_train['Product_Count'] = df_train.Product_ID.groupby(df_train.Product_ID).transform('count')

In [14]:
df_train.drop(['Product_ID'],axis=1, inplace=True)

In [15]:
df_train

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Purchase,Product_Count
0,F,17,10,A,2,0,3,8370,227
1,F,17,10,A,2,0,1,15200,581
2,F,17,10,A,2,0,12,1422,102
3,F,17,10,A,2,0,12,1057,341
4,M,60,16,C,5,0,8,7969,203
...,...,...,...,...,...,...,...,...,...
545910,M,30,6,B,2,0,8,9855,125
545911,M,30,6,B,2,0,5,1962,184
545912,M,30,6,B,2,0,8,7852,187
545913,M,30,6,B,2,0,5,7159,905


In [16]:
X = df_train.drop(['Purchase'],axis=1)
y = df_train['Purchase']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((382140, 8), (163775, 8), (382140,), (163775,))

In [23]:
X_train 

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Count
118649,M,21,4,B,5,0,5,816
251664,M,30,0,C,1,1,1,466
295426,M,21,4,C,2,0,8,1249
269977,M,40,7,C,5,0,3,594
48995,M,53,11,C,3,1,1,1281
...,...,...,...,...,...,...,...,...
110268,M,48,17,C,2,0,5,73
259178,M,30,14,C,2,1,5,370
365838,F,21,1,B,1,0,5,652
131932,F,21,4,B,2,0,8,461


In [35]:
numeric_columns = df_train.select_dtypes(exclude='object').drop(columns=['Purchase'], errors='ignore').columns
numeric_columns

Index(['Age', 'Occupation', 'Stay_In_Current_City_Years', 'Marital_Status',
       'Product_Category_1', 'Product_Count'],
      dtype='object')

In [37]:
numeric_transform = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categoric_transform = Pipeline(steps=[
    ('hot_encoding', OneHotEncoder(handle_unknown='ignore'))
])

preprocessing = ColumnTransformer(transformers=[
    ('num',numeric_transform, numeric_columns),
    ('cat',categoric_transform, ['Gender', 'City_Category'])    
])

X_train_transformed = preprocessing.fit_transform(X_train)
X_test_transformed = preprocessing.transform(X_test)

In [None]:
modelos = []
modelos.append(('Linear',Pipeline(steps=[('linear', LinearRegression())])))
modelos.append(('Xgboost', Pipeline(steps=[('xgb', XGBRegressor())])))
modelo.append(('Gradiend', Pipeline(steps=[('Grad', GradientBoostingRegressor())])))

array([[-1.21299278, -0.62370757,  1.90593327, ...,  0.        ,
         1.        ,  0.        ],
       [-0.39162129, -1.23736181, -0.64742955, ...,  0.        ,
         0.        ,  1.        ],
       [-1.21299278, -0.62370757, -0.00908885, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.21299278, -1.08394825, -0.64742955, ...,  0.        ,
         1.        ,  0.        ],
       [-1.21299278, -0.62370757, -0.00908885, ...,  0.        ,
         1.        ,  0.        ],
       [-1.21299278, -0.93053469, -0.00908885, ...,  1.        ,
         0.        ,  0.        ]], shape=(382140, 11))