# Set up

### Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scikitplot as skplt
import math
import lightgbm as lgbm
import os

from dython import nominal
from sklearn                 import model_selection  as ms
from matplotlib              import pyplot as plt
from sklearn                 import ensemble         as en
from sklearn                 import neighbors        as nh
from sklearn                 import linear_model     as lm
import scikitplot as skplt


# from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score, ndcg_score, classification_report
import lightgbm as lgbm
import lightgbm as lgb
from sklearn.tree import export_graphviz
# import graphviz
from IPython.display         import HTML
from skopt                   import BayesSearchCV
from sklearn.compose         import make_column_transformer, ColumnTransformer
from sklearn.preprocessing   import RobustScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from tabulate                import tabulate
# from boruta                  import BorutaPy
from sklearn.ensemble        import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
# from sklearn                 import model_selection, metrics
from xgboost                 import XGBClassifier
from sklearn.metrics         import make_scorer, accuracy_score
# from sklearn.inspection      import permutation_importance
# from imblearn.pipeline       import Pipeline, make_pipeline
# from category_encoders       import TargetEncoder
# from sklearn.feature_selection import SelectFromModel, RFE
# from sklearn.base           import BaseEstimator, TransformerMixin


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV



### Functions

In [2]:



def ndcg_score(y_true, y_pred, k=None):
    # y_true: lista de relevâncias binárias (1 para relevante, 0 para não relevante)
    # y_pred: lista de pontuações preditas pelo modelo
    # k: número de elementos para considerar no cálculo do NDCG (se for None, usa o tamanho total)

    if k is None:
        k = len(y_true)

    # Ordena as listas de relevâncias e predições pelo valor predito (do maior para o menor)
    sorted_indices = np.argsort(y_pred)[::-1]
    y_true_sorted = np.take(y_true, sorted_indices)[:k]

    # Calcula o DCG (Discounted Cumulative Gain)
    dcg = np.sum(y_true_sorted / np.log2(np.arange(2, k + 2)))

    # Ordena as listas de relevâncias idealmente (todas relevâncias relevantes primeiro)
    ideal_sorted_indices = np.argsort(y_true)[::-1]
    ideal_sorted = np.take(y_true, ideal_sorted_indices)[:k]

    # Calcula o IDCG (Ideal Discounted Cumulative Gain)
    idcg = np.sum(ideal_sorted / np.log2(np.arange(2, k + 2)))

    # Calcula o NDCG (Normalized Discounted Cumulative Gain)
    ndcg_scorer = dcg / idcg if idcg > 0 else 0.0

    return ndcg_scorer

def jupyter_settings():
    %matplotlib inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    sns.set()

In [3]:
def preprocessing_cv(X_data, Y_data):  
    
    df_data = pd.concat([X_data, Y_data], axis=1) # juntar as variaveis preditoras com a variavel alvo

    le = LabelEncoder()
    rs = RobustScaler()
    mms = MinMaxScaler()

    df_data['Carrier delay in minutes'] = rs.fit_transform(df_data[['Carrier delay in minutes']].values)
    df_data['Gender'] = rs.fit_transform(df_data[['Gender']].values)
    df_data['sum_delay_in_minute'] = rs.fit_transform(df_data[['sum_delay_in_minute']].values)
    df_data['sub_delay_in_minute'] = rs.fit_transform(df_data[['sub_delay_in_minute']].values)
    df_data['delivery_delay_in_hour'] = rs.fit_transform(df_data[['delivery_delay_in_hour']].values)
    df_data['carrier_delay_in_hour'] = rs.fit_transform(df_data[['carrier_delay_in_hour']].values)
    df_data['carrier_delay_in_min'] = le.fit_transform(df_data['carrier_delay_in_min'])
    df_data['delivery_delay_in_min'] = le.fit_transform(df_data['delivery_delay_in_min'])
    df_data['Gender'] = le.fit_transform(df_data['Gender'])
    df_data['Customer Type'] = le.fit_transform(df_data['Customer Type'])
    df_data['Type of Purchase'] = le.fit_transform(df_data['Type of Purchase'])
    df_data['Store size'] = le.fit_transform(df_data['Store size'])
    df_data['carrier_delay'] = le.fit_transform(df_data['carrier_delay'])
    df_data['delivery_delay'] = le.fit_transform(df_data['delivery_delay'])
    df_data['Satisfaction'] = le.fit_transform(df_data['Satisfaction'])



    # Feature selection
    cols_selected = ['Type of Purchase', 'instore_wifi', 'dressing_room', 'Store size', 'Customer Type', 
                 'waiting_room', 'kids_entertainment','easy_of_online_shopping', 'showroom', 'seller_service', 'cleanliness', 'self_store' ]

    X_data = df_data[cols_selected]
    Y_data = df_data['Satisfaction']

    return X_data, Y_data



def precision_at_k(data, k=2000):
    # reset index
    data = data.reset_index(drop = True)

    # create ranking order
    data['ranking'] = data.index + 1

    data['precision_at_k'] = data['Satisfaction'].cumsum() / data['ranking']

    return data.loc[k, 'precision_at_k']
    
# def recall_at_k(data, k=2000):

#     # reset index
#     data = data.reset_index(drop = True)

#     # create ranking order
#     data['ranking'] = data.index + 1

#     data['recall_at_k'] = data['statisfaction'].cumsum() / data['statisfaction'].sum()

#     return data.loc[k, 'recall_at_k']

In [4]:
# def preprocessing_cv(X_data, Y_data):  
    
#     df_data = pd.concat([X_data, Y_data], axis=1) # juntar as variaveis preditoras com a variavel alvo
    
#     df_data['vintAge'] = mms_vintAge.transform(df_data[['vintAge']].values)
#     df_train['vintAge'] = mms_vintAge.fit_transform(df_train[['vintAge']].values)
    
#     df_data['annual_premium'] = ss.transform(df_data[['annual_premium']].values)
    
#     df_data['Age'] = mms.transform(df_data[['Age']].values)
#     df_train['Age'] = mms_Age.fit_transform(df_train[['Age']].values)
    
#     df_data['region_code'] = df_data['region_code'].astype(str)
#     df_data['region_code'] = target_encode_region.transform(df_data['region_code'])
    
#     df_data['vehicle_damAge'] = df_data['vehicle_damAge'].map(damAge_dict)
    
#     df_data['policy_sales_channel'] = df_data['policy_sales_channel'].map(fe_policy_sales_channel)
    
#     df_data['Gender'] = target_encode_Gender.transform(df_data['Gender'])

#     df_data = pd.get_dummies(df_data, prefix='vehicle_Age', columns=['vehicle_Age'])

#     df_data['Age_group'] = df_data['Age_group'].map(Age_group_dict)
#     df_data = df_data.fillna(0)
    
#     # Feature selection
#     cols_selected = ['vintAge', 'annual_premium', 'Age', 'region_code', 'vehicle_damAge', 
#                  'policy_sales_channel', 'previously_insured']
    
#     X_data = df_data[cols_selected]
#     Y_data = df_data['response']

In [5]:
jupyter_settings()

### Import Data

In [6]:
df_train_raw = pd.read_csv('train.csv', low_memory=False)
df_test_raw = pd.read_csv('test.csv', low_memory=False)


In [7]:
df_test_raw.head(2)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Purchase,Store size,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,Store location,Toilet cleaning,Dressing room,Waiting room,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes
0,19556,Female,Loyal Customer,52,Gift,Medium,160,5,4,3,4,3,4,3,5,5,5,5,2,5,5,50,44.0
1,90035,Female,Loyal Customer,36,Gift,Large,2863,1,1,3,1,5,4,5,4,4,4,4,3,4,5,0,0.0


In [8]:
df_train_raw.head(2)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Purchase,Store size,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,Store location,Toilet cleaning,Dressing room,Waiting room,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes,Satisfaction
0,70172,Male,Loyal Customer,13,Personal,Small,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,Neutral or Dissatisfaction
1,5047,Male,disloyal Customer,25,Gift,Large,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,Neutral or Dissatisfaction


## ETL

### Data Raw status

In [9]:
df1 = df_train_raw.copy()
dfteste = df_test_raw.copy()

#### Dados

In [10]:
df1.isna().sum()

id                              0
Gender                          0
Customer Type                   0
Age                             0
Type of Purchase                0
Store size                      0
Store distance                  0
InStore wifi                    0
Open/Close time convenient      0
Easy of online shopping         0
Store location                  0
Toilet cleaning                 0
Dressing room                   0
Waiting room                    0
Kids entertainment              0
Seller service                  0
Showroom                        0
Self-Store                      0
Purchase service                0
Store Service                   0
Cleanliness                     0
Carrier delay in minutes        0
Delivery delay in minutes     310
Satisfaction                    0
dtype: int64

In [11]:
def data_clean(data):
    data = data[data['Age'] >= 15]

    # Verificando numero de linhas sem NAN
    data_clean = data[~data['Delivery delay in minutes'].isna()]

    return data_clean

In [12]:
df2 = data_clean(df1)
df2test = data_clean(dfteste)

# Descriptive Statistical

### Data Dimensions

In [13]:
print(f'Number of Rows {df2.shape[0]}')
print(f'Number of Columns {df2.shape[1]}')

Number of Rows 98386
Number of Columns 24


# Data Preparation

In [14]:
df4 = df2.copy()

### Split

In [15]:
df2test.shape[0]

24668

In [16]:
df2.shape[0]

98386

In [17]:
# df2test - Teste ver se o valor ta certo no final, compara com o treino e validação
# df2 - Separar em treino e validação.

X = df4.drop(['Satisfaction'], axis=1)
y = df4[['id', 'Satisfaction' ]]

X_test = df2test
y_test = df2test['id']

In [18]:
# Treino(80%) -> validação(20%)
x_train, x_validation, y_train, y_validation = ms.train_test_split(X, y, test_size=0.20) 
df_train = pd.concat([x_train, y_train], axis=1) # juntar as variaveis preditoras com a variavel alvo

### Tranformations

In [19]:
# MinMaxscaler - Age, Carrier delay in minutes, Gender,   sum_delay_in_minute ,sub_delay_in_minute, delivery_delay_in_hour, carrier_delay_in_hour

mms = MinMaxScaler()

df_train['Age'] = mms.fit_transform(df_train[['Age']].values)

In [20]:
df_train.head(1)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Purchase,Store size,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,Store location,Toilet cleaning,Dressing room,Waiting room,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes,id.1,Satisfaction
11518,38321,Male,Loyal Customer,0.5,Gift,Large,2334,1,1,1,1,3,5,4,4,4,4,4,5,4,3,0,12.0,38321,Satisfied


In [21]:
# RobustScaler - Age, Carrier delay in minutes Gender,    sum_delay_in_minute sub_delay_in_minute delivery_delay_in_hour carrier_delay_in_hour

rs = RobustScaler()

df_train['Carrier delay in minutes'] = rs.fit_transform(df_train[['Carrier delay in minutes']].values)
df_train['Delivery delay in minutes'] = rs.fit_transform(df_train[['Delivery delay in minutes']].values)
# df_train['Gender'] = rs.fit_transform(df_train[['Gender']].values)




In [22]:
# Label encoder - Gender, Customer Type, Type of Purchase, Store size, carrier_delay, delivery_delay, carrier_delay_in_min, delivery_delay_in_min

# Crie uma instância do LabelEncoder
le = LabelEncoder()

df_train['Gender'] = le.fit_transform(df_train['Gender'])
df_train['Customer Type'] = le.fit_transform(df_train['Customer Type'])
df_train['Type of Purchase'] = le.fit_transform(df_train['Type of Purchase'])
df_train['Store size'] = le.fit_transform(df_train['Store size'])
df_train['Satisfaction'] = le.fit_transform(df_train['Satisfaction'])

y_train['Satisfaction'] = le.fit_transform(y_train['Satisfaction'])



#### Data Preparation - Validação

In [23]:

rs = RobustScaler()
le = LabelEncoder()
mms = MinMaxScaler()

x_validation['Age'] = mms.fit_transform(x_validation[['Age']].values)

x_validation['Carrier delay in minutes'] = rs.fit_transform(x_validation[['Carrier delay in minutes']].values)
x_validation['Delivery delay in minutes'] = rs.fit_transform(x_validation[['Delivery delay in minutes']].values)

x_validation['Gender'] = le.fit_transform(x_validation['Gender'])
x_validation['Customer Type'] = le.fit_transform(x_validation['Customer Type'])
x_validation['Type of Purchase'] = le.fit_transform(x_validation['Type of Purchase'])
x_validation['Store size'] = le.fit_transform(x_validation['Store size'])






In [24]:
X_test.head(2)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Purchase,Store size,Store distance,InStore wifi,Open/Close time convenient,Easy of online shopping,Store location,Toilet cleaning,Dressing room,Waiting room,Kids entertainment,Seller service,Showroom,Self-Store,Purchase service,Store Service,Cleanliness,Carrier delay in minutes,Delivery delay in minutes
0,19556,Female,Loyal Customer,52,Gift,Medium,160,5,4,3,4,3,4,3,5,5,5,5,2,5,5,50,44.0
1,90035,Female,Loyal Customer,36,Gift,Large,2863,1,1,3,1,5,4,5,4,4,4,4,3,4,5,0,0.0


In [25]:
X_test['Age'] = mms.fit_transform(X_test[['Age']].values)

X_test['Carrier delay in minutes'] = rs.fit_transform(X_test[['Carrier delay in minutes']].values)
X_test['Delivery delay in minutes'] = rs.fit_transform(X_test[['Delivery delay in minutes']].values)


X_test['Gender'] = le.fit_transform(X_test['Gender'])
X_test['Customer Type'] = le.fit_transform(X_test['Customer Type'])
X_test['Type of Purchase'] = le.fit_transform(X_test['Type of Purchase'])
X_test['Store size'] = le.fit_transform(X_test['Store size'])


### Features selection

In [26]:
# model definition
forest = en.ExtraTreesClassifier(n_estimators=250, random_state=0, n_jobs=-1)

# data preparation
x_train_n = df_train.drop(['Satisfaction'], axis=1)
y_train_n = y_train.drop(['id'], axis=1) # Usando a variável dependente codificada

forest.fit(x_train_n, y_train_n)


  forest.fit(x_train_n, y_train_n)


In [27]:
importances = forest.feature_importances_

std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)

indices = np.argsort(importances)[::-1]

# print the feature ranking

print('Feature ranking')

df = pd.DataFrame()

for i, j in zip(x_train_n, forest.feature_importances_):
    
    aux = pd.DataFrame({'feature': i, 'importance': j}, index=[0])
    
    df = pd.concat([df, aux], axis=0)
    
print(df.sort_values('importance', ascending=False))

# # plot the impurity-based feature importances of the forest
# plt.figure()
# plt.title('Feature importances')
# plt.bar(range(x_train_n.shape[1]), importances[indices], color='r', yerr=std[indices], align='center')
# plt.xticks(range(x_train_n.shape[1]), indices)
# plt.xlim([-1, x_train_n.shape[1]])
# plt.show()

Feature ranking
                      feature  importance
0            Type of Purchase    0.133047
0                InStore wifi    0.125149
0               Dressing room    0.111891
0                  Store size    0.080935
0               Customer Type    0.056420
0          Kids entertainment    0.053080
0                Waiting room    0.052639
0     Easy of online shopping    0.040619
0              Seller service    0.036652
0                   Showroom     0.036506
0                 Cleanliness    0.033600
0               Store Service    0.030992
0                  Self-Store    0.029425
0            Purchase service    0.029163
0              Store distance    0.023956
0  Open/Close time convenient    0.018722
0                         Age    0.017890
0                          id    0.016903
0              Store location    0.016747
0                          id    0.016564
0             Toilet cleaning    0.013997
0   Delivery delay in minutes    0.009424
0    Carrier delay

# Machine Learning

In [28]:
cols_selected = ['Type of Purchase', 'InStore wifi', 'Dressing room', 'Store size', 'Customer Type', 
                 'Waiting room', 'Kids entertainment','Easy of online shopping', 'Showroom ', 'Seller service', 'Cleanliness', 'Self-Store' ]# ExtraTreesClassifier / LGB

# cols_selected = ['Type of Purchase', 'instore_wifi', 'dressing_room', 'Store size', 'Customer Type', 
#                  'waiting_room', 'kids_entertainment','easy_of_online_shopping', 'showroom', 'seller_service', 'cleanliness', 'self_store', 'store_service', 'purchase_service', 'store_distance', 'Age', 'open/close_time_convenient',  'store_location', 'toilet_cleaning'] # ExtraTreesClassifier com  mais colunas 



x_train = df_train[cols_selected]

df_val = pd.concat([x_validation, y_validation], axis=1)
x_val = df_val[cols_selected]
y_val = y_validation.values

In [29]:
y_val

array([[25489, 'Satisfied'],
       [92623, 'Satisfied'],
       [9789, 'Neutral or Dissatisfaction'],
       ...,
       [114242, 'Satisfied'],
       [92301, 'Satisfied'],
       [78740, 'Satisfied']], dtype=object)

In [30]:
# # KNN

# # model definition
# knn_model = nh.KNeighborsClassifier(n_neighbors = 7)

# # model training
# knn_model.fit(x_train, y_train)

# #model prediction - PODER DE GENERALIZAÇÃO
# yhat_knn = knn_model.predict_proba(x_val)

# yhat_knn_pred = knn_model.predict(x_val)

# accuracy_knn = accuracy_score(y_val, yhat_knn_pred)
# accuracy_knn

In [31]:
# # Logistic Regression

# # model definition
# lr_model = lm.LogisticRegression( random_state = 42)

# # model training 
# lr_model.fit(x_train, y_train)

# # model prediction
# yhat_lr = lr_model.predict_proba(x_val)

# yhat_lr_pred = lr_model.predict(x_val)

# accuracy_lr = accuracy_score(y_val, yhat_knn_pred)
# accuracy_lr

In [33]:
# Random Forest

# model definition
rf_model = en.RandomForestClassifier(n_estimators=250, n_jobs=-1, random_state=42)

# model training
rf_model.fit(x_train, y_train)

# model prediction
yhat_rf = rf_model.predict_proba(x_val)

yhat_rf_pred = rf_model.predict(x_val)

accuracy_rf = accuracy_score(y_val, yhat_rf_pred)
accuracy_rf


MemoryError: could not allocate 5037312 bytes

: 

In [None]:
#LGBM

# Crie o modelo LGBMClassifier e ajuste-o aos dados de treinamento
model_lgbm = lgbm.LGBMClassifier(n_jobs=-1, random_state=42, class_weight='balanced', n_estimators=500)
model_lgbm.fit(x_train, y_train)

# Faça as previsões usando o conjunto de validação
y_predicted_proba_lgbm = model_lgbm.predict_proba(x_val)
y_predicted_lgbm = model_lgbm.predict(x_val)

# Calculando o Kappa Score
kappa_score_lgbm = cohen_kappa_score(y_val, y_predicted_lgbm)

# Imprimir o Kappa Score e o Relatório de Classificação
print("Kappa Score:", kappa_score_lgbm)


Kappa Score: 0.9099432488998977


In [None]:
# # XGBClassifier

# model_xgb = XGBClassifier( n_jobs=-1,  random_state=42, max_delta_step=4, eval_metric='ndcg', objective='multi:softprob' ).fit(x_train, y_train)
# y_predicted_proba_xgb = model_xgb.predict_proba(x_val)
# y_predicted_xgb = model_xgb.predict(x_val)

# y_test_matrix = np.zeros(shape=(x_val.shape[0], y_val.unique().shape[0] ) )
# y_test_matrix[np.arange(x_val.shape[0]), np.array(y_val)] = 1
                
# y_predicted_matrix = np.zeros(shape=(x_val.shape[0], y_val.unique().shape[0]) )
# y_predicted_matrix[np.arange(x_val.shape[0]),y_predicted_xgb] = 1

# ndcg_score_xgb = ndgc(y_true=y_test_matrix,y_score=y_predicted_matrix, k=5) 
# kappa_score_xgb = cohen_kappa_score(y_val, y_predicted_xgb)
# print(ndcg_score_xgb)
# print(kappa_score_xgb)
# print(classification_report(y_val,y_predicted_xgb))

### Cros-validation

# Hyperparameter tunning

In [None]:
cols_selected = ['Type of Purchase', 'instore_wifi', 'dressing_room', 'Store size', 'Customer Type', 
                 'waiting_room', 'kids_entertainment','easy_of_online_shopping', 'showroom', 'seller_service', 'cleanliness', 'self_store' ]

x_val = df_val[cols_selected]
y_val = y_validation.values

In [None]:
param_grid = {
    'n_estimators': [1000,1500, 1700, 2500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [None]:
grid_search = GridSearchCV(en.RandomForestClassifier(),
                            param_grid=param_grid)

grid_search.fit(x_val, y_val)

print(grid_search.best_estimator_)

RandomForestClassifier(max_depth=9, max_leaf_nodes=9, n_estimators=1700)


In [None]:
x_test.head(2)

Unnamed: 0,gender,customer_type,age,type_of_purchase,store_size,store_distance,instore_wifi,open/close_time_convenient,easy_of_online_shopping,store_location,toilet_cleaning,dressing_room,waiting_room,kids_entertainment,seller_service,showroom,self_store,purchase_service,store_service,cleanliness,carrier_delay_in_minutes,delivery_delay_in_minutes,carrier_delay,delivery_delay,carrier_delay_in_min,delivery_delay_in_min,sum_delay_in_minute,sub_delay_in_minute,carrier_delay_in_hour,delivery_delay_in_hour
85520,Male,disloyal Customer,25,Gift,Large,883,4,5,3,4,3,3,3,3,3,2,5,4,5,3,5,0,yes,no,<10,0,5,0,0.083333,0.0
30897,Female,Loyal Customer,22,Gift,Medium,399,4,1,1,1,4,4,4,4,2,5,2,5,4,4,0,0,no,no,0,0,0,0,0.0,0.0


In [None]:
x_test, y_test = preprocessing_cv(x_test, y_test)

In [None]:
X_train, Y_train = preprocessing_cv(X_train, Y_train)

In [None]:
# model definition
rf_model_final = en.RandomForestClassifier(n_estimators=1700, n_jobs=-1, random_state=42, 
                                     max_depth=9, max_leaf_nodes=9)

# model training
rf_model_final.fit(X_train, Y_train)

# model prediction
yhat_rf = rf_model_final.predict_proba(x_test)

yhat_rf_pred = rf_model_final.predict(x_test)

In [None]:
# copy data
df_final = x_test.copy()
df_final['Satisfaction'] = y_test.copy()

# propensity score
df_final['score'] = yhat_rf[:,1].tolist()

# sorte clients by propensity score
df_final = df_final.sort_values('score', ascending= False)

# compute precision at k
precision = precision_at_k(df_final, k=1700)

# accuracy
accuracy = accuracy_score(y_test, yhat_rf_pred)
accuracy


NameError: name 'yhat_rf' is not defined

# Kaggle

In [None]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Seus dados de previsão - substitua esses dados pelos seus resultados reais
dados_previstos = yhat_rf_pred

# Crie um DataFrame a partir dos dados previstos
df_previstos = pd.DataFrame(dados_previstos)

# Salve o DataFrame em um arquivo CSV
df_previstos.to_csv('datawarrior_1.csv', index=False)

In [None]:
pred = pd.DataFrame(yhat_rf_pred) 

NameError: name 'yhat_rf_pred' is not defined

In [None]:
y_test

85520     1
30897     1
89820     1
81761     1
53875     0
51452     0
45132     1
21143     0
82456     1
58194     1
40631     0
33367     1
102546    0
78882     0
56487     0
41775     1
3057      0
11573     1
100574    0
33564     1
41026     0
83231     0
5688      1
31016     0
72389     0
57401     1
24490     1
44549     0
8875      0
20691     1
34609     1
3796      1
13550     0
43676     1
67530     0
68521     1
80242     1
89972     1
19567     0
55192     0
22747     0
24424     0
53926     1
51631     0
33188     1
23086     1
11517     0
44023     1
57488     1
63154     1
49146     1
81959     1
29251     1
52801     1
97797     0
78690     1
68093     0
19457     0
79774     1
66344     1
12326     0
55        1
26862     1
50689     0
34904     0
40592     0
72521     1
95359     1
31592     0
14631     0
6597      0
94113     0
15794     0
78414     1
51683     1
72543     1
15789     0
18257     1
38783     1
95164     1
77733     0
92924     0
77499     0
5227

In [None]:

df_previstos.head(3)


Unnamed: 0,satisfaction
85520,1
30897,1
89820,1
