In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import minmax_scale, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [27]:
import pandas as pd  # also imported in the first cell

# Load the pre-split churn data from the Datamanim GitHub repository:
# X_* files hold the features, y_* files hold CustomerId plus the Exited label.
x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv")
x_test = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv")
y_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_test.csv')
# Display the first rows of the training features.
x_train.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15799217,Zetticci,791,Germany,Female,35,7,52436.2,1,1,0,161051.75
1,15748986,Bischof,705,Germany,Male,42,8,166685.92,2,1,1,55313.51
2,15722004,Hsiung,543,France,Female,31,4,138317.94,1,0,0,61843.73
3,15780966,Pritchard,709,France,Female,32,2,0.0,2,0,0,109681.29
4,15636731,Ts'ai,714,Germany,Female,36,1,101609.01,2,1,1,447.73


In [2]:
# Display the first rows of the training labels (CustomerId + Exited).
y_train.head()

Unnamed: 0,CustomerId,Exited
0,15799217,0
1,15748986,0
2,15722004,0
3,15780966,0
4,15636731,0


In [4]:
# Column dtypes and non-null counts for the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       6499 non-null   int64  
 1   Surname          6499 non-null   object 
 2   CreditScore      6499 non-null   int64  
 3   Geography        6499 non-null   object 
 4   Gender           6499 non-null   object 
 5   Age              6499 non-null   int64  
 6   Tenure           6499 non-null   int64  
 7   Balance          6499 non-null   float64
 8   NumOfProducts    6499 non-null   int64  
 9   HasCrCard        6499 non-null   int64  
 10  IsActiveMember   6499 non-null   int64  
 11  EstimatedSalary  6499 non-null   float64
dtypes: float64(2), int64(7), object(3)
memory usage: 609.4+ KB


In [5]:
# Same summary for the test features, to compare schema and null counts with train.
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3501 entries, 0 to 3500
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       3501 non-null   int64  
 1   Surname          3501 non-null   object 
 2   CreditScore      3501 non-null   int64  
 3   Geography        3501 non-null   object 
 4   Gender           3501 non-null   object 
 5   Age              3501 non-null   int64  
 6   Tenure           3501 non-null   int64  
 7   Balance          3501 non-null   float64
 8   NumOfProducts    3501 non-null   int64  
 9   HasCrCard        3501 non-null   int64  
 10  IsActiveMember   3501 non-null   int64  
 11  EstimatedSalary  3501 non-null   float64
dtypes: float64(2), int64(7), object(3)
memory usage: 328.3+ KB


In [7]:
# Inspect the three object-typed columns: the distinct surnames and how many
# there are, then the distinct Geography and Gender labels.
print(x_train['Surname'].unique())
# dropna=False makes the count match len(unique()), which counts NaN as a value.
print(x_train['Surname'].nunique(dropna=False))

print(x_train['Geography'].unique())
print(x_train['Gender'].unique())

['Zetticci' 'Bischof' 'Hsiung' ... 'Rita' 'Coburn' 'Gilleland']
2289
['Germany' 'France' 'Spain']
['Female' 'Male' ' male' 'female']


In [11]:
def transform_gender(x):
    """Normalise a raw gender label to the canonical 'Female' / 'Male'.

    The training data contains inconsistent spellings such as 'female' and
    ' male'. Rather than special-casing only those two exact strings, the
    label is compared after stripping surrounding whitespace and lower-casing,
    so any case/whitespace variant (e.g. 'male', ' Female ', 'MALE') is mapped
    to the same canonical value.

    Parameters
    ----------
    x : object
        A single cell value from the Gender column.

    Returns
    -------
    object
        'Female' or 'Male' when ``x`` is a recognisable variant of one of
        them; otherwise ``x`` is returned unchanged (including non-string
        values such as None or NaN).
    """
    # Non-string values (e.g. missing data) have no .strip()/.lower(); pass through.
    if not isinstance(x, str):
        return x
    canonical = {'female': 'Female', 'male': 'Male'}
    # Unknown labels are returned untouched so unexpected categories stay visible.
    return canonical.get(x.strip().lower(), x)

In [12]:
# Run every Gender value of both splits through transform_gender so that
# train and test share the same label spelling before encoding.
x_train['Gender'] = x_train['Gender'].map(transform_gender)
x_test['Gender'] = x_test['Gender'].map(transform_gender)

In [13]:
# Verify which Gender labels remain in the training data after normalisation.
print(x_train['Gender'].unique())

['Female' 'Male']


In [14]:
# Remove the Surname and CustomerId columns from both feature frames so they
# are not fed to the model. Re-running this cell on already-trimmed frames
# raises KeyError, because the columns no longer exist.
x_train = x_train.drop(columns=['Surname', 'CustomerId'])
x_test = x_test.drop(columns=['Surname', 'CustomerId'])

### Label Encoder

In [16]:
# Replace the string categories with integer codes. Each column gets its own
# encoder, learned on the training split only and then reused for the test
# split so that both splits share one label -> code mapping.
cols = ['Geography', 'Gender']
for col in cols:
    le = LabelEncoder()
    x_train[col] = le.fit_transform(x_train[col])
    # transform() raises ValueError if the test split holds a label that was
    # not present in the training split.
    x_test[col] = le.transform(x_test[col])
print(x_train.head())

   CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          791          1       0   35       7   52436.20              1   
1          705          1       1   42       8  166685.92              2   
2          543          0       0   31       4  138317.94              1   
3          709          0       0   32       2       0.00              2   
4          714          1       0   36       1  101609.01              2   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               0        161051.75  
1          1               1         55313.51  
2          0               0         61843.73  
3          0               0        109681.29  
4          1               1           447.73  


In [18]:
# Check the integer codes now stored in the Geography column.
x_train.Geography.unique()

array([1, 0, 2])

### Model

In [19]:
# A fixed random_state makes the forest's bootstrap/feature sampling
# repeatable, so the search picks the same winner on every
# Restart Kernel -> Run All instead of drifting between runs.
model = RandomForestClassifier(random_state=42)

# Search space: 2 * 4 * 3 * 3 = 72 parameter combinations.
params = {'n_estimators': [10, 100],
         'max_depth': [6, 8, 10, 12],
         'min_samples_leaf':[8,12,18],
         'min_samples_split': [8,16,20]}

## Grid Search
# Each combination is fitted on 3 cross-validation folds, using all CPU cores.
# No `scoring` is passed, so candidates are ranked by the estimator's default
# score (accuracy for classifiers).
# NOTE(review): the final evaluation cell reports ROC AUC; consider
# scoring='roc_auc' here if the search should optimise the same metric.
grid_cv = GridSearchCV(model, param_grid=params, cv = 3, n_jobs=-1)
grid_cv.fit(x_train, y_train['Exited'])

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10, 12],
                         'min_samples_leaf': [8, 12, 18],
                         'min_samples_split': [8, 16, 20],
                         'n_estimators': [10, 100]})

In [20]:
# Report the winning parameter combination and its mean cross-validated score.
# The printed labels are Korean for "best hyper-parameters" and
# "best prediction accuracy".
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
최고 예측 정확도: 0.8620


In [21]:
# Build the final model from the parameters the grid search actually selected,
# rather than hand-copying them from an earlier run's printout: hard-coded
# values silently go stale whenever the search result changes.
# random_state is fixed so the fitted forest and its predictions are
# reproducible across runs.
model = RandomForestClassifier(**grid_cv.best_params_, random_state=42)
model.fit(x_train, y_train['Exited'])
# Accuracy on the training data itself: an optimistic figure, useful only as
# a sanity check against the cross-validated score.
print(model.score(x_train, y_train['Exited']))

0.8809047545776273


## Prediction

In [22]:
# Class-probability predictions for the test features: one row per test
# sample, one column per class.
prediction = model.predict_proba(x_test)
prediction

array([[0.17698449, 0.82301551],
       [0.07985196, 0.92014804],
       [0.91885147, 0.08114853],
       ...,
       [0.18506746, 0.81493254],
       [0.97616147, 0.02383853],
       [0.94895599, 0.05104401]])

In [25]:
# Second probability column, i.e. the probability of Exited == 1
# (assumes model.classes_ is [0, 1] - TODO confirm).
prediction[:,1]

array([0.82301551, 0.92014804, 0.08114853, ..., 0.81493254, 0.02383853,
       0.05104401])

In [28]:
# Write the result file: one row per test customer with the predicted
# probability taken from the second predict_proba column.
# NOTE(review): the ids come from y_test while the predictions come from
# x_test - this assumes both files list customers in the same row order; confirm.
# NOTE(review): the output header is 'CustomerID' although the source column
# is spelled 'CustomerId' - verify which spelling the consumer expects.
pd.DataFrame({'CustomerID':y_test.CustomerId, 'Exited':prediction[:,1]}).to_csv('33333.csv', index=False)


In [29]:
# Read the written file back and show its first rows as a sanity check.
df = pd.read_csv('33333.csv')
df.head()

Unnamed: 0,CustomerID,Exited
0,15601012,0.823016
1,15734762,0.920148
2,15586757,0.081149
3,15590888,0.166903
4,15726087,0.34226


### 예측결과 평가

In [32]:
# NOTE(review): this import sits mid-notebook; the other sklearn imports are
# in the first cell.
from sklearn.metrics import roc_auc_score
# ROC AUC of the predicted probabilities (second predict_proba column) against
# the true test labels, rounded to 4 decimals.
print(round(roc_auc_score(y_test['Exited'], prediction[:,1]), 4))

0.8581
