In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

churn = pd.read_csv('Churn_Modelling.csv', index_col=0) # 파일 읽기

In [2]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB


In [3]:
print(churn.shape)

(10000, 13)


In [4]:
churn.describe()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [5]:
churn.hist(figsize=(20,15))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000267E033E888>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000267E06FC208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000267E0732F88>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000267E07700C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000267E07A8208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000267E07DF308>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000267E0819408>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000267E0851548>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000267E085D148>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000267E0895348>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000267E08F8808>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000267E0930A08>]],
      dtype=object)

In [6]:
corr_matrix = churn.corr()
corr_matrix["Exited"].sort_values(ascending=False)

Exited             1.000000
Age                0.285323
Balance            0.118533
EstimatedSalary    0.012097
CustomerId        -0.006248
HasCrCard         -0.007138
Tenure            -0.014001
CreditScore       -0.027094
NumOfProducts     -0.047820
IsActiveMember    -0.156128
Name: Exited, dtype: float64

In [7]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [8]:

churn_labels = churn["Exited"].copy() 
churn = churn.drop("Exited", axis=1)
churn = churn.drop("Surname", axis=1) 
cat_data = churn.select_dtypes(include=['object']) 
num_data = churn.select_dtypes(include=['int64','float64']) 

num_attribs = list(num_data)    
cat_attribs = list(cat_data) 

print(num_attribs, len(num_attribs), len(cat_attribs))
print(cat_attribs)

churn = pd.concat([cat_data,num_data],axis=1) 

num_pipeline = Pipeline([ 
                ('selector', DataFrameSelector(num_attribs)), 
                ('imputer', Imputer(strategy="median")),
                ('std_scaler', StandardScaler()), 
            ])

cat_pipeline = Pipeline([
                ('selector', DataFrameSelector(cat_attribs)), 
                ('cat_encoder', OneHotEncoder(sparse=False)), 
                ('std_scaler', StandardScaler()), 
            ]) 

full_pipeline = ColumnTransformer([ 
                ("num", num_pipeline, num_attribs), 
                ("cat", cat_pipeline, cat_attribs),
            ])

churn_prepared = full_pipeline.fit_transform(churn) 

print(churn_prepared.shape)

cdata_X = churn_prepared
cdata_Y = churn_labels

['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] 9 2
['Geography', 'Gender']
(10000, 14)




In [9]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 11 columns):
Geography          10000 non-null object
Gender             10000 non-null object
CustomerId         10000 non-null int64
CreditScore        10000 non-null int64
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
dtypes: float64(2), int64(7), object(2)
memory usage: 937.5+ KB


In [10]:
train_X, test_X, train_Y, test_Y = train_test_split(cdata_X, cdata_Y, test_size=0.2, random_state=42)
print(train_X.shape, test_X.shape)
print(train_Y.shape, test_Y.shape)

(8000, 14) (2000, 14)
(8000,) (2000,)


In [11]:
lin_reg = LinearRegression()
lin_reg.fit(train_X, train_Y)
churn_predict = lin_reg.predict(train_X)

In [12]:
some_data = train_X[:5]
some_labels = train_Y[:5]

print("예측 : ",lin_reg.predict(some_data))
print("라벨 : ",list(some_labels))

예측 :  [-0.03640617  0.23971688  0.08761727  0.18527352  0.43161141]
라벨 :  [0, 0, 1, 1, 1]


In [13]:
lin_mse = mean_squared_error(train_Y, churn_predict)
lin_rmse = np.sqrt(lin_mse)
print("RMSE(훈련) : ",lin_rmse)

RMSE(훈련) :  0.3727433763055691


In [14]:
lin_reg = LinearRegression()
lin_reg.fit(test_X, test_Y)
churn_predict = lin_reg.predict(test_X)

In [15]:
some_data = test_X[:5]
some_labels = test_Y[:5]

print("예측 : ",lin_reg.predict(some_data))
print("라벨 : ",list(some_labels))

예측 :  [0.22689551 0.09798926 0.32064551 0.40658301 0.07064551]
라벨 :  [0, 0, 0, 0, 0]


In [16]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(test_Y, churn_predict)
lin_rmse = np.sqrt(lin_mse)
print("RMSE(테스트) : ",lin_rmse)

RMSE(테스트) :  0.36530560123008954


In [17]:
# 선형회귀 교차 검증
scores = cross_val_score(lin_reg, train_X, train_Y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [18]:
def display_scores(scores):
    print("점수 : ", scores)
    print("평균 : ", scores.mean())
    print("표준 편차 : ", scores.std())
    
display_scores(rmse_scores)

점수 :  [0.37112771 0.37347428 0.38831748 0.37933653 0.37066135 0.34958235
 0.3723065  0.39391992 0.37442067 0.359327  ]
평균 :  0.37324737714748374
표준 편차 :  0.01207826212633537


In [19]:
def cross_scores_clf(data, label, model):
    scores_clf = cross_val_score(model, data, label, cv=10, scoring="accuracy")
    
    print("점수 : ", scores_clf)
    print("평균 : ", scores_clf.mean())
    print("표준 편차 : ", scores_clf.std())

In [35]:
forest_clf = RandomForestClassifier(n_estimators = 100, random_state=42, 
                                    n_jobs=-1, max_features=12)
forest_clf.fit(train_X, train_Y) 
r_model = forest_clf
f_data = r_model.predict(train_X)

forest_reg = RandomForestRegressor()
forest_reg.fit(train_X,train_Y)
f_predictions = forest_reg.predict(train_X)
forest_mse = mean_squared_error(train_Y, f_data)
forest_rmse = np.sqrt(forest_mse)
print("점수 : ", forest_clf.score(train_X,train_Y))

print("점수 : ", forest_reg.score(train_X,train_Y))



점수 :  0.999875
점수 :  0.8708425780035799


In [26]:
print("====== 학습 데이터 ======")
print("예측 : ", f_data[:5])
print("레이블 : ", list(train_Y[:5]))

print("\n")

print("정확도 : ", accuracy_score(train_Y, f_data))
print("정밀도 : ", precision_score(train_Y, f_data))
print("재현율 : ", recall_score(train_Y, f_data))
print("F1 Score : ", f1_score(train_Y, f_data))

print("\n")

cross_scores_clf(train_X,f_data,r_model)

예측 :  [0 0 1 1 1]
레이블 :  [0, 0, 1, 1, 1]


정확도 :  0.999875
정밀도 :  0.9993920972644377
재현율 :  1.0
F1 Score :  0.9996959562176954


점수 :  [0.85642946 0.84519351 0.85268414 0.85393258 0.8639201  0.86357947
 0.8660826  0.84230288 0.85356696 0.84605757]
평균 :  0.8543749287108261
표준 편차 :  0.007874282465989054


In [37]:
f2_data = r_model.predict(test_X)

In [38]:
print("====== 테스트 데이터 ======")
print("예측 : ", f2_data[:5])
print("레이블 : ", list(test_Y[:5]))

print("\n")

print("정확도 : ", accuracy_score(test_Y, f2_data))
print("정밀도 : ", precision_score(test_Y, f2_data))
print("재현율 : ", recall_score(test_Y, f2_data))
print("F1 Score : ", f1_score(test_Y, f2_data))

print("\n")


print("점수 : ", forest_clf.score(test_X,test_Y))

print("점수 : ", forest_reg.score(test_X,test_Y))

예측 :  [0 0 0 0 0]
레이블 :  [0, 0, 0, 0, 0]


정확도 :  0.863
정밀도 :  0.7195571955719557
재현율 :  0.4961832061068702
F1 Score :  0.5873493975903614


점수 :  0.863
점수 :  0.27347118443324436


In [40]:
svm_clf = Pipeline([
    ("scaler",StandardScaler()),
    ("linear_svc",LinearSVC(C=100,loss="hinge",random_state=42)),
])

svm_clf.fit(train_X,train_Y)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('linear_svc',
                 LinearSVC(C=100, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='hinge', max_iter=1000, multi_class='ovr',
                           penalty='l2', random_state=42, tol=0.0001,
                           verbose=0))],
         verbose=False)

In [41]:
svm_train = svm_clf.predict(train_X)
svm_test = svm_clf.predict(test_X)

In [42]:
print("====== 학습 데이터 ======")
print("예측 : ", svm_train[:5])
print("레이블 : ", list(train_Y[:5]))

print("\n")

print("정확도 : ", accuracy_score(train_Y, svm_train))
print("정밀도 : ", precision_score(train_Y, svm_train))
print("재현율 : ", recall_score(train_Y, svm_train))
print("F1 Score : ", f1_score(train_Y, svm_train))

print("\n")

cross_scores_clf(train_X,svm_train,r_model)

예측 :  [0 0 0 0 0]
레이블 :  [0, 0, 1, 1, 1]


정확도 :  0.79725
정밀도 :  0.5662650602409639
재현율 :  0.057177615571776155
F1 Score :  0.10386740331491713


점수 :  [0.98751561 0.98626717 0.99126092 0.98252185 0.985      0.97875
 0.98498123 0.98748436 0.98998748 0.98873592]
평균 :  0.9862504529303953
표준 편차 :  0.003488624855736379
점수 :  0.79725


In [43]:
print("====== 테스트 데이터 ======")
print("예측 : ", svm_test[:5])
print("레이블 : ", list(test_Y[:5]))

print("\n")

print("정확도 : ", accuracy_score(test_Y, svm_test))
print("정밀도 : ", precision_score(test_Y, svm_test))
print("재현율 : ", recall_score(test_Y, svm_test))
print("F1 Score : ", f1_score(test_Y, svm_test))

print("\n")

예측 :  [0 0 0 0 0]
레이블 :  [0, 0, 0, 0, 0]


정확도 :  0.8075
정밀도 :  0.5909090909090909
재현율 :  0.06615776081424936
F1 Score :  0.11899313501144164


점수 :  0.8075


In [None]:
# ploynomialFeatures 변환기
polynomial_svm_clf = Pipeline([
    ("poly_features",PolynomialFeatures(degree=3)),
    ("scaler",StandardScaler()),
    ("svm_clf",LinearSVC(C=10, loss="hinge"))
])

polynomial_svm_clf.fit(train_X,train_Y)

In [None]:
poly_svm_train = polynomial_svm_clf.predict(train_X)
poly_svm_test = polynomial_svm_clf.predict(test_X)

In [None]:
print("====== 학습 데이터 ======")
print("예측 : ", poly_svm_train[:5])
print("레이블 : ", list(train_Y[:5]))

print("\n")

print("정확도 : ", accuracy_score(train_Y, poly_svm_train))
print("정밀도 : ", precision_score(train_Y, poly_svm_train))
print("재현율 : ", recall_score(train_Y, poly_svm_train))
print("F1 Score : ", f1_score(train_Y, poly_svm_train))

print("\n")

cross_scores_clf(train_X,poly_svm_train,r_model)

In [None]:
print("====== 테스트 데이터 ======")
print("예측 : ", poly_svm_test[:5])
print("레이블 : ", list(test_Y[:5]))

print("\n")

print("정확도 : ", accuracy_score(test_Y, poly_svm_test))
print("정밀도 : ", precision_score(test_Y, poly_svm_test))
print("재현율 : ", recall_score(test_Y, poly_svm_test))
print("F1 Score : ", f1_score(test_Y, poly_svm_test))

print("\n")