In [2]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression 
from pyspark.sql.functions import col,stddev_samp

In [3]:
# read data  - 수집
# data preprocessing  -EDA 
# model create or choice
# training
# predict(classification / regression)
# score
# 배포 - 엔지니어링

# 데이터를 수집

In [4]:
# Load Default.csv with a header row and inferred column types, drop the
# `_c0` column (presumably an unnamed row-index column -- confirm against the
# file), and cache the frame because the following cells reuse `df`.
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('Default.csv')
    .drop('_c0')
    .cache()
)

                                                                                

In [5]:
# Peek at the first two rows to check the loaded columns.
df.show(2)

[Stage 2:>                                                          (0 + 1) / 1]

+-------+-------+-----------+-----------+
|default|student|    balance|     income|
+-------+-------+-----------+-----------+
|     No|     No|729.5264952|44361.62507|
|     No|    Yes|817.1804066| 12106.1347|
+-------+-------+-----------+-----------+
only showing top 2 rows



                                                                                

In [45]:
# Distinct values of the categorical `student` column.
student_levels = df.select("student").distinct()
student_levels.show()

# Row count per `default` class; the recorded output (9667 "No" vs 333 "Yes")
# shows the target is heavily imbalanced.
default_counts = df.groupBy("default").count()
default_counts.show()

+-------+
|student|
+-------+
|     No|
|    Yes|
+-------+

+-------+-----+
|default|count|
+-------+-----+
|     No| 9667|
|    Yes|  333|
+-------+-----+



# 범주형 데이터에 대한 전처리 준비

In [7]:
# Index the categorical `student` column into a numeric `studentIdx` column.
params = dict(inputCol='student', outputCol='studentIdx')
strIdx = StringIndexer(**params)

# `**params` unpacks the dict into keyword arguments, so the call above is
# equivalent to StringIndexer(inputCol='student', outputCol='studentIdx').

In [8]:
# Index the target column `default` into the numeric `label` column that the
# classifier is trained on.
params = dict(inputCol='default', outputCol='label')
label_strIdx = StringIndexer(**params)

In [9]:
# One-hot encode the numeric category index produced by the StringIndexer.
# One-hot encoding turns a categorical value into a binary vector: one slot
# per category, 1 for the category the row belongs to and 0 elsewhere.
# Example: clothing sizes 2XL, XL, L, M, S must first be labelled as numbers
# (4, 3, 2, 1, 0); one-hot encoding then gives
# 10000, 01000, 00100, 00010, 00001.
# Note: in the transformed output shown later in this notebook the two student
# categories are encoded as a length-1 vector, with the second category
# represented as all zeros.
params = dict(inputCol='studentIdx', outputCol='studentClassVec')
encode = OneHotEncoder(**params)

In [10]:
# Ordered pipeline stages so far: index `student`, one-hot encode that index,
# and index the `default` target into `label`.
stage = [strIdx, encode, label_strIdx]
stage

[StringIndexer_fd03afd3b2d8,
 OneHotEncoder_c97c316c306c,
 StringIndexer_1b4ae3a668ba]

# 연속형 데이터에 대한 전처리 준비

In [11]:
numCols = ['income', 'balance']

# Scale each numeric column by its sample standard deviation (no centering).
# All standard deviations are computed in ONE aggregation; the previous
# version called df.agg(...).first() inside the loop, which launched a
# separate Spark job for every column.
# NOTE(review): the statistics are computed on the full dataset, before the
# train/test split performed later in this notebook, so the test rows
# influence the scaling. Consider computing them on the training split only.
stddevs = df.agg(*[stddev_samp(c).alias(c) for c in numCols]).first()

for c in numCols:
    # Adds `<column>Scaled`; re-running the cell overwrites the same column,
    # so the cell stays idempotent even though `df` is reassigned.
    df = df.withColumn(c + 'Scaled', col(c) / stddevs[c])

In [12]:
# Check that the `incomeScaled` / `balanceScaled` columns were added.
df.show(2)

+-------+-------+-----------+-----------+------------------+------------------+
|default|student|    balance|     income|      incomeScaled|     balanceScaled|
+-------+-------+-----------+-----------+------------------+------------------+
|     No|     No|729.5264952|44361.62507|3.3262970676634867|1.5081742710178534|
|     No|    Yes|817.1804066| 12106.1347|0.9077350139857981|1.6893841034192338|
+-------+-------+-----------+-----------+------------------+------------------+
only showing top 2 rows



In [13]:
# Columns that make up the model's feature vector: the one-hot encoded student
# flag plus the two scaled numeric columns.
inputs = ['studentClassVec', 'incomeScaled', 'balanceScaled']

In [14]:
# Combine the input columns into a single `features` vector column.
assembler = VectorAssembler(inputCols=inputs, outputCol='features')

# Rebuild the stage list instead of using `stage += [assembler]`: the in-place
# append added one more VectorAssembler every time this cell was re-run, and a
# pipeline with two stages writing to `features` fails when fitted. Dropping
# any previously appended assembler first keeps the cell idempotent.
stage = [s for s in stage if not isinstance(s, VectorAssembler)] + [assembler]
stage

[StringIndexer_fd03afd3b2d8,
 OneHotEncoder_c97c316c306c,
 StringIndexer_1b4ae3a668ba,
 VectorAssembler_6bfc00ac2931]

# 학습용, 평가용 데이터 준비 끝
결측치, 이상치, 피처엔지니어링과 같은 고급 기법은 적용안함
스케일만 맞춤

In [17]:
#pipe line 
pipeline = Pipeline(stages=stage)
piplineModel = pipeline.fit(df)
dataset = piplineModel.transform(df)

                                                                                

In [22]:
# Inspect the first five transformed rows, including `label` and `features`.
dataset.show(5)

+-------+-------+-----------+-----------+------------------+------------------+----------+---------------+-----+--------------------+
|default|student|    balance|     income|      incomeScaled|     balanceScaled|studentIdx|studentClassVec|label|            features|
+-------+-------+-----------+-----------+------------------+------------------+----------+---------------+-----+--------------------+
|     No|     No|729.5264952|44361.62507|3.3262970676634867|1.5081742710178534|       0.0|  (1,[0],[1.0])|  0.0|[1.0,3.3262970676...|
|     No|    Yes|817.1804066| 12106.1347|0.9077350139857981|1.6893841034192338|       1.0|      (1,[],[])|  0.0|[0.0,0.9077350139...|
|     No|     No|1073.549164|31767.13895|2.3819447770614217|2.2193837214557224|       0.0|  (1,[0],[1.0])|  0.0|[1.0,2.3819447770...|
|     No|     No|529.2506047|35704.49394|2.6771731943459827|1.0941372934102322|       0.0|  (1,[0],[1.0])|  0.0|[1.0,2.6771731943...|
|     No|     No|785.6558829|38463.49588|2.8840470419162356|1.

In [23]:
#train, test, split, 

In [31]:
# Split 70% / 30%: the first part is used for training, the second for
# testing (75% / 25% is another common choice).
# The seed is fixed so the split is reproducible. The split is random, so the
# resulting sizes are only approximately 70/30 (6907 / 3093 rows in the
# recorded output).
split_weights = [0.7, 0.3]
train, test = dataset.randomSplit(split_weights, seed=14)

In [32]:
# Number of rows in the training split.
train.count()

                                                                                

6907

In [33]:
# Number of rows in the test split.
test.count()

3093

In [34]:
# Total number of rows; should equal the train and test counts added together.
dataset.count()

10000

# 적절한 모델준비

In [35]:
# `label` is the target we want to predict; `features` is the single vector
# column that gathers all model inputs.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
)

In [37]:
# Fit the logistic regression on the training split, then score the held-out
# test split. The transform adds the `rawPrediction`, `probability` and
# `prediction` columns seen in the output.
lrModel = lr.fit(train) # training
predictions = lrModel.transform(test) # prediction
predictions.show()

+-------+-------+-------+-----------+------------------+-------------+----------+---------------+-----+--------------------+--------------------+--------------------+----------+
|default|student|balance|     income|      incomeScaled|balanceScaled|studentIdx|studentClassVec|label|            features|       rawPrediction|         probability|prediction|
+-------+-------+-------+-----------+------------------+-------------+----------+---------------+-----+--------------------+--------------------+--------------------+----------+
|     No|     No|    0.0|16601.63528|1.2448139729585133|          0.0|       0.0|  (1,[0],[1.0])|  0.0|[1.0,1.2448139729...|[10.6128038095014...|[0.99997540158156...|       0.0|
|     No|     No|    0.0|16834.80271|1.2622971949428254|          0.0|       0.0|  (1,[0],[1.0])|  0.0|[1.0,1.2622971949...|[10.6130159125976...|[0.99997540679828...|       0.0|
|     No|     No|    0.0|17059.36832| 1.279135440360174|          0.0|       0.0|  (1,[0],[1.0])|  0.0|[1.0,1.

In [38]:
# Show the original class string next to the predicted class index for the
# first five test rows.
predictions.select('default','prediction').show(5)

+-------+----------+
|default|prediction|
+-------+----------+
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
+-------+----------+
only showing top 5 rows



# 평가

In [46]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# 이진 분류 평가 지표로 평가

In [47]:
# Evaluate the test predictions with the area under the ROC curve. The label
# column and metric are the evaluator's defaults, written out here so it is
# clear that the number below is an AUC and not an accuracy.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)
evaluator.evaluate(predictions)

                                                                                

0.9549682684102574

In [60]:
# Register the predictions as the temp view `predic` so they can be queried
# with Spark SQL.
predictions.createOrReplaceTempView('predic')

In [61]:
# Number of scored test rows (the denominator for the error count).
predictions.count()

3093

In [57]:
# Count the test rows whose predicted class differs from the true label.
# Requires the `predic` temp view to have been registered first.
spark.sql("select * from predic where label != prediction").count()

98

# 위와 같은 방식의 평가 결과를 과연 일반화할 수 있을까?
- 교차검증을 통해 신뢰성을 확보할 수 있음
- 각종 파라미터의 값이 변경됨에 따라서 모델 성능이 달라짐 -> 하이퍼 파라미터 튜닝 기법