In [1]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression 
from pyspark.sql.functions import col,stddev_samp

In [2]:
# Typical machine-learning workflow followed in this notebook:
# read data  - data collection
# data preprocessing  - EDA
# model create or choice
# training
# predict (classification / regression)
# score
# deployment - engineering

In [2]:
# Show the kernel's current working directory (IPython automagic form of %pwd).
pwd

'/root/spark'

In [4]:
# Read default.csv with a header row and inferred column types, drop the
# '_c0' column (presumably an unnamed index column saved with the CSV -
# TODO confirm), and cache the DataFrame because it is reused repeatedly below.
df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('default.csv')
    .drop('_c0')
    .cache()
)

                                                                                

In [9]:
# Preview the first two rows of the loaded data.
df.show(2)

+-------+-------+-----------+-----------+
|default|student|    balance|     income|
+-------+-------+-----------+-----------+
|     No|     No|729.5264952|44361.62507|
|     No|    Yes|817.1804066| 12106.1347|
+-------+-------+-----------+-----------+
only showing top 2 rows



In [8]:

# List the distinct values of the categorical 'student' column.
df.select("student").distinct().show()

+-------+
|student|
+-------+
|     No|
|    Yes|
+-------+



In [11]:
# Indexer that maps the string column 'student' to the numeric column 'studentIdx'.
# The keyword arguments live in a dict and are unpacked with **params, which is
# equivalent to writing:
#   strIdx = StringIndexer(inputCol='student', outputCol='studentIdx')
params = dict(inputCol='student', outputCol='studentIdx')
strIdx = StringIndexer(**params)

In [15]:
# Indexer that turns the string target column 'default' into the numeric
# 'label' column used for training.
params = dict(inputCol='default', outputCol='label')
label_strIdx = StringIndexer(**params)

In [16]:
# One-hot encode the indexed 'student' column into the vector column 'studentClassVec'.
# Note on ordered (ordinal) categories, e.g. clothing sizes 2XL XL L M S
#   --> label encoding (4,3,2,1,0) -> one-hot (10000,01000,00100,00010,00001):
# one-hot encoding should NOT be used in that case, since it discards the ordering.
params = dict(inputCol='studentIdx', outputCol='studentClassVec')
encode = OneHotEncoder(**params)

In [25]:
# Pipeline stages that preprocess the categorical columns, in execution order.
stage = [
    strIdx,        # student    -> studentIdx
    encode,        # studentIdx -> studentClassVec
    label_strIdx,  # default    -> label
]

### 범주형 데이터에 대한 전처리 준비 끝

### 연속형 데이터 / 수치형 데이터

In [22]:
# Continuous columns to rescale; each gets a new '<name>Scaled' column.
numCols = ['income', 'balance']

# Scaling: divide every value by the column's sample standard deviation
# (no mean-centering). All standard deviations are computed in a single
# aggregation job instead of launching one Spark job per column.
# NOTE(review): the statistics are computed on the full dataset, i.e. before
# the train/test split performed later in the notebook, so the test rows
# influence the scaling - confirm this is acceptable for the exercise.
stddevRow = df.agg(*[stddev_samp(c).alias(c) for c in numCols]).first()

for c in numCols:
    std = stddevRow[c]
    # A null std (fewer than two non-null rows) or a zero std (constant
    # column) would yield a null/invalid feature column that only fails later
    # inside the pipeline, so fail here with a clear message instead.
    if not std:
        raise ValueError(
            "Cannot scale column %r: sample standard deviation is %r" % (c, std)
        )
    df = df.withColumn(c + 'Scaled', col(c) / std)

In [23]:
# Check that the 'incomeScaled' and 'balanceScaled' columns were added.
df.show(2)

+-------+-------+-----------+-----------+------------------+------------------+
|default|student|    balance|     income|      incomeScaled|     balanceScaled|
+-------+-------+-----------+-----------+------------------+------------------+
|     No|     No|729.5264952|44361.62507|3.3262970676634867|1.5081742710178534|
|     No|    Yes|817.1804066| 12106.1347|0.9077350139857981|1.6893841034192338|
+-------+-------+-----------+-----------+------------------+------------------+
only showing top 2 rows



In [28]:
# Columns that are assembled into the model's feature vector.
inputs = [
    'studentClassVec',
    'incomeScaled',
    'balanceScaled',
]

In [29]:
# Combine the encoded and scaled input columns into a single 'features' vector column.
assembler = VectorAssembler(inputCols=inputs, outputCol='features')

# Add the assembler as the final stage. Any VectorAssembler left over from an
# earlier run of this cell is dropped first, so re-running the cell does not
# stack duplicate assemblers that all write to the same 'features' column.
stage = [s for s in stage if not isinstance(s, VectorAssembler)] + [assembler]
stage

[StringIndexer_fd3912499dc3,
 OneHotEncoder_4d547eb7aeca,
 StringIndexer_32e2f0a8e4a7,
 VectorAssembler_a4ce7f9dcdf6]

In [30]:
# Pipeline: chain all preprocessing stages into one estimator.
pipeline = Pipeline(stages=stage)

# Fit the stages on df, then apply the fitted pipeline to the same df to
# obtain the 'label' and 'features' columns.
pipelineModel = pipeline.fit(df)
dataset = pipelineModel.transform(df)

                                                                                

In [33]:
# Inspect the first ten rows of the transformed dataset, including the
# 'studentIdx', 'studentClassVec', 'label' and 'features' columns.
dataset.show(10)

+-------+-------+-----------+-----------+------------------+------------------+----------+---------------+-----+--------------------+
|default|student|    balance|     income|      incomeScaled|     balanceScaled|studentIdx|studentClassVec|label|            features|
+-------+-------+-----------+-----------+------------------+------------------+----------+---------------+-----+--------------------+
|     No|     No|729.5264952|44361.62507|3.3262970676634867|1.5081742710178534|       0.0|  (1,[0],[1.0])|  0.0|[1.0,3.3262970676...|
|     No|    Yes|817.1804066| 12106.1347|0.9077350139857981|1.6893841034192338|       1.0|      (1,[],[])|  0.0|[0.0,0.9077350139...|
|     No|     No|1073.549164|31767.13895|2.3819447770614217|2.2193837214557224|       0.0|  (1,[0],[1.0])|  0.0|[1.0,2.3819447770...|
|     No|     No|529.2506047|35704.49394|2.6771731943459827|1.0941372934102322|       0.0|  (1,[0],[1.0])|  0.0|[1.0,2.6771731943...|
|     No|     No|785.6558829|38463.49588|2.8840470419162356|1.

In [None]:
# Why split into train and test sets? So the model is scored on rows it did not see during training.

In [35]:
# Random 70% / 30% train/test split; the fixed seed keeps the split repeatable.
train, test = dataset.randomSplit([0.7, 0.3], seed=14)

In [39]:
# Row counts of the train split, the test split, and the full dataset.
train.count(),test.count(), dataset.count()

(6907, 3093, 10000)

### 학습용과 평가용 데이터 준비 끝 (결측치·이상치 처리, 피처 엔지니어링과 같은 고급 기법은 적용하지 않음)
### 단지 스케일만 맞춰 줌

In [40]:
# Prepare a suitable model: logistic regression reading the 'features' vector
# and the 'label' column, limited to 10 optimizer iterations.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
)

In [43]:
# Training: fit the logistic regression on the training split.
lrModel = lr.fit(train)

# Prediction: score the held-out test split.
predictions = lrModel.transform(test)

# Show the original 'default' value next to the predicted class for a few rows.
predictions.select('default', 'prediction').show(5)

+-------+----------+
|default|prediction|
+-------+----------+
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
+-------+----------+
only showing top 5 rows



In [47]:
# List every column of the scored DataFrame, including the ones the model added
# ('rawPrediction', 'probability', 'prediction').
print(predictions.columns)

['default', 'student', 'balance', 'income', 'incomeScaled', 'balanceScaled', 'studentIdx', 'studentClassVec', 'label', 'features', 'rawPrediction', 'probability', 'prediction']


### 평가

In [44]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [48]:
# Binary-classification evaluator. labelCol and metricName are spelled out
# with their default values so it is clear that the score is the area under
# the ROC curve (AUC) computed against the 'label' column.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)
evaluator.evaluate(predictions)  # AUC on the test split

0.9549682684102574

### 위와 같은 방식의 평가는 과연 일반화할 수 있을까?
### 교차검증을 통해 신뢰성 확보
### 각종 파라미터의 값이 변경됨에 따라 모델 성능이 달라진다.
##### 하이퍼파라미터 튜닝 기법