In [78]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression 
from pyspark.sql.functions import col,stddev_samp

In [79]:
# read data  - 수집
# data preprocessing  -EDA 
# model create or choice
# training
# predict(classification / regression)
# score
# 배포 - 엔지니어링

In [80]:
train_path = 'titanic/train.csv'
test_path = 'titanic/test.csv'

In [81]:
df = spark.read.format('csv')\
    .options(header = 'true', inferSchema = 'true')\
    .load(train_path).cache()

24/03/26 18:25:31 WARN CacheManager: Asked to cache already cached data.


In [82]:
df.show(2)

+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0| PC 17599|71.2833|  C85|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
only showing top 2 rows



In [83]:
df.select("Pclass").distinct().show()

+------+
|Pclass|
+------+
|     1|
|     3|
|     2|
+------+



DF 재구성 : 학습용컬럼과 정답컬럼만 추출

In [101]:
train_df = df.select('Survived','Sex','Pclass','Age','Fare')
train_df.show(2)

+--------+------+------+----+-------+
|Survived|   Sex|Pclass| Age|   Fare|
+--------+------+------+----+-------+
|       0|  male|     3|22.0|   7.25|
|       1|female|     1|38.0|71.2833|
+--------+------+------+----+-------+
only showing top 2 rows



결측치 확인하기

In [102]:
from pyspark.sql.functions import *
train_df.select([count(when(isnull(c), c)).alias(c) for c in train_df.columns]).show()

+--------+---+------+---+----+
|Survived|Sex|Pclass|Age|Fare|
+--------+---+------+---+----+
|       0|  0|     0|177|   0|
+--------+---+------+---+----+



결측치 채우기
성별에따른 객실등에대한 Age들의 평균값으로 채운다

In [103]:
avg_mean = train_df.groupBy('Sex','Pclass').agg(avg('Age')).alias("avg_mean")
avg_mean.show()

+------+------+------------------+
|   Sex|Pclass|          avg(Age)|
+------+------+------------------+
|  male|     3|26.507588932806325|
|female|     3|             21.75|
|female|     1| 34.61176470588235|
|female|     2|28.722972972972972|
|  male|     2| 30.74070707070707|
|  male|     1| 41.28138613861386|
+------+------+------------------+



In [114]:
# 성별 및 객실별로 그룹화하여 평균 나이 계산
mean_age_by_sex_pclass = train_df.groupBy("Sex", "Pclass").agg(avg("Age").alias("MeanAge"))

# 결측치를 해당 그룹의 평균 나이로 채우기
filled_df = train_df.join(mean_age_by_sex_pclass, ["Sex", "Pclass"], "left") \
              .withColumn("AgeFilled", when(col("Age").isNull(), col("MeanAge")).otherwise(col("Age"))) \
              .drop("MeanAge")

# 결과 확인
filled_df.show()

+------+------+--------+----+-------+------------------+
|   Sex|Pclass|Survived| Age|   Fare|         AgeFilled|
+------+------+--------+----+-------+------------------+
|  male|     3|       0|22.0|   7.25|              22.0|
|female|     1|       1|38.0|71.2833|              38.0|
|female|     3|       1|26.0|  7.925|              26.0|
|female|     1|       1|35.0|   53.1|              35.0|
|  male|     3|       0|35.0|   8.05|              35.0|
|  male|     3|       0|NULL| 8.4583|26.507588932806325|
|  male|     1|       0|54.0|51.8625|              54.0|
|  male|     3|       0| 2.0| 21.075|               2.0|
|female|     3|       1|27.0|11.1333|              27.0|
|female|     2|       1|14.0|30.0708|              14.0|
|female|     3|       1| 4.0|   16.7|               4.0|
|female|     1|       1|58.0|  26.55|              58.0|
|  male|     3|       0|20.0|   8.05|              20.0|
|  male|     3|       0|39.0| 31.275|              39.0|
|female|     3|       0|14.0| 7

In [115]:
filled_df.count(), df.count()

(891, 891)

In [106]:
# Sex,Pclass # 범주형  StringIndex
# Age        # 연속형  scaling


In [116]:
#  **params  # unpacking
#  strIdx = StringIndexer(inputCol = 'student', outputCol='studentIdx')
###########################################################################

params = {
    'inputCol':'Sex',
    'outputCol':'SexIdx'
}
strIdx = StringIndexer(**params)

In [117]:
# 수치형 데이터에 대해서 
# 의류사이크 2XL XL L M S --> labeling(4,3,2,1,0) ->onehot(10000,01000,00100,00010,00001)  : 이 경우 onehot으로 하면 안됨
params = {
    'inputCol':'SexIdx',
    'outputCol':'SexClassVec'
}
encode = OneHotEncoder(**params)

In [118]:
# 정답
params = {
    'inputCol':'Survived',
    'outputCol':'label'
}
label_strIdx =  StringIndexer(**params)

In [119]:
stage = [strIdx,encode,label_strIdx]

### 범주형데이터에 대한 전처리준비 끝....

### 연속형 데이터 / 수치형 데이터

In [120]:
filled_df.show(1)

+----+------+--------+----+----+---------+
| Sex|Pclass|Survived| Age|Fare|AgeFilled|
+----+------+--------+----+----+---------+
|male|     3|       0|22.0|7.25|     22.0|
+----+------+--------+----+----+---------+
only showing top 1 row



In [126]:
filled_df = filled_df.drop('AgeFilled')
filled_df.show(1)

+----+------+--------+----+----+
| Sex|Pclass|Survived| Age|Fare|
+----+------+--------+----+----+
|male|     3|       0|22.0|7.25|
+----+------+--------+----+----+
only showing top 1 row



In [127]:
numCols = ['Age','Fare']
# scaling
for c in numCols:
    filled_df = filled_df.withColumn(c+'Scaled', col(c) / filled_df.agg(stddev_samp(c)).first()[0])

In [128]:
filled_df.select('AgeScaled','FareScaled').show(3)

+------------------+-------------------+
|         AgeScaled|         FareScaled|
+------------------+-------------------+
|1.5144738264626911|0.14589454188740145|
|2.6159093366173756| 1.4344612962375451|
|1.7898327040013622|0.15947782682174572|
+------------------+-------------------+
only showing top 3 rows



In [129]:
inputs = ['SexClassVec','AgeScaled','FareScaled']

In [130]:
assembler = VectorAssembler(inputCols=inputs,outputCol='features')
stage += [assembler]
stage

[StringIndexer_25db33098c43,
 OneHotEncoder_8a22e722166f,
 StringIndexer_0c62a285b4ac,
 VectorAssembler_91b3592e5aa6]

In [132]:
# pipe line 
pipeline = Pipeline(stages=stage)
pipelineModel =  pipeline.fit(filled_df)
dataset =  pipelineModel.transform(filled_df)

In [133]:
filled_df.show(10)

+------+------+--------+----+-------+-------------------+-------------------+
|   Sex|Pclass|Survived| Age|   Fare|          AgeScaled|         FareScaled|
+------+------+--------+----+-------+-------------------+-------------------+
|  male|     3|       0|22.0|   7.25| 1.5144738264626911|0.14589454188740145|
|female|     1|       1|38.0|71.2833| 2.6159093366173756| 1.4344612962375451|
|female|     3|       1|26.0|  7.925| 1.7898327040013622|0.15947782682174572|
|female|     1|       1|35.0|   53.1| 2.4093901784633722| 1.0685517481684161|
|  male|     3|       0|35.0|   8.05| 2.4093901784633722| 0.1619932499577354|
|  male|     3|       0|NULL| 8.4583|               NULL|0.17020962808913206|
|  male|     1|       0|54.0|51.8625|   3.71734484677206| 1.0436490591221181|
|  male|     3|       0| 2.0| 21.075|0.13767943876933555|   0.42410034072786|
|female|     3|       1|27.0|11.1333|   1.85867242338603|0.22403968319931125|
|female|     2|       1|14.0|30.0708| 0.9637560713853489| 0.6051

In [65]:
# train, test  split-- why?  

In [134]:
# 7 : 3 
(train,test)  = filled_df.randomSplit([0.7,0.3],seed=14)

In [135]:
train.count(),test.count(), dataset.count()

(593, 298, 891)

### 학습용과 평가용데이터 준비 끝~~~  (결측치와 이상치. 그리고 피처엔지니어링 과 같은 고급기법은 적용 안함)
### 단지 스케일만 맞춰줌

나머지 완성하기

In [51]:
# 적절한 모델을 준비
lr = LogisticRegression(labelCol='label', featuresCol="features", maxIter=10)

In [69]:
lrModel = lr.fit(train)  # 훈련
predictions =  lrModel.transform(test)  # 예측
predictions.select('default','prediction').show(5)

+-------+----------+
|default|prediction|
+-------+----------+
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
|     No|       0.0|
+-------+----------+
only showing top 5 rows



In [70]:
print(predictions.columns)

['default', 'student', 'balance', 'income', 'incomeScaled', 'balanceScaled', 'studentIdx', 'studentClassVec', 'label', 'features', 'rawPrediction', 'probability', 'prediction']


### 평가

In [71]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [72]:
evaluator =  BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
evaluator.evaluate(predictions)  # AUC (Binary Classification Evaluator)

0.9549682684102574

### 위와 같은 방식의 평가는 과연 일반화 시킬수 있을까?
### 교차검증 을 통해 신뢰성 확보
### 각종 파라메터의 값이 변경됨에 따라서 모델 성능이 달라진다.
##### 하이퍼 파라메터 튜닝 기법