# MLlib package of PySpark
스파크 2.0부터 ML 패키지는 데이터프레임에 대해 작동, MLlib 패키지는 RDD에 대해서 작동  

MLlib는 전체적으로 크게 세단계의 머신러닝 기능으로 구분 된다.
- 데이터 전처리 : 피처 추출, 변형, 선택, 카테고리 피처에 대한 해석, 자연어처리
- 머신러닝 알고리즘 : 유명 고급레벨의 회귀, 분류, 군집 알고리즘 지원
- 유틸리티 : 기술통계, 카이스퀘어 테스트, 선형대수, 모델평가

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName('appName2').setMaster('local')
# conf=SparkConf()
conf.set("spark.executor.memory", "4g")
conf.set("spark.driver.memory", "4g")
conf.set("spark.cores.max", "4")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

미국 2014 ~ 2015 출생 데이터의 일부  
원본 데이이터는 300개의 컬럼으로 구성, 구중 85개 선별 해서 사용  
799만개의 데이터 중 45429개 균등 샘플링  
INFANT_ALIVE_AT_REPORT가 1인지 0인지 예측하는 것이 목표


In [2]:
import pyspark.sql.types as typ
labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.StringType()),
    ('BIRTH_YEAR', typ.IntegerType()),
    ('BIRTH_MONTH', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
    ('MOTHER_AGE_YEARS', typ.IntegerType()),
    ('MOTHER_RACE_6CODE', typ.StringType()),
    ('MOTHER_EDUCATION', typ.StringType()),
    ('FATHER_COMBINED_AGE', typ.IntegerType()),
    ('FATHER_EDUCATION', typ.StringType()),
    ('MONTH_PRECARE_RECODE', typ.StringType()),
    ('CIG_BEFORE', typ.IntegerType()),
    ('CIG_1_TRI', typ.IntegerType()),
    ('CIG_2_TRI', typ.IntegerType()),
    ('CIG_3_TRI', typ.IntegerType()),
    ('MOTHER_HEIGHT_IN', typ.IntegerType()),
    ('MOTHER_BMI_RECODE', typ.IntegerType()),
    ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
    ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
    ('DIABETES_PRE', typ.StringType()),
    ('DIABETES_GEST', typ.StringType()),
    ('HYP_TENS_PRE', typ.StringType()),
    ('HYP_TENS_GEST', typ.StringType()),
    ('PREV_BIRTH_PRETERM', typ.StringType()),
    ('NO_RISK', typ.StringType()),
    ('NO_INFECTIONS_REPORTED', typ.StringType()),
    ('LABOR_IND', typ.StringType()),
    ('LABOR_AUGM', typ.StringType()),
    ('STEROIDS', typ.StringType()),
    ('ANTIBIOTICS', typ.StringType()),
    ('ANESTHESIA', typ.StringType()),
    ('DELIV_METHOD_RECODE_COMB', typ.StringType()),
    ('ATTENDANT_BIRTH', typ.StringType()),
    ('APGAR_5', typ.IntegerType()),
    ('APGAR_5_RECODE', typ.StringType()),
    ('APGAR_10', typ.IntegerType()),
    ('APGAR_10_RECODE', typ.StringType()),
    ('INFANT_SEX', typ.StringType()),
    ('OBSTETRIC_GESTATION_WEEKS', typ.IntegerType()),
    ('INFANT_WEIGHT_GRAMS', typ.IntegerType()),
    ('INFANT_ASSIST_VENTI', typ.StringType()),
    ('INFANT_ASSIST_VENTI_6HRS', typ.StringType()),
    ('INFANT_NICU_ADMISSION', typ.StringType()),
    ('INFANT_SURFACANT', typ.StringType()),
    ('INFANT_ANTIBIOTICS', typ.StringType()),
    ('INFANT_SEIZURES', typ.StringType()),
    ('INFANT_NO_ABNORMALITIES', typ.StringType()),
    ('INFANT_ANCEPHALY', typ.StringType()),
    ('INFANT_MENINGOMYELOCELE', typ.StringType()),
    ('INFANT_LIMB_REDUCTION', typ.StringType()),
    ('INFANT_DOWN_SYNDROME', typ.StringType()),
    ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', typ.StringType()),
    ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', typ.StringType()),
    ('INFANT_BREASTFED', typ.StringType())
]

schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])

In [3]:
births = spark.read.csv('births_train.csv.gz', header=True, schema=schema)

In [4]:
type(births)

pyspark.sql.dataframe.DataFrame

In [5]:
births.printSchema()

root
 |-- INFANT_ALIVE_AT_REPORT: string (nullable = true)
 |-- BIRTH_YEAR: integer (nullable = true)
 |-- BIRTH_MONTH: integer (nullable = true)
 |-- BIRTH_PLACE: string (nullable = true)
 |-- MOTHER_AGE_YEARS: integer (nullable = true)
 |-- MOTHER_RACE_6CODE: string (nullable = true)
 |-- MOTHER_EDUCATION: string (nullable = true)
 |-- FATHER_COMBINED_AGE: integer (nullable = true)
 |-- FATHER_EDUCATION: string (nullable = true)
 |-- MONTH_PRECARE_RECODE: string (nullable = true)
 |-- CIG_BEFORE: integer (nullable = true)
 |-- CIG_1_TRI: integer (nullable = true)
 |-- CIG_2_TRI: integer (nullable = true)
 |-- CIG_3_TRI: integer (nullable = true)
 |-- MOTHER_HEIGHT_IN: integer (nullable = true)
 |-- MOTHER_BMI_RECODE: integer (nullable = true)
 |-- MOTHER_PRE_WEIGHT: integer (nullable = true)
 |-- MOTHER_DELIVERY_WEIGHT: integer (nullable = true)
 |-- MOTHER_WEIGHT_GAIN: integer (nullable = true)
 |-- DIABETES_PRE: string (nullable = true)
 |-- DIABETES_GEST: string (nullable = true)


In [6]:
births.columns

['INFANT_ALIVE_AT_REPORT',
 'BIRTH_YEAR',
 'BIRTH_MONTH',
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'MOTHER_RACE_6CODE',
 'MOTHER_EDUCATION',
 'FATHER_COMBINED_AGE',
 'FATHER_EDUCATION',
 'MONTH_PRECARE_RECODE',
 'CIG_BEFORE',
 'CIG_1_TRI',
 'CIG_2_TRI',
 'CIG_3_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_BMI_RECODE',
 'MOTHER_PRE_WEIGHT',
 'MOTHER_DELIVERY_WEIGHT',
 'MOTHER_WEIGHT_GAIN',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM',
 'NO_RISK',
 'NO_INFECTIONS_REPORTED',
 'LABOR_IND',
 'LABOR_AUGM',
 'STEROIDS',
 'ANTIBIOTICS',
 'ANESTHESIA',
 'DELIV_METHOD_RECODE_COMB',
 'ATTENDANT_BIRTH',
 'APGAR_5',
 'APGAR_5_RECODE',
 'APGAR_10',
 'APGAR_10_RECODE',
 'INFANT_SEX',
 'OBSTETRIC_GESTATION_WEEKS',
 'INFANT_WEIGHT_GRAMS',
 'INFANT_ASSIST_VENTI',
 'INFANT_ASSIST_VENTI_6HRS',
 'INFANT_NICU_ADMISSION',
 'INFANT_SURFACANT',
 'INFANT_ANTIBIOTICS',
 'INFANT_SEIZURES',
 'INFANT_NO_ABNORMALITIES',
 'INFANT_ANCEPHALY',
 'INFANT_MENINGOMYELOCELE',
 'INFANT_LIMB

In [7]:
selected_features = [
 'INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_BEFORE',
 'CIG_1_TRI',
 'CIG_2_TRI',
 'CIG_3_TRI',
 'MOTHER_HEIGHT_IN',
#  'MOTHER_BMI_RECODE',
 'MOTHER_PRE_WEIGHT',
 'MOTHER_DELIVERY_WEIGHT',
 'MOTHER_WEIGHT_GAIN',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM'
]
births_trimmed = births.select(selected_features)

In [8]:
births_trimmed.head()

Row(INFANT_ALIVE_AT_REPORT='N', BIRTH_PLACE='1', MOTHER_AGE_YEARS=29, FATHER_COMBINED_AGE=99, CIG_BEFORE=99, CIG_1_TRI=99, CIG_2_TRI=99, CIG_3_TRI=99, MOTHER_HEIGHT_IN=99, MOTHER_PRE_WEIGHT=999, MOTHER_DELIVERY_WEIGHT=999, MOTHER_WEIGHT_GAIN=99, DIABETES_PRE='N', DIABETES_GEST='N', HYP_TENS_PRE='N', HYP_TENS_GEST='N', PREV_BIRTH_PRETERM='N')

데이터셋에는 Yes/No/Unkown값을 가진 컬럼들이 많다.  
Yes는 1, 나머지는 0으로 변환  

흡연량 관련 레코드 :  
0 : 임신 기간 동안 금연  
1-97 : 1-97개 사이  
98 : 98개 이상  
99 : 알 수 없음.  

In [9]:
# Y : Yes
# N : No
# U : Unkown
recode_dictionary = {
    'YNU': {'Y': 1, 'N': 0, 'U': 0}
}

In [10]:
import pyspark.sql.functions as func
def recode(col, key):
    return recode_dictionary[key][col]

In [11]:
def correct_cig(feat):
    return func.when(func.col(feat) != 99, func.col(feat)).otherwise(0)

In [12]:
rec_integer = func.udf(recode, typ.IntegerType())

In [13]:
births_transformed = births_trimmed.withColumn('CIG_BEFORE', correct_cig('CIG_BEFORE'))\
                                    .withColumn('CIG_1_TRI', correct_cig('CIG_1_TRI'))\
                                    .withColumn('CIG_2_TRI', correct_cig('CIG_2_TRI'))\
                                    .withColumn('CIG_3_TRI', correct_cig('CIG_3_TRI'))

In [14]:
cols = [(col.name, col.dataType) for col in births_trimmed.schema]
cols

[('INFANT_ALIVE_AT_REPORT', StringType),
 ('BIRTH_PLACE', StringType),
 ('MOTHER_AGE_YEARS', IntegerType),
 ('FATHER_COMBINED_AGE', IntegerType),
 ('CIG_BEFORE', IntegerType),
 ('CIG_1_TRI', IntegerType),
 ('CIG_2_TRI', IntegerType),
 ('CIG_3_TRI', IntegerType),
 ('MOTHER_HEIGHT_IN', IntegerType),
 ('MOTHER_PRE_WEIGHT', IntegerType),
 ('MOTHER_DELIVERY_WEIGHT', IntegerType),
 ('MOTHER_WEIGHT_GAIN', IntegerType),
 ('DIABETES_PRE', StringType),
 ('DIABETES_GEST', StringType),
 ('HYP_TENS_PRE', StringType),
 ('HYP_TENS_GEST', StringType),
 ('PREV_BIRTH_PRETERM', StringType)]

In [15]:
births_trimmed.schema

StructType(List(StructField(INFANT_ALIVE_AT_REPORT,StringType,true),StructField(BIRTH_PLACE,StringType,true),StructField(MOTHER_AGE_YEARS,IntegerType,true),StructField(FATHER_COMBINED_AGE,IntegerType,true),StructField(CIG_BEFORE,IntegerType,true),StructField(CIG_1_TRI,IntegerType,true),StructField(CIG_2_TRI,IntegerType,true),StructField(CIG_3_TRI,IntegerType,true),StructField(MOTHER_HEIGHT_IN,IntegerType,true),StructField(MOTHER_PRE_WEIGHT,IntegerType,true),StructField(MOTHER_DELIVERY_WEIGHT,IntegerType,true),StructField(MOTHER_WEIGHT_GAIN,IntegerType,true),StructField(DIABETES_PRE,StringType,true),StructField(DIABETES_GEST,StringType,true),StructField(HYP_TENS_PRE,StringType,true),StructField(HYP_TENS_GEST,StringType,true),StructField(PREV_BIRTH_PRETERM,StringType,true)))

In [16]:
YNU_cols = []
for i, s in enumerate(cols):
    if s[1] == typ.StringType():
        dis = births.select(s[0]).distinct().rdd.map(lambda row: row[0]).collect()
    if 'Y' in dis:
        YNU_cols.append(s[0])

In [19]:
births.select('INFANT_ALIVE_AT_REPORT').distinct().rdd.map(lambda row: row[0]).collect()

['Y', 'N']

In [20]:
births.select('BIRTH_PLACE').distinct().rdd.map(lambda row: row[0]).collect()

['7', '3', '5', '6', '9', '1', '4', '2']

In [None]:
births.select(['INFANT_NICU_ADMISSION', rec_integer('INFANT_NICU_ADMISSION', 
                                                    func.lit('YNU')).alias('INFANT_NICU_ADMISSION_RECODE')]).take(5)

In [17]:
births_transformed.columns

['INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_BEFORE',
 'CIG_1_TRI',
 'CIG_2_TRI',
 'CIG_3_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_PRE_WEIGHT',
 'MOTHER_DELIVERY_WEIGHT',
 'MOTHER_WEIGHT_GAIN',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [18]:
exprs_YNU = [
    rec_integer(x, func.lit('YNU')).alias(x)
    if x in YNU_cols
    else x
    for x in births_transformed.columns
]

In [19]:
exprs_YNU

[Column<b'recode(INFANT_ALIVE_AT_REPORT, YNU) AS `INFANT_ALIVE_AT_REPORT`'>,
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_BEFORE',
 'CIG_1_TRI',
 'CIG_2_TRI',
 'CIG_3_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_PRE_WEIGHT',
 'MOTHER_DELIVERY_WEIGHT',
 'MOTHER_WEIGHT_GAIN',
 Column<b'recode(DIABETES_PRE, YNU) AS `DIABETES_PRE`'>,
 Column<b'recode(DIABETES_GEST, YNU) AS `DIABETES_GEST`'>,
 Column<b'recode(HYP_TENS_PRE, YNU) AS `HYP_TENS_PRE`'>,
 Column<b'recode(HYP_TENS_GEST, YNU) AS `HYP_TENS_GEST`'>,
 Column<b'recode(PREV_BIRTH_PRETERM, YNU) AS `PREV_BIRTH_PRETERM`'>]

In [20]:
births_transformed = births_transformed.select(exprs_YNU)

In [21]:
births_transformed.select(YNU_cols[-5:]).show(5)

+------------+-------------+------------+-------------+------------------+
|DIABETES_PRE|DIABETES_GEST|HYP_TENS_PRE|HYP_TENS_GEST|PREV_BIRTH_PRETERM|
+------------+-------------+------------+-------------+------------------+
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 1|
|           0|            0|           0|            0|                 0|
+------------+-------------+------------+-------------+------------------+
only showing top 5 rows



In [22]:
YNU_cols[-5:]

['DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [23]:
# Descriptive statistids
import pyspark.mllib.stat as st
import numpy as np

In [24]:
numeric_cols = [
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_BEFORE',
 'CIG_1_TRI',
 'CIG_2_TRI',
 'CIG_3_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_PRE_WEIGHT',
 'MOTHER_DELIVERY_WEIGHT',
 'MOTHER_WEIGHT_GAIN'
]

In [25]:
numeric_rdd = births_transformed.select(numeric_cols).rdd.map(lambda row: [e for e in row])

In [26]:
mllib_stats = st.Statistics.colStats(numeric_rdd)

In [27]:
for col, m, v in zip(numeric_cols, mllib_stats.mean(), mllib_stats.variance()):
    print('{0}: \t{1:.2f} \t{2:.2f}'.format(col, m, np.sqrt(v)))

MOTHER_AGE_YEARS: 	28.30 	6.08
FATHER_COMBINED_AGE: 	44.55 	27.55
CIG_BEFORE: 	1.43 	5.18
CIG_1_TRI: 	0.91 	3.83
CIG_2_TRI: 	0.70 	3.31
CIG_3_TRI: 	0.58 	3.11
MOTHER_HEIGHT_IN: 	65.12 	6.45
MOTHER_PRE_WEIGHT: 	214.50 	210.21
MOTHER_DELIVERY_WEIGHT: 	223.63 	180.01
MOTHER_WEIGHT_GAIN: 	30.74 	26.23


colStats()함수는 기술통계를 샘플링해서 계산한다.  
실제 데이터에서는 큰 문제는 없다, 데이터셋이 100개 미만이리 경우에는 문제가 될 수 있다.  
colStats()함수는 RDD데이터를 취해 기술 통계를 계산한다.  
MulhvariateStahshcalSummary객체를 리턴  
- count() : 데이터 행 갯수
- max() : 최댓값
- min() : 최솟값
- mean() : 평균
- numNonzeros() : 0이 아닌 값의 갯수
- vriance() : 분산
- norml1() : L1-Norm값
- NORML2() : L2-Norm값

- L2 Loss는 직관적으로 오차의 제곱을 더함, Outlier에 큰 영향을 받는다.
- L1 Loss가 L2 Loss에 비해  Outlier에 대하여 더 Robust하다.  
Outlier가 적당히 무시되길 원한다면 L1 Loss, Outlier에 신경을 써야 한다면 L2 Loss를 사용 한다.

In [28]:
categorical_cols = [e for e in births_transformed.columns if e not in numeric_cols]

In [29]:
categorical_cols

['INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [30]:
categorical_rdd = births_transformed.select(categorical_cols).rdd.map(lambda row: [e for e in row])

In [32]:
for i , col in enumerate(categorical_cols):
    agg = categorical_rdd.groupBy(lambda row: row[i]).map(lambda row: (row[0], len(row[1])))
    print(col, sorted(agg.collect(), key=lambda el: el[1], reverse=True))

INFANT_ALIVE_AT_REPORT [(1, 23349), (0, 22080)]
BIRTH_PLACE [('1', 44558), ('4', 327), ('3', 224), ('2', 136), ('7', 91), ('5', 74), ('6', 11), ('9', 8)]
DIABETES_PRE [(0, 44881), (1, 548)]
DIABETES_GEST [(0, 43451), (1, 1978)]
HYP_TENS_PRE [(0, 44348), (1, 1081)]
HYP_TENS_GEST [(0, 43302), (1, 2127)]
PREV_BIRTH_PRETERM [(0, 43088), (1, 2341)]


In [33]:
corrs = st.Statistics.corr(numeric_rdd)

In [34]:
for i, el in enumerate(corrs > 0.5):
    correlated = [
        (numeric_cols[j], corrs[i][j])
        for j, e in enumerate(el)
        if e == 1.0 and j != i
    ]
    if len(correlated) > 0:
        for e in correlated:
            print('{0} - to - {1}: {2:.2f}'.format(numeric_cols[i], e[0], e[1]))

CIG_BEFORE - to - CIG_1_TRI: 0.83
CIG_BEFORE - to - CIG_2_TRI: 0.72
CIG_BEFORE - to - CIG_3_TRI: 0.62
CIG_1_TRI - to - CIG_BEFORE: 0.83
CIG_1_TRI - to - CIG_2_TRI: 0.87
CIG_1_TRI - to - CIG_3_TRI: 0.76
CIG_2_TRI - to - CIG_BEFORE: 0.72
CIG_2_TRI - to - CIG_1_TRI: 0.87
CIG_2_TRI - to - CIG_3_TRI: 0.89
CIG_3_TRI - to - CIG_BEFORE: 0.62
CIG_3_TRI - to - CIG_1_TRI: 0.76
CIG_3_TRI - to - CIG_2_TRI: 0.89
MOTHER_PRE_WEIGHT - to - MOTHER_DELIVERY_WEIGHT: 0.54
MOTHER_PRE_WEIGHT - to - MOTHER_WEIGHT_GAIN: 0.65
MOTHER_DELIVERY_WEIGHT - to - MOTHER_PRE_WEIGHT: 0.54
MOTHER_DELIVERY_WEIGHT - to - MOTHER_WEIGHT_GAIN: 0.60
MOTHER_WEIGHT_GAIN - to - MOTHER_PRE_WEIGHT: 0.65
MOTHER_WEIGHT_GAIN - to - MOTHER_DELIVERY_WEIGHT: 0.60


In [35]:
births_transformed.columns

['INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_BEFORE',
 'CIG_1_TRI',
 'CIG_2_TRI',
 'CIG_3_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_PRE_WEIGHT',
 'MOTHER_DELIVERY_WEIGHT',
 'MOTHER_WEIGHT_GAIN',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [36]:
features_to_keep = [
 'INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'MOTHER_AGE_YEARS',
 'FATHER_COMBINED_AGE',
 'CIG_1_TRI',
 'MOTHER_HEIGHT_IN',
 'MOTHER_PRE_WEIGHT',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM'
]

In [37]:
births_transformed = births_transformed.select([e for e in features_to_keep])

In [38]:
# 카테고리 피처에 대한 확인
import pyspark.mllib.linalg as ln

In [39]:
for cat in categorical_cols[1:]:
    agg = births_transformed.groupby('INFANT_ALIVE_AT_REPORT').pivot(cat).count()
    agg_rdd = agg.rdd.map(lambda row: (row[1:])).flatMap(lambda row: [0 if e == None else e for e in row]).collect()
    row_length = len(agg.collect()[0]) - 1
    agg = ln.Matrices.dense(row_length, 2, agg_rdd)
    test = st.Statistics.chiSqTest(agg)
    print(cat, round(test.pValue, 4))

BIRTH_PLACE 0.0
DIABETES_PRE 0.0
DIABETES_GEST 0.0
HYP_TENS_PRE 0.0
HYP_TENS_GEST 0.0
PREV_BIRTH_PRETERM 0.0


In [43]:
import pyspark.mllib.feature as ft
import pyspark.mllib.regression as reg
hashing = ft.HashingTF(7)
births_hashed = births_transformed.rdd.map(lambda row: [list(hashing.transform(row[1]).toArray()) 
                                                        if col == 'BIRTH_PLACE' else row[i]
                                                        for i, col in enumerate(features_to_keep)])\
                                        .map(lambda row: [[e] if type(e) == int else e for e in row])\
                                        .map(lambda row: [item for sublist in row for item in sublist])\
                                        .map(lambda row: reg.LabeledPoint(row[0], ln.Vectors.dense(row[1:])))

In [44]:
births_train, births_test = births_hashed.randomSplit([0.6, 0.4])

In [45]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
LR_Model = LogisticRegressionWithLBFGS.train(births_train, iterations=10)

In [46]:
LR_results = (
    births_test.map(lambda row: row.label).zip(LR_Model.predict(births_test.map(lambda row: row.features)))
).map(lambda row: (row[0], row[1] * 1.0))

In [None]:
LR_results.collect()

In [48]:
import pyspark.mllib.evaluation as ev
LR_evaluation = ev.BinaryClassificationMetrics(LR_results)

In [49]:
print('Area under PR: {0:.2f}'.format(LR_evaluation.areaUnderPR))

Area under PR: 0.79


In [50]:
print('Area under ROC: {0:.2f}'.format(LR_evaluation.areaUnderROC))

Area under ROC: 0.63


In [51]:
selector = ft.ChiSqSelector(4).fit(births_train)

In [52]:
topFeatues_train = (
    births_train.map(lambda row: row.label).zip(selector.transform(births_train.map(lambda row: row.features)))
).map(lambda row: reg.LabeledPoint(row[0], row[1]))

In [53]:
topFeatures_test = (
    births_test.map(lambda row: row.label).zip(selector.transform(births_test.map(lambda row: row.features)))
).map(lambda row: reg.LabeledPoint(row[0], row[1]))

In [57]:
from pyspark.mllib.tree import RandomForest
RF_model = RandomForest.trainClassifier(data=topFeatues_train, numClasses=2, categoricalFeaturesInfo={},
                                       numTrees=6, featureSubsetStrategy='all', seed=5)

trainClassifier
- 첫번째 : 트레인데이타
- 두번째 : 타깃 갯수
- 서번째 : 딕셔너리타입 (키 : 학습데이터셋 RDD의 카테고리 피처의 인덱스, 값: 각 카테고리 피처가 가지고있는 레벨 수)
- 네번째 : 랜덤포레스트 모델에서 생성할 트리 수

In [58]:
RF_results = (
    topFeatures_test.map(lambda row: row.label).zip(RF_model.predict(topFeatures_test.map(lambda row: row.features)))
)

In [59]:
RE_evaluation = ev.BinaryClassificationMetrics(RF_results)
print('Area under PR: {0:.2f}'.format(RE_evaluation.areaUnderPR))
print('Area under ROC: {0:.2f}'.format(RE_evaluation.areaUnderROC))

Area under PR: 0.77
Area under ROC: 0.62
