In [1]:
%pylab inline

from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

Populating the interactive namespace from numpy and matplotlib


In [2]:
raw_data=sc.textFile("/home/csxion/Desktop/project/data/p9_data_11_16_1q_sed.csv")
records=raw_data.map(lambda line:line.split(","))
records.first()

['가락시장역',
 '2016-01-01',
 '5',
 '0.077606673',
 '0.420556606',
 '0.414592146',
 '0.820293973',
 '0.553426589',
 '0.549782942',
 '0.99369488',
 '78']

In [3]:
subway_dict=records.map(lambda r:r[0]).distinct().zipWithIndex().collectAsMap() # distinct 꼭 사용할 것
time_dict=records.map(lambda r:r[2]).distinct().zipWithIndex().collectAsMap()
#date_dict=records.map(lambda r:r[1]).distinct().zipWithIndex().collectAsMap()

In [4]:
# r[0] : subway station name // Category feature => binary vector
# r[1] : Date(Not used)
# r[2] : Time(5~23) # Category feature => binary vector

def check_error_float(data, econd, subt): #data가 equal_cond와 같을때, subt로 치환한다.
    if(data==econd):
        return subt
    else:
        return float(data)

def extract_features_linear(r): #line별
    name_vec=np.zeros(len(subway_dict)) # 역 이름에 대하여 카테고리 특징을 바이너리 벡터 형태로 변환한다.
    name_vec[subway_dict[r[0]]]=1
    time_vec=np.zeros(len(time_dict)) # 시간 데이터에 대하여(5~23) 카테고리 특징을 바이너리 벡터 형태로 변환한다.
    time_vec[time_dict[r[2]]]=1
    #numbers=np.array([float(r[i]) for i in range(3,len(r)-1)])
    #numbers=np.array([float(e) for e in r[3:]])
    for i in range(3,len(r)-1):
        r[i]=check_error_float(r[i], "#N/A", 0.0)
    numbers=np.array([float(d) for d in r[3:len(r)-1]])
    return np.concatenate((name_vec, time_vec, numbers)) # 역 + 시간 + 특징벡터 값으로 구성되는 numpy.array를 반환한다.
#flatMap을 사용할 필요가 없음

def extract_label(r):
    return float(r[len(r)-1]) # int는 소문자로 지정해야 함
#float

def extract_features_dt(r): # dt모델에 적용할 특징벡터를 추출하는 함수. 여기서는 바이너리 벡터를 생성하지 않는다.
    name=float(subway_dict[r[0]])
    time=float(time_dict[r[2]])
    for i in range(3,len(r)-1):
        r[i]=check_error_float(r[i], "#N/A", 0.0)
    numbers=np.array([float(d) for d in r[3:len(r)-1]])
    return np.concatenate((np.array([name, time]), numbers))

# np.concatenate 함수는 꼭 이중괄호를 해주어야 index 에러가 나지 않는다.
# extract_label 함수의 경우 linearRegression Model과 DecisionTree 모두 동일하게 사용한다

def squared_error(actual, pred):
    return (actual-pred)**2
def abs_error(actual,pred):
    return np.abs(actual-pred)
def squared_log_error(actual, pred):
    return (np.log(pred+1)-np.log(actual+1))**2

### LinearRegression(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, standardization=True, solver="auto", weightCol=None, aggregationDepth=2)

- elasticNetParam = Param(parent='undefined', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.')

- maxIter = Param(parent='undefined', name='maxIter', doc='max number of iterations (>= 0).')
- regParam = Param(parent='undefined', name='regParam', doc='regularization parameter (>= 0).')

###### https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.regression.LinearRegression

### Regularization
- none (a.k.a. ordinary least squares)
- L2 (ridge regression)
- L1 (Lasso)
- L2 + L1 (elastic net)

In [5]:
#df에 대한 내용을 정리한다.
df=spark.createDataFrame(records.map(lambda line:(extract_label(line), Vectors.dense(extract_features_linear(line))))).toDF("label", "features")
(train_df, test_df) = df.randomSplit([0.7, 0.3]) #training Data와 Test Data를 7 : 3의 비율로 나눈다
df.cache
train_df.cache
test_df.cache

<bound method DataFrame.cache of DataFrame[label: double, features: vector]>

### Evaluation

#### 현재 함수 실행시, 정확한 값을 예측하지 못함(음수출력)
따라서 rmsle와 같은 log 참조 값을 사용할때 제대로 된 값을 사용하지 못함

df.rdd를 사용하면 dataframe에서 RDD로 객체를 전환시킬 수 있음
#### 하지만 RDD를 사용하게 되면 MemoryStore에서 메모리부족으로 에러가 남. 다른 방법을 생각해야할 듯

In [6]:
#LinearRegression 모델 테스트 메소드, 무슨 문제인지는 몰라도 제대로 예측하지 못한다.
def evaluate_lg(train, test, miter, rparam, enparam): # exist bugs
    linear = LinearRegression(maxIter=miter, regParam=rparam, elasticNetParam=enparam)
    model=linear.fit(train) #train_df로 학습한다.
    tp=model.transform(test.sample(True, 0.2, 42)).select('label','prediction').collect()
    rmse=np.sqrt(np.mean([squared_error(i[0],i[1]) for i in tp])) # rmsle -> rmse로 변경
    return rmse

    #model=LinearRegressionWithSGD.train(train, iterations, step, regParam=regParam, regType=regType, intercept=intercept)
    #lr2=LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    #tp_rdd=model.transform(test_df).select('label', 'prediction').rdd # 함수 실행시키는게 rdd와 다름. map이라는 함수가 없으니
    #rmsle=np.sqrt(tp_rdd.map(lambda p:squared_log_error(p['label'], p['prediction'])).mean())

### 각 case별 실행 결과 비교
1) LinearRegression.set..
2) LinearRegression(...)
3) evaluate_lg(...)

###### ∴ 1,2번간에는 차이가 없지만, 3번에서는 예측을 다르게 함(음수값이 포함됨)

In [31]:
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8) # LinearRegression 모델의 parameter를 설정한다.
lrModel=lr.fit(df)

#lr2=LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
#lrModel2=lr2.fit(df)

t=lrModel.transform(test_df).select('label', 'prediction').collect() # 20%의 데이터, 42는 random seed value
#t2=lrModel.transform(df.sample(True, 0.2, 42)).collect()
#.transform 메소드를 활용하여 데이터를 테스트 할 수 있다.

#lrModel.transform(df.sample(True, 0.2, 42)).select('label', 'prediction').take(20)
# RDD보다 비교적 정확한 데이터를 확인할 수 있었다.

print(np.sqrt(np.mean([squared_log_error(i[0],i[1]) for i in t])))
#print(np.sqrt(np.mean([squared_log_error(i[0],i[1]) for i in t2])))
#print(np.sqrt(np.mean([squared_log_error(i[0],i[1]) for i in t3])))
#np.sqrt(np.mean([squared_log_error(i[0],i[1]) for i in a]))
#evaluate함수를 사용하면 에러가 발생하는 것을 확인. 예측값이 다른듯

#t3=evaluate_lg(df, df, 10, 0.3, 0.8)

#이렇게 실행하면 무조건 에러가 나지 않음
# select('label', 'prediction') 문제인것 같은데?

lrModel.transform(test_df).take(30)



nan


[Row(label=2.0, features=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0816, 0.4206, 0.6035, 0.8203, 0.5534, 0.5498, 0.9937]), prediction=-2606.692900862671),
 Row(label=2.0, features=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

#### * paramMap을 활용하여 여러가지 코드를 테스트 하도록 한다.

In [40]:
paramMap = {lr.maxIter: 10, lr.regParam:0.1, lr.elasticNetParam:0.8} # ParamMap이라는 매개변수 구조체로 변수를 넘겨줄 수 있다.
#step size의 경우 RDD에서만 사용하는듯, ml에서는 관련 정보가 없음

#paramMap[lr.maxIter] = 100  # 특정 변수를 지정하여 넘길 수 있다.

lrModel2=lr.fit(df, paramMap) # lr2Model의 경우 변경된 paramter에 대해 학습을 진행한다.
lrModel2.transform(test_df).select('label', 'prediction').collect()

# Refer : https://spark.apache.org/docs/latest/ml-classification-regression.html#regression
# label이 작은 값일수록 오차가 심함

[Row(label=2.0, prediction=-3018.3762655928995),
 Row(label=4.0, prediction=-2517.8672891613596),
 Row(label=4.0, prediction=-2600.8536036542646),
 Row(label=4.0, prediction=-2098.6745691599945),
 Row(label=4.0, prediction=-2751.9464253093593),
 Row(label=5.0, prediction=-2682.836437584664),
 Row(label=7.0, prediction=-2982.1895284928614),
 Row(label=7.0, prediction=-2592.222229113144),
 Row(label=8.0, prediction=-2245.2978656518635),
 Row(label=8.0, prediction=-2504.1273631872127),
 Row(label=9.0, prediction=-2620.8392786903105),
 Row(label=10.0, prediction=-2494.9491433764924),
 Row(label=10.0, prediction=-2212.833743780263),
 Row(label=10.0, prediction=-2258.249607514571),
 Row(label=10.0, prediction=-2760.586053183143),
 Row(label=10.0, prediction=-2854.16500028688),
 Row(label=11.0, prediction=-2588.9247993911786),
 Row(label=11.0, prediction=-1535.8590097390907),
 Row(label=12.0, prediction=-1825.1923286620959),
 Row(label=12.0, prediction=-925.4424820530339),
 Row(label=12.0, pr

In [7]:
# regParam의 변화에 따른 값 측정

params=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
metrics=[]
lr = LinearRegression()
paramMap = [{lr.maxIter: 10, lr.regParam:param, lr.elasticNetParam:0.8} for param in params]
##[print(p) for p in paramMap] # paramMap의 형태와 paramMap[dict]의 값 확인
#paramMap : list
#paramMap[n]: dict

for p in paramMap: # 함수와 유사하게 구성, 제대로 동작할지는..
    model=lr.fit(df, p) # 트레이닝 할 집합과 paramMap(파라미터 집합)을 Input
    model.transform(test_df).select('label','prediction').collect()
    tp=model.transform(test_df.sample(True, 0.2, 42)).select('label','prediction').collect()
    metrics.append(np.sqrt(np.mean([squared_error(i[0],i[1]) for i in tp])))
    
print(metrics)

# 이역시도 RuntimeWarning이 발생함.
# 변수 반복사용으로 인한 에러인지는 확인을 해봐야할 듯

[1656.6632121550465, 1656.6605520807889, 1656.5809925465719, 1655.5645071550921, 1655.4356406464392, 1655.021041031566, 1655.6471506604139, 1655.6939205569563, 1655.5738757729343]


In [30]:
lr_test=LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
model=lr_test.fit(df)
tp=model.transform(test_df).select('label','prediction').collect()
print(np.sqrt(np.mean([squared_log_error(i[0],i[1]) for i in tp])))

#여기서도 에러가 나는 것 보면, df-test_df로 이어지는 구문이 문제인건지..
# paramMap을 사용하면 가상 변수로 자동 초기화 되는 그런게 있던건지.. set으로 한번 머신 트레이닝 해보기
# 근데 이게 되면 evaluate 함수가 에러가 나면 안되는거 아닌가..
#nan... 뭐냐 이거

lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8) # LinearRegression 모델의 parameter를 설정한다.
lrModel=lr.fit(df)
t=lrModel.transform(test_df).collect() 
print(np.sqrt(np.mean([squared_log_error(i[0],i[1]) for i in t])))

nan




In [None]:
#LinearRegression
##regParam에 따른 변화값 측정
params=[0.1, 0.2, 0.3, 0.4]
metrics=[evaluate_lg(train_df, test_df, 10, param, 0.8) for param in params]
print(params)
print(metrics)
plot(params, metrics)
fig=matplotlib.pyplot.gcf()

#제대로 된 값이 출력이 안되고, nan값만 출력됨

In [20]:
#DecisionTree 모델 테스트 메소드
def evaluate_dt(train, test, maxDepth, maxBins):
    model=DecisionTree.trainRegressor(train, {}, impurity='variance', maxDepth=maxDepth, maxBins=maxBins)
    preds=model.predict(test.map(lambda p:p.features))
    actual=test.map(lambda p:p.label)
    tp=actual.zip(preds)
    rmsle=np.sqrt(tp.map(lambda p:squared_log_error(p[0], p[1])).mean())
    return rmsle

#depth=32로 한뒤, bins의 최적화 여부를 판단한다.
params=[1,2,3,4,5,10,20]
metrics=[evaluate_dt(train_data_dt, test_data_dt, param, 32) for param in params]
print(params)
print(metrics)
plot(params,metrics)
fig=matplotlib.pyplot.gcf()

''

### GLR(GeneralizedLinearRegression) Model 

refer : http://terms.naver.com/entry.nhn?docId=1924378&cid=42125&categoryId=42125

### 일반화 선형모형

- 종속변수와 독립변수간의 선형결합으로 표현하였으나, 종속변수를 적절한 함수로 변화시킨 f(x)와 독립변수의 선형결합으로 모형화한다.
- 일반선형모형에서 종속변수 y를 f(x)라는 함수로 치환한 형태를 의미
- 일반선형모형에서는 최소제곱법으로 연속형 변수사이의 회귀식을 추정한다.
하지만 실제 데이터에서는 연속적인 데이터가 없는경우가 많음

참고 : http://dermabae.tistory.com/187

In [27]:
# GLR(GeneralizedLinearRegression) Model 테스트 하기
# http://terms.naver.com/entry.nhn?docId=1924378&cid=42125&categoryId=42125

from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit the model
model=glr.fit(df)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

# Summarize the model over the training set and print out some metrics
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues)) # 두 값의 차이를 표본오차로 나눈 값(=두 값의 차이는 표준 오차의 몇 배인가)
#T value의 절대값이 커지게 되면, 해당 샘플과의 모집단 자체가 많이 틀어짐을 의미함
print("P Values: " + str(summary.pValues))
#귀무가설이 맞다는 전제 하에, 통계값이 실제로 관측된 값 이상일 확률
print("Dispersion: " + str(summary.dispersion)) #분산
print("Null Deviance: " + str(summary.nullDeviance)) # The null deviance shows how well the response is predicted by the model with nothing but an intercept.
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
summary.residuals().show()

model.transform(testData).map(lambda d:(d['label'], d['prediction']).collect()

Coefficients: [-2340.84973224,-102.071197864,-1185.4469747,-656.697476055,-1374.67191933,-81.0970801758,185.302367084,-925.00862181,772.879166214,3930.43861945,2469.75834729,-2129.77393692,-404.757381634,3797.51838649,-1818.01174074,4829.60969311,2213.75458749,1990.62104646,-274.025418591,-490.641177289,-1388.01716045,-1238.18638721,5366.15470376,-1737.66383046,1468.97082819,-1723.53938985,1813.05275766,-1620.8965715,-1757.70441692,2469.49783957,2563.30133011,100.181931213,-456.131012434,-1967.81666097,-1721.64132694,711.952608128,-892.370287695,-748.850665958,-1491.92531431,-1620.16830009,584.850256696,1914.49136587,-1143.90233173,-2411.94015905,5602.90905205,1365.95600377,708.286176002,42.6657373839,778.064991873,1256.2198431,-590.151232409,-450.839192364,2724.98854883,8598.44238621,544.585166214,608.673561971,1263.11617085,845.656913032,1205.97262702,-2290.99957174,-250.913469023,-1255.78033807,-2231.36796086,-1557.13439572,-738.228342986,-1067.53830521,-560.572372837,1671.51006812,

[Row(label=4.0, features=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7035, 0.492, 0.1992, 0.7756, 0.7587, 0.7787, 0.7724]), prediction=-1892.9471314037269),
 Row(label=4.0, features=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [30]:
model.transform(testData).select('label', 'prediction').collect()
# dataframe의 경우 map 함수가 아닌 select함수를 이용하여 거른다.

[Row(label=4.0, prediction=-1892.9471314037269),
 Row(label=4.0, prediction=-2391.265787517062),
 Row(label=6.0, prediction=-2340.8048242557356),
 Row(label=8.0, prediction=-2064.370819500888),
 Row(label=9.0, prediction=-1889.526747209526),
 Row(label=10.0, prediction=-2226.67939843861),
 Row(label=11.0, prediction=-2259.3218281877903),
 Row(label=11.0, prediction=-2201.74428718639),
 Row(label=13.0, prediction=-1797.3709669983796),
 Row(label=14.0, prediction=-1523.0622739738142),
 Row(label=15.0, prediction=-2471.707389878165),
 Row(label=16.0, prediction=-1857.808335626773),
 Row(label=17.0, prediction=-386.5179775995507),
 Row(label=18.0, prediction=-1314.8506711050554),
 Row(label=18.0, prediction=-1764.058515857378),
 Row(label=19.0, prediction=356.0686320824175),
 Row(label=19.0, prediction=356.0686320824175),
 Row(label=20.0, prediction=-881.7613744029964),
 Row(label=20.0, prediction=-1274.4980869269352),
 Row(label=20.0, prediction=-1505.6738285830597),
 Row(label=20.0, pred

In [None]:
# DecisionTree
# VectorIndexer에 대해 알아봐야할듯 
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Load the data stored in LIBSVM format as a DataFrame.
data = spark.read.format("libsvm").load("/usr/local/spark/data/mllib/sample_libsvm_data.txt")

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="features")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

treeModel = model.stages[1]
# summary only
print(treeModel)

In [None]:
trainingData.take(3)

In [32]:
#VectorIndexer test code
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

vi = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),),(Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"])
indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed")
model = indexer.fit(vi)
model.transform(vi).head().indexed
model.numFeatures

2

model.categoryMaps #무언가 카테고리를 나누려 했음. 

In [None]:
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(df) # indexedFeatures라는 새로운 열 생성
dt = DecisionTreeRegressor(featuresCol="features")
pipeline = Pipeline(stages=[featureIndexer, dt])
#https://spark.apache.org/docs/latest/ml-pipeline.html : 파이프라인에 대한 설명
model = pipeline.fit(trainingData)
# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

treeModel = model.stages[1]
# summary only
print(treeModel)

In [None]:
trainingData.take(5)

In [None]:
testData.take(10)

In [None]:
trainingData.take(78)

In [None]:
df.take(78)