# Project(9) : Machine Learning with Spark

## 2. Loading RDD Model and Testing
---
### 1) Previous Settings
- Import some Library
- Extract Functions
- Load raw data

### 2) Model Load and Test
- Load Model
- Test Value
---

## 1) Previous Settings
- Import some Library
- Extract Functions
- Load raw data
---

### 1-1) Import some Library

In [13]:
%pylab inline
import numpy as np
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree

# for loading model
from pyspark.mllib.regression import LinearRegressionModel
from pyspark.mllib.tree import DecisionTreeModel

Populating the interactive namespace from numpy and matplotlib


### 1-2) Extract Function

In [14]:
# r[0] : subway station name // Category feature => binary vector
# r[1] : Date(Not used)
# r[2] : Time(5~23) # Category feature => binary vector

def check_error_float(data, econd, subt): #data가 equal_cond와 같을때, subt로 치환한다.
    if(data==econd):
        return subt
    else:
        return float(data)

def extract_features_linear(r): #line별
    name_vec=np.zeros(len(subway_dict)) # 역 이름에 대하여 카테고리 특징을 바이너리 벡터 형태로 변환한다.
    name_vec[subway_dict[r[0]]]=1
    time_vec=np.zeros(len(time_dict)) # 시간 데이터에 대하여(5~23) 카테고리 특징을 바이너리 벡터 형태로 변환한다.
    time_vec[time_dict[r[2]]]=1
    #numbers=np.array([float(r[i]) for i in range(3,len(r)-1)])
    #numbers=np.array([float(e) for e in r[3:]])
    for i in range(3,len(r)-1):
        r[i]=check_error_float(r[i], "#N/A", 0.0)
    numbers=np.array([float(d) for d in r[3:len(r)-1]])
    return np.concatenate((name_vec, time_vec, numbers)) # 역 + 시간 + 특징벡터 값으로 구성되는 numpy.array를 반환한다.
#flatMap을 사용할 필요가 없음

def extract_label(r):
    return float(r[len(r)-1]) # int는 소문자로 지정해야 함
#float

def extract_features_dt(r): # dt모델에 적용할 특징벡터를 추출하는 함수. 여기서는 바이너리 벡터를 생성하지 않는다.
    name=float(subway_dict[r[0]])
    time=float(time_dict[r[2]])
    for i in range(3,len(r)-1):
        r[i]=check_error_float(r[i], "#N/A", 0.0)
    numbers=np.array([float(d) for d in r[3:len(r)-1]])
    return np.concatenate((np.array([name, time]), numbers))

# np.concatenate 함수는 꼭 이중괄호를 해주어야 index 에러가 나지 않는다.
# extract_label 함수의 경우 linearRegression Model과 DecisionTree 모두 동일하게 사용한다



<bound method RDD.cache of PythonRDD[84] at RDD at PythonRDD.scala:48>

### 1-3) Load raw Data

In [15]:
raw_data=sc.textFile("/home/csxion/Desktop/project/data/p9_data_11_16_2q_sed.csv")
#raw_data=sc.textFile("/home/csxion/Desktop/project/data/p9_data_11_*")
records=raw_data.map(lambda line:line.split(","))
subway_dict=records.map(lambda r:r[0]).distinct().zipWithIndex().collectAsMap() # distinct 꼭 사용할 것
time_dict=records.map(lambda r:r[2]).distinct().zipWithIndex().collectAsMap()

data=records.map(lambda r:LabeledPoint(extract_label(r), Vectors.dense(np.array(extract_features_linear(r)))))
data.cache
data_dt=records.map(lambda r:LabeledPoint(extract_label(r), Vectors.dense(extract_features_dt(r))))
data_dt.cache
data.first()

LabeledPoint(124.0, [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357227455,0.420556606,0.072329177,0.08771192,0.127294678,0.119684128,0.131010076])

In [16]:
data_dt.first()

LabeledPoint(124.0, [39.0,12.0,0.357227455,0.420556606,0.072329177,0.08771192,0.127294678,0.119684128,0.131010076])

## 2) Model Load and Test
- Model load
- Test Value
---

### 2-1) Model load

In [17]:
#model loading
linear_model = LinearRegressionModel.load(sc, "/home/csxion/Desktop/project/code/model/model_01_RDD_LinearRegressionModel")
model_dt=DecisionTreeModel.load(sc, "/home/csxion/Desktop/project/code/model/model_01_RDD_DecistionTreeModel")

### 2-2) Test Value

In [18]:
true_vs_predicted=data.map(lambda p:(p.label, linear_model.predict(p.features)))
true_vs_predicted.take(100)

[(124.0, 1016.3322457054286),
 (535.0, 1108.0368036826742),
 (1447.0, 1115.2961900655287),
 (2356.0, 1336.610459397742),
 (1250.0, 1205.2968261718579),
 (980.0, 1108.8287060324021),
 (957.0, 1201.5069001960933),
 (914.0, 1256.2110298343514),
 (980.0, 1368.4288035344528),
 (988.0, 1387.0256287896595),
 (1039.0, 1473.0377363431921),
 (1231.0, 1426.3626690739438),
 (1311.0, 1503.1199293754057),
 (2114.0, 1460.1403184465667),
 (1652.0, 1412.124302394036),
 (980.0, 1371.9688448968095),
 (880.0, 1308.6209361579199),
 (887.0, 1105.1122296708481),
 (538.0, 1071.7379078260662),
 (977.0, 1072.0749766345386),
 (3476.0, 1090.6556711039304),
 (9690.0, 1134.9402205436445),
 (18569.0, 1217.6149242246108),
 (15013.0, 1180.1796158952839),
 (8528.0, 1165.583124220095),
 (7861.0, 1257.8693240256816),
 (8149.0, 1271.5843634395023),
 (10348.0, 1283.3601378849253),
 (10077.0, 1388.1200660768382),
 (11074.0, 1429.8500145944545),
 (11979.0, 1444.5889925016272),
 (15073.0, 1383.093076651347),
 (25783.0, 1387.9

In [19]:
preds=model_dt.predict(data.map(lambda d:d.features))
actual=data_dt.map(lambda d:d.label)
true_vs_predicted_dt=actual.zip(preds)
true_vs_predicted_dt.take(20)

# 너무나 당연하게도, 제대로 예측하지 못함. 표준 분포 자체는 전체 데이터를 기준으로 정규화 되어있는 반면에, 트레이닝 한 데이터와 테스트 데이터 간의 이격이 크다고 판단됨

[(124.0, 118.02083333333333),
 (535.0, 118.02083333333333),
 (1447.0, 118.02083333333333),
 (2356.0, 118.02083333333333),
 (1250.0, 118.02083333333333),
 (980.0, 118.02083333333333),
 (957.0, 118.02083333333333),
 (914.0, 118.02083333333333),
 (980.0, 118.02083333333333),
 (988.0, 118.02083333333333),
 (1039.0, 118.02083333333333),
 (1231.0, 118.02083333333333),
 (1311.0, 118.02083333333333),
 (2114.0, 118.02083333333333),
 (1652.0, 118.02083333333333),
 (980.0, 118.02083333333333),
 (880.0, 118.02083333333333),
 (887.0, 118.02083333333333),
 (538.0, 118.02083333333333),
 (977.0, 118.02083333333333)]

# Conclusion

- Training된 모델의 Data(1/4분기 학습)와 Test Data(2/4분기)에서 다루는 데이터의 범위가 일정량 겹치기는 하지만, 전체적으로 범위 내에 들어오지 않음.
- 따라서 모델에 비해 새로들어오는 데이터는 '예상치 못한 데이터'로 간주, 정확한 값을 리턴할 수가 없음
- 모델 저장 및 로드 기능에 대해 확인해보았지만, 실제 사용을 할 수는 없었음(데이터 범위 이격에 따른)