In [1]:
from pyspark.sql import SparkSession

# Memory setting applied to both the executors and the driver.
MAX_MEMORY = "5g"

# Create (or reuse) the Spark session; a parenthesised chain replaces the
# backslash line continuations.
spark = (
    SparkSession.builder
    .appName("taxi-fare-prediciton")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/14 04:01:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Directory containing the "train/" and "test/" parquet datasets.
data_dir = "/home/ubuntu/working/spark-examples/data/ml-data"

# Read both splits with the same reader call, train first, then test.
train_df, test_df = (
    spark.read.parquet(f"{data_dir}/{split}/") for split in ("train", "test")
)

                                                                                

In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Columns that will be one-hot encoded.
cat_features = [
    "pickup_location_id",
    "dropoff_location_id",
    "day_of_week"
]

# Pipeline stages are accumulated in this list; later cells append to it.
stages = []

for col_name in cat_features:
    idx_col = col_name + "_idx"

    # 1. Index the category values into "<col>_idx". handleInvalid="keep"
    #    controls how invalid values (e.g. nulls / unseen labels) are treated:
    #    they are kept rather than raising an error.
    indexer = StringIndexer(inputCol=col_name, outputCol=idx_col, handleInvalid="keep")

    # 2. One-hot encode the index column into "<col>_onehot".
    encoder = OneHotEncoder(inputCols=[idx_col], outputCols=[col_name + "_onehot"])

    stages.extend([indexer, encoder])

stages

[StringIndexer_de08295cb5e6,
 OneHotEncoder_8018e6f479ff,
 StringIndexer_5f4856e73326,
 OneHotEncoder_a207b9604308,
 StringIndexer_0d28339e1487,
 OneHotEncoder_b475eadf11aa]

In [5]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric columns that will be standard-scaled.
num_features = [
    "passenger_count",
    "trip_distance",
    "pickup_time"
]

# NOTE: this cell appends to `stages`; re-running it without first re-running
# the cell that resets `stages` adds duplicate stages.
for col_name in num_features:
    vec_col = col_name + "_vector"

    # Wrap the single column in a vector column, e.g. 1.5 -> [1.5].
    to_vector = VectorAssembler(inputCols=[col_name], outputCol=vec_col)

    # Standard-scale the vector column into "<col>_scaled".
    scaler = StandardScaler(inputCol=vec_col, outputCol=col_name + "_scaled")

    stages.extend([to_vector, scaler])

stages

[StringIndexer_de08295cb5e6,
 OneHotEncoder_8018e6f479ff,
 StringIndexer_5f4856e73326,
 OneHotEncoder_a207b9604308,
 StringIndexer_0d28339e1487,
 OneHotEncoder_b475eadf11aa,
 VectorAssembler_1ce7a312e5b2,
 StandardScaler_85efd0b3e101,
 VectorAssembler_1056f4eb0b0e,
 StandardScaler_95be13a4af89,
 VectorAssembler_92aebea70070,
 StandardScaler_9e54d6baffa7]

In [7]:
# Only the "*_onehot" and "*_scaled" columns feed the final feature vector:
# categorical outputs first, then the scaled numeric outputs.
assembler_inputs = [f"{c}_onehot" for c in cat_features]
assembler_inputs.extend(f"{n}_scaled" for n in num_features)

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="feature_vector")
stages.append(assembler)

stages

[StringIndexer_de08295cb5e6,
 OneHotEncoder_8018e6f479ff,
 StringIndexer_5f4856e73326,
 OneHotEncoder_a207b9604308,
 StringIndexer_0d28339e1487,
 OneHotEncoder_b475eadf11aa,
 VectorAssembler_1ce7a312e5b2,
 StandardScaler_85efd0b3e101,
 VectorAssembler_1056f4eb0b0e,
 StandardScaler_95be13a4af89,
 VectorAssembler_92aebea70070,
 StandardScaler_9e54d6baffa7,
 VectorAssembler_dbe44b238708]

# 하이퍼 파라미터 튜닝

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression # Ridge, Lasso가 없고, EleasticNet을 포함

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

In [9]:
# Base regressor used during hyperparameter search.
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=30,
)

# Preprocessing stages followed by the regressor, combined into one stage list.
cv_stages = [*stages, lr]

In [11]:
# Build the pipeline used for cross-validation from `cv_stages`
# (the preprocessing stages plus the LinearRegression estimator).
cv_pipeline = Pipeline(stages=cv_stages)

## GridSearch 및 CrossValidation 설정

In [13]:
# 5 x 5 grid over the ElasticNet mixing parameter and the regularization strength.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])
    .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])
    .build()
)
param_grid

[{Param(parent='LinearRegression_a5c2d036fbaf', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1,
  Param(parent='LinearRegression_a5c2d036fbaf', name='regParam', doc='regularization parameter (>= 0).'): 0.01},
 {Param(parent='LinearRegression_a5c2d036fbaf', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1,
  Param(parent='LinearRegression_a5c2d036fbaf', name='regParam', doc='regularization parameter (>= 0).'): 0.02},
 {Param(parent='LinearRegression_a5c2d036fbaf', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1,
  Param(parent='LinearRegression_a5c2d036fbaf', name='regParam', doc='regularization parameter (>= 0).'): 0.03},
 {Param(paren

In [14]:
# 5-fold cross-validation over `param_grid`.
# - estimator: the full pipeline; its final stage is the regressor, so the
#   fitted pipeline produces the predictions the evaluator scores.
# - estimatorParamMaps: the grid to search; without it only plain
#   cross-validation (no grid search) would be performed.
# - evaluator: RMSE against "total_amount" ("rmse" is RegressionEvaluator's
#   default metric, spelled out here so the selection criterion is visible).
# - seed: fixes the random fold assignment so the selected hyperparameters are
#   reproducible across runs (same seed value as the sampling cell).
cross_val = CrossValidator(
    estimator=cv_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator(labelCol="total_amount", metricName="rmse"),
    numFolds=5,
    seed=1,
)

cross_val

CrossValidator_84fd6d9a84d8

## 훈련

In [15]:
# Draw a 10% sample (without replacement, fixed seed) of the training data;
# running the search on the full dataset takes a long time.
toy_df = train_df.sample(withReplacement=False, fraction=0.1, seed=1)
toy_df.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [16]:
# Run the cross-validated grid search on the sampled training data.
cv_model = cross_val.fit(toy_df)

23/06/14 04:22:37 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/06/14 04:22:37 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/06/14 04:22:38 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/06/14 04:22:38 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

## BestModel 찾기

In [17]:
# Best model selected by the cross-validator.
best_model = cv_model.bestModel

## Best Parameter 찾기

In [18]:
# The last stage of the best pipeline is the fitted linear regression model.
# Read the selected hyperparameters through the public param getters rather
# than the private `_java_obj` handle (fitted models expose these getters
# in Spark 3.x).
best_lr_model = cv_model.bestModel.stages[-1]
best_alpha = best_lr_model.getElasticNetParam()
best_reg_param = best_lr_model.getRegParam()

In [19]:
# Display the selected (elasticNetParam, regParam) pair.
best_alpha, best_reg_param

(0.1, 0.04)

# 전체 데이터를 대상으로 훈련

In [20]:
# Fit the preprocessing stages on the full training set.
pipeline = Pipeline(stages=stages) # preprocessing-only pipeline, without the regression model
fitted_transformer = pipeline.fit(train_df)

                                                                                

In [21]:
# Apply the fitted preprocessing pipeline to the full training set.
vec_train_df = fitted_transformer.transform(train_df)

In [22]:
# Final regressor configured with the best hyperparameters found by the search.
# NOTE: this rebinds `lr`, the name the earlier search cells also use.
# NOTE(review): the search estimator used maxIter=30 while this one uses 50 -
# confirm the difference is intentional.
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=50,
    regParam=best_reg_param,
    elasticNetParam=best_alpha,
)

In [23]:
# Train the final regressor on the preprocessed full training set.
model = lr.fit(vec_train_df)

                                                                                

# 튜닝된 모델 저장 및 불러오기

In [31]:
# Directory the fitted model is written to.
model_dir = "/home/ubuntu/working/spark-examples/taxi_pricing_model"

# Saving goes through the JVM, so this cell must run while the Spark session is
# still active, i.e. before `spark.stop()`. Running it after the session was
# stopped fails with "'NoneType' object has no attribute '_jvm'".
# `overwrite()` lets the cell be re-run even when `model_dir` already exists;
# a plain `save()` raises if the path is present.
model.write().overwrite().save(model_dir)

AttributeError: 'NoneType' object has no attribute '_jvm'

In [None]:
# Load the saved model.
# What was saved is the *fitted* model returned by `lr.fit(...)`, so it has to
# be loaded with the model class (LinearRegressionModel), not with the
# LinearRegression estimator class.
from pyspark.ml.regression import LinearRegressionModel
loaded_model = LinearRegressionModel.load(model_dir)

In [24]:
# Stop the Spark session. Run this last: cells that need Spark (such as saving
# or loading the model) must be executed before the session is stopped.
spark.stop()