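# 운행거리에 따른 택시 요금 예측 모델 만들기
데이터 출처 : https://www.kaggle.com/datasets/madhuraatmarambhagat/crop-recommendation-dataset?select=Crop_recommendation.csv (주의: 이 링크는 작물 추천 데이터셋이며, 이 노트북에서 실제로 읽는 데이터는 `learning_spark_data/trips/*.csv`의 택시 운행 기록이므로 출처 확인 필요)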

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("taxi-fare-prediction")
    .getOrCreate()
)

In [3]:
import os
# Build a glob pattern that matches every trip CSV below the current
# working directory.
cwd = os.getcwd()
_trip_path_parts = (cwd, 'learning_spark_data/trips', '*.csv')
trip_data_path = os.path.join(*_trip_path_parts)
trip_data_path

'/home/jovyan/work/start_spark/learning_spark_data/trips/*.csv'

In [7]:
# Turn the local glob path into a file: URI for Spark.
# An absolute POSIX path already begins with "/", so prefixing "file:///"
# would produce four slashes ("file:////home/..."). Use "file://" in that
# case and keep "file:///" for paths without a leading slash
# (e.g. a Windows drive path such as "C:/...").
_normalized_path = trip_data_path.replace(os.sep, '/')
_uri_prefix = "file://" if _normalized_path.startswith('/') else "file:///"
file_path = f"{_uri_prefix}{_normalized_path}"
file_path

'file:///home/jovyan/work/start_spark/learning_spark_data/trips/*.csv'

In [29]:
# Load every CSV matched by file_path, treating the first row as the header
# and letting Spark infer column types, then print the resulting schema.
csv_reader = spark.read
trip_df = csv_reader.csv(file_path, header=True, inferSchema=True)
trip_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [None]:
# 운행거리에 따른 요금 예측

In [30]:
# Expose the loaded trips DataFrame to Spark SQL under the view name "trips".
trip_df.createOrReplaceTempView("trips")

In [31]:
# Select the two modelling columns and keep only rows that pass the filters:
# positive total_amount below 5000, positive trip_distance below 500,
# fewer than 4 passengers, and pickup dates from 2021-01-01 up to (but not
# including) 2021-08-01.
# Date strings use single quotes (standard SQL string literals) so the query
# still parses when double-quoted identifiers are enabled.
query = """
SELECT
    trip_distance,
    total_amount
FROM trips
WHERE total_amount < 5000
  AND total_amount > 0
  AND trip_distance > 0
  AND trip_distance < 500
  AND passenger_count < 4
  AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
  AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""

In [32]:
# Run the filtering query and register its result as the "data" view.
# Note: this rebinds trip_df from the loaded trips to the query result.
trip_df = spark.sql(query)
trip_df.createOrReplaceTempView("data")

In [33]:
# Peek at the first five rows of the "data" view.
query = """
SELECT *
FROM data
LIMIT 5
"""

preview_df = spark.sql(query)
preview_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|         16.5|       70.07|
|         1.13|       11.16|
|         2.68|       18.59|
|         12.4|        43.8|
|          9.7|        32.3|
+-------------+------------+



In [45]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

In [None]:
#train,test split 8:2 seed=1

In [36]:
# Split the filtered trips 80/20 into training and test sets, with seed 1.
split_weights = [0.8, 0.2]
train_data, test_data = trip_df.randomSplit(split_weights, seed=1)

In [None]:
#vectorassembler > features : trip_distance, target:total_amount

In [42]:
# Pack the single predictor column (trip_distance) into a "features" vector
# column, apply it to the training split, and show the first five rows.
vassembler = VectorAssembler(inputCols=['trip_distance'], outputCol='features')
vtrain_data = vassembler.transform(train_data)
vtrain_data.show(5)

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|        3.05|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
only showing top 5 rows



In [None]:
# Create LinearRegression: maxIter=50, labelCol='total_amount', featuresCol='features'
# fit on the assembled training data
# vassembler.transform(test_data)
# model.transform on the assembled test data

In [47]:
# Configure a linear regression that learns total_amount from the assembled
# "features" vector, with at most 50 iterations.
lr = LinearRegression(
    maxIter=50,
    featuresCol='features',
    labelCol='total_amount',
)
# Fit on the training split (about 10 million rows, per the original note).
lr_model = lr.fit(vtrain_data)
# Assemble the test split with the same assembler, then score it
# (about 2 million rows, per the original note).
vtest_data = vassembler.transform(test_data)
pred = lr_model.transform(vtest_data)

In [49]:
# Display the first five scored test rows (label next to prediction).
pred.show(n=5)

+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
+-------------+------------+--------+-----------------+
only showing top 5 rows



In [50]:
# RMSE read from the fitted model's summary, i.e. measured on the data that
# was passed to fit(), not on test_data.
# NOTE(review): for a held-out figure consider lr_model.evaluate(vtest_data).
fit_summary = lr_model.summary
fit_summary.rootMeanSquaredError

6.30781413196623

In [51]:
# R^2 read from the fitted model's summary, i.e. measured on the data that
# was passed to fit(), not on test_data.
fit_summary = lr_model.summary
fit_summary.r2

0.7648633777017714

In [None]:
#새로운 데이터로 예측하기

In [56]:
from pyspark.sql.types import DoubleType
# Hand-picked trip distances to score with the fitted model.
new_distance_list = [1.1, 5.4, 10.2, 30.0]
# Build a one-column DataFrame of doubles, then name the column
# 'trip_distance' so it matches the assembler's input column.
_unnamed_distance_df = spark.createDataFrame(new_distance_list, DoubleType())
distanse_df = _unnamed_distance_df.toDF('trip_distance')
distanse_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.4|
|         10.2|
|         30.0|
+-------------+



In [57]:
# Assemble the new distances into feature vectors, score them with the
# fitted model, and display the predictions.
vdistance_df = vassembler.transform(distanse_df)
new_pred = lr_model.transform(vdistance_df)
new_pred.show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.672809485363317|
|          5.4|   [5.4]|25.463805432351194|
|         10.2|  [10.2]| 39.74212648945393|
|         30.0|  [30.0]| 98.64020085000274|
+-------------+--------+------------------+



In [58]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()