# 조건별 작물 추천 모델링 만들기
데이터 출처 : https://www.kaggle.com/datasets/madhuraatmarambhagat/crop-recommendation-dataset?select=Crop_recommendation.csv

In [1]:
from pyspark.sql import SparkSession

# Start (or attach to) the Spark session used by every cell in this notebook.
session_builder = SparkSession.builder.appName("miniproject1")
spark = session_builder.getOrCreate()

In [62]:
# Location of the crop recommendation CSV, given as a local-filesystem URI.
file_path = "file:////home/jovyan/work/start_spark/learning_spark_data/Crop_recommendation.csv"
file_path

'file:////home/jovyan/work/start_spark/learning_spark_data/Crop_recommendation.csv'

In [80]:
# Load the CSV using its header row for column names and letting Spark infer
# the column types, then display the resulting schema.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(file_path)
df.printSchema()

root
 |-- N: integer (nullable = true)
 |-- P: integer (nullable = true)
 |-- K: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- ph: double (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- label: string (nullable = true)



In [81]:
# Preview the first five rows of the loaded dataset.
df.show(5)

+---+---+---+-----------+-----------+-----------------+-----------+-----+
|  N|  P|  K|temperature|   humidity|               ph|   rainfall|label|
+---+---+---+-----------+-----------+-----------------+-----------+-----+
| 90| 42| 43|20.87974371|82.00274423|6.502985292000001|202.9355362| rice|
| 85| 58| 41|21.77046169|80.31964408|      7.038096361|226.6555374| rice|
| 60| 55| 44|23.00445915| 82.3207629|      7.840207144|263.9642476| rice|
| 74| 35| 40|26.49109635|80.15836264|      6.980400905|242.8640342| rice|
| 78| 42| 42|20.13017482|81.60487287|      7.628472891|262.7173405| rice|
+---+---+---+-----------+-----------+-----------------+-----------+-----+
only showing top 5 rows



In [82]:
from pyspark.sql.functions import col, sum, when, isnan
# Count missing values per column. NULL is checked on every column; NaN is
# only checked on float/double columns, because isnan() on the string `label`
# column forces a string-to-double cast (an error under ANSI SQL mode) and
# integer columns cannot hold NaN.
# NOTE: `sum` here is pyspark.sql.functions.sum from the import above, not
# the Python builtin.
float_columns = {name for name, dtype in df.dtypes if dtype in ('float', 'double')}
null_counts = df.select(
    [
        sum(
            when(
                (col(c).isNull() | isnan(col(c))) if c in float_columns else col(c).isNull(),
                1,
            ).otherwise(0)
        ).alias(c)
        for c in df.columns
    ]
)
null_counts.show()

+---+---+---+-----------+--------+---+--------+-----+
|  N|  P|  K|temperature|humidity| ph|rainfall|label|
+---+---+---+-----------+--------+---+--------+-----+
|  0|  0|  0|          0|       0|  0|       0|    0|
+---+---+---+-----------+--------+---+--------+-----+



In [83]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler

In [85]:
# Ordered list of ML pipeline stages; the following cells append to it in place.
stages = []

In [187]:
# 80/20 train/test split of the full dataset; the seed keeps it repeatable.
splits = df.randomSplit(weights=[0.8, 0.2], seed=64)
train_data, test_data = splits

In [88]:
# Encode the crop name (string) as a numeric class index in `labelIndexer`.
labelIndexer = StringIndexer(inputCol='label', outputCol='labelIndexer')
stages.append(labelIndexer)

In [89]:
# For every numeric feature, add two stages: wrap the column in a one-element
# vector (StandardScaler only accepts vector columns), then scale that vector.
num_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
for num in num_features:
    vector_col = num + '_vector'
    num_assembler = VectorAssembler(inputCols=[num], outputCol=vector_col)
    num_scaler = StandardScaler(inputCol=vector_col, outputCol=num + '_scaled')
    stages.extend([num_assembler, num_scaler])
stages

[StringIndexer_ec9f2ef1b4ad,
 VectorAssembler_4b4cb17fe71a,
 StandardScaler_56fb7376332d,
 VectorAssembler_d5ebcb6a1ffe,
 StandardScaler_737fd4a97883,
 VectorAssembler_d65173a6dbd6,
 StandardScaler_5c47917eff3a,
 VectorAssembler_83906c6a6bf2,
 StandardScaler_b811d6da7a52,
 VectorAssembler_3317a0069223,
 StandardScaler_3932966b61fd,
 VectorAssembler_78ac2345bca2,
 StandardScaler_07fcd7806962,
 VectorAssembler_c98b326aa56b,
 StandardScaler_3ee5cadf4cdb]

In [90]:
# Names of the scaled columns produced by the per-feature scaler stages.
assembler_input = [
    feature_name + '_scaled'
    for feature_name in num_features
]
assembler_input

['N_scaled',
 'P_scaled',
 'K_scaled',
 'temperature_scaled',
 'humidity_scaled',
 'ph_scaled',
 'rainfall_scaled']

In [91]:
# Final preprocessing stage: concatenate the scaled single-feature vectors
# into the one `feature_vector` column the classifier consumes.
assembler = VectorAssembler(inputCols=assembler_input, outputCol='feature_vector')
stages.append(assembler)
stages

[StringIndexer_ec9f2ef1b4ad,
 VectorAssembler_4b4cb17fe71a,
 StandardScaler_56fb7376332d,
 VectorAssembler_d5ebcb6a1ffe,
 StandardScaler_737fd4a97883,
 VectorAssembler_d65173a6dbd6,
 StandardScaler_5c47917eff3a,
 VectorAssembler_83906c6a6bf2,
 StandardScaler_b811d6da7a52,
 VectorAssembler_3317a0069223,
 StandardScaler_3932966b61fd,
 VectorAssembler_78ac2345bca2,
 StandardScaler_07fcd7806962,
 VectorAssembler_c98b326aa56b,
 StandardScaler_3ee5cadf4cdb,
 VectorAssembler_ee2ec5dfc6c7]

In [236]:
from pyspark.ml import Pipeline
# Fit all preprocessing stages on the training split, apply the fitted stages
# to that same split, and inspect the columns they added.
pipeline = Pipeline().setStages(stages)
fitted_transform = pipeline.fit(train_data)
vtrain_data = fitted_transform.transform(train_data)
vtrain_data.printSchema()

root
 |-- N: integer (nullable = true)
 |-- P: integer (nullable = true)
 |-- K: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- ph: double (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- label: string (nullable = true)
 |-- labelIndexer: double (nullable = false)
 |-- N_vector: vector (nullable = true)
 |-- N_scaled: vector (nullable = true)
 |-- P_vector: vector (nullable = true)
 |-- P_scaled: vector (nullable = true)
 |-- K_vector: vector (nullable = true)
 |-- K_scaled: vector (nullable = true)
 |-- temperature_vector: vector (nullable = true)
 |-- temperature_scaled: vector (nullable = true)
 |-- humidity_vector: vector (nullable = true)
 |-- humidity_scaled: vector (nullable = true)
 |-- ph_vector: vector (nullable = true)
 |-- ph_scaled: vector (nullable = true)
 |-- rainfall_vector: vector (nullable = true)
 |-- rainfall_scaled: vector (nullable = true)
 |-- feature_vector: vector (nullable = true)

In [135]:
# Preview the assembled feature vector next to the indexed label.
vtrain_data.select('feature_vector', 'labelIndexer').show(3)

+--------------------+------------+
|      feature_vector|labelIndexer|
+--------------------+------------+
|[0.0,0.1494661232...|        17.0|
|[0.0,0.3587186958...|         0.0|
|[0.0,0.5081848190...|         5.0|
+--------------------+------------+
only showing top 3 rows



In [136]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [237]:
# Train a logistic regression classifier on the preprocessed training split,
# predicting the indexed crop label from the assembled feature vector.
lr = LogisticRegression(labelCol='labelIndexer', featuresCol='feature_vector')
lr_model = lr.fit(vtrain_data)

In [239]:
# Apply the preprocessing stages (fitted on the training split) to the test split
vtest_data = fitted_transform.transform(test_data)
# Predict on the transformed test split with the trained model
pred = lr_model.transform(vtest_data)

In [240]:
# Compare the true label index with the predicted one for a few test rows.
pred.select('labelIndexer', 'prediction').show(5)

+------------+----------+
|labelIndexer|prediction|
+------------+----------+
|         7.0|       7.0|
|         5.0|       5.0|
|         5.0|       5.0|
|         3.0|       3.0|
|         8.0|       8.0|
+------------+----------+
only showing top 5 rows



In [241]:
from pyspark.sql.functions import expr
# Flag each test prediction as correct (1) or wrong (0), then count the misses.
is_correct = expr('case when labelIndexer = prediction then 1 else 0 end')
comp = pred.withColumn('correct', is_correct)
comp.where('correct=0').count()

11

In [242]:
# List the misclassified test rows together with their raw feature values.
shown_columns = [
    'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall',
    'label', 'labelIndexer', 'prediction',
]
misclassified = col('labelIndexer') != col('prediction')
pred.select(shown_columns).where(misclassified).show()

+---+---+---+-----------+-----------------+------------------+-----------+-----------+------------+----------+
|  N|  P|  K|temperature|         humidity|                ph|   rainfall|      label|labelIndexer|prediction|
+---+---+---+-----------+-----------------+------------------+-----------+-----------+------------+----------+
| 11| 78| 22|23.89756791|      22.74378977|       5.940546818|112.6616435|kidneybeans|         3.0|      17.0|
| 18| 74| 15| 24.9035819|      22.27512704|        5.70836603|146.4727237|kidneybeans|         3.0|      17.0|
| 18| 79| 20|20.27514686|       23.2353604|       5.877347515|139.7521543|kidneybeans|         3.0|      17.0|
| 22| 55| 20|33.95309131|      69.96100028|       7.423530351|61.16350463|  blackgram|         9.0|      15.0|
| 63| 37| 43|23.41798979|      85.08640476| 6.661957897000001|185.7446728|       jute|         2.0|      20.0|
| 63| 41| 45|25.29781791|86.88705350000002|7.1219335789999985|196.6249511|       jute|         2.0|      20.0|
|

In [243]:
# Overall test accuracy: the mean of the 0/1 `correct` flag.
accuracy_row = comp.selectExpr('avg(correct) as accuracy').first()
accuracy_row['accuracy']

0.9753363228699552

In [224]:
# Look up which crop name was assigned label index 0.
index_zero_labels = vtrain_data.select('label').where(col('labelIndexer') == 0)
index_zero_labels.take(1)[0][0]

'lentil'

In [244]:
from pyspark.sql.functions import max
# Per-crop test accuracy, printed in label-index order.
# One grouped aggregation replaces the previous per-class loop, which ran two
# separate Spark jobs for every class index. `comp` still carries the original
# `label` column, so the crop name comes straight from the scored rows.
# Crops with no rows in the test split are not listed (the per-class loop
# printed them with accuracy None).
per_crop_rows = (
    comp.groupBy('labelIndexer', 'label')
    .agg(expr('avg(correct) as accuracy'))
    .orderBy('labelIndexer')
    .collect()
)
for crop_row in per_crop_rows:
    crop_name = crop_row['label']
    acc = crop_row['accuracy']
    print(f'{crop_name} : {acc}')

lentil : 1.0
grapes : 1.0
jute : 0.75
kidneybeans : 0.8235294117647058
maize : 1.0
coconut : 1.0
cotton : 1.0
mango : 1.0
apple : 1.0
blackgram : 0.9523809523809523
pomegranate : 1.0
muskmelon : 1.0
watermelon : 1.0
banana : 1.0
papaya : 1.0
mungbean : 1.0
orange : 1.0
pigeonpeas : 1.0
chickpea : 1.0
coffee : 1.0
rice : 0.8
mothbeans : 1.0


In [152]:
# 1. Approximate first and third quartiles of rainfall
quantiles = df.approxQuantile("rainfall", [0.25, 0.75], 0.01)
Q1, Q3 = quantiles

# 2. Interquartile range
IQR = Q3 - Q1

# 3. Tukey fences: anything beyond 1.5 * IQR from the quartiles is an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

rainfall = col("rainfall")
drop_count = df.where((rainfall < lower_bound) | (rainfall > upper_bound)).count()
df_no_outliers = df.where(rainfall.between(lower_bound, upper_bound))
drop_count

112

In [158]:
from pyspark.sql.functions import col, avg, stddev
# 1. Mean and standard deviation of rainfall
mean_val, stddev_val = df.select(avg("rainfall"), stddev("rainfall")).first()

# 2. Z-score threshold and the value range it allows
threshold = 3.0
z_lower = mean_val - threshold * stddev_val
z_upper = mean_val + threshold * stddev_val

# 3. Count the rows outside the range, and keep the rows inside it
rainfall = col("rainfall")
drop_count = df.where((rainfall < z_lower) | (rainfall > z_upper)).count()
df_no_outliers_zscore = df.where(rainfall.between(z_lower, z_upper))
drop_count

22

In [226]:
# 1. Mean and standard deviation of ph
mean_val, stddev_val = df.select(avg("ph"), stddev("ph")).first()

# 2. Z-score threshold and the value range it allows
threshold = 3.0
z_lower = mean_val - threshold * stddev_val
z_upper = mean_val + threshold * stddev_val

# 3. Count the rows outside the range, and keep the rows inside it
ph = col("ph")
drop_count = df.where((ph < z_lower) | (ph > z_upper)).count()
df_no_outliers_zscore = df.where(ph.between(z_lower, z_upper))
drop_count

30

In [235]:
# 80/20 train/test split of the outlier-filtered data, same seed as before.
splits = df_no_outliers_zscore.randomSplit(weights=[0.8, 0.2], seed=64)
train_data, test_data = splits

In [234]:
# Remove z-score outliers for ph and then rainfall, cumulatively: each
# feature's mean/stddev is computed on the rows that survived the previous
# feature's filter. The last line reports how many rows were dropped in total.
num_features = ['ph', 'rainfall']

# Z-score threshold
threshold = 3.0

df_no_outliers_zscore = df
for num in num_features:
    mean_val, stddev_val = df_no_outliers_zscore.select(avg(num), stddev(num)).first()
    z_lower = mean_val - threshold * stddev_val
    z_upper = mean_val + threshold * stddev_val
    # Keep only the rows inside the allowed range
    df_no_outliers_zscore = df_no_outliers_zscore.where(col(num).between(z_lower, z_upper))
df.count() - df_no_outliers_zscore.count()

52

In [186]:
# Ablation over features: for each feature on its own, drop the rows whose
# value lies more than `threshold` standard deviations from that feature's
# mean (statistics taken on the full `df`), retrain the preprocessing pipeline
# and logistic regression on an 80/20 split of what remains, and print the
# resulting test accuracy. The filters are NOT cumulative: every iteration
# starts again from `df`.
num_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

# Z-score threshold
threshold = 3.0

# The estimators do not depend on the feature being filtered, so build them
# once; fit() returns a new fitted model each time and leaves them unchanged.
pipeline = Pipeline(stages=stages)
lr = LogisticRegression(featuresCol='feature_vector', labelCol='labelIndexer')

for num in num_features:
    # Mean and standard deviation in a single Spark job
    mean_val, stddev_val = df.select(avg(num), stddev(num)).first()
    z_lower = mean_val - threshold * stddev_val
    z_upper = mean_val + threshold * stddev_val

    # Remove this feature's outliers only
    df_no_outliers_zscore = df.filter((col(num) >= z_lower) & (col(num) <= z_upper))
    train_data, test_data = df_no_outliers_zscore.randomSplit([0.8, 0.2], seed=64)

    fitted_transform = pipeline.fit(train_data)
    vtrain_data = fitted_transform.transform(train_data)
    lr_model = lr.fit(vtrain_data)
    # Transform the test split with the stages fitted on the training split
    vtest_data = fitted_transform.transform(test_data)
    # Predict on the test split
    pred = lr_model.transform(vtest_data)
    comp = pred.withColumn('correct', expr('case when labelIndexer = prediction then 1 else 0 end'))
    acc = comp.selectExpr('avg(correct) as accuracy').collect()[0]['accuracy']
    print(f'{num} : {acc}')

N : 0.9803063457330415
P : 0.9803063457330415
K : 0.9839080459770115
temperature : 0.9844789356984479
humidity : 0.9803063457330415
ph : 0.9889380530973452
rainfall : 0.9845474613686535


In [176]:
# Remove IQR outliers feature by feature, cumulatively: each feature's
# quartiles are computed on the rows that survived the previous filters.
# The last line reports how many rows were dropped in total.
num_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
df_no_outliers = df
for num in num_features:
    # Approximate first and third quartiles of the current feature
    quantiles = df_no_outliers.approxQuantile(num, [0.25, 0.75], 0.01)
    Q1, Q3 = quantiles

    # Interquartile range and the Tukey fences derived from it
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df_no_outliers = df_no_outliers.where(col(num).between(lower_bound, upper_bound))

df.count() - df_no_outliers.count()

398

In [247]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 1. Remove outliers in the 'ph' column (z-score method)

# Mean and standard deviation of 'ph' over the full dataset
mean_ph, stddev_ph = df.agg(avg('ph'), stddev('ph')).first()
print(f"ph 평균: {mean_ph}, ph 표준편차: {stddev_ph}")

# Rows more than 3 standard deviations from the mean count as outliers
z_limit = 3 * stddev_ph
upper_bound_ph = mean_ph + z_limit
lower_bound_ph = mean_ph - z_limit
print(f"ph 상한선: {upper_bound_ph}, ph 하한선: {lower_bound_ph}")

# Keep only the rows whose 'ph' lies inside the bounds
df_filtered = df.where(col('ph').between(lower_bound_ph, upper_bound_ph))
print(f"원래 데이터 수: {df.count()}, 'ph' 이상치 제거 후 데이터 수: {df_filtered.count()}")

# -----------------------------------------------------------

# 2. Train and evaluate a Random Forest model

# Assemble the raw (unscaled) feature columns into a single vector column
feature_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_assembled = assembler.transform(df_filtered)

# Index the label (convert the string crop name to a numeric class index)
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
df_indexed = indexer.fit(df_assembled).transform(df_assembled)

# Split the data (train: 80%, test: 20%)
train_data, test_data = df_indexed.randomSplit([0.8, 0.2], seed=42)

# Build and train the Random Forest model.
# The seed is fixed (same value as the split) so that re-running the cell
# rebuilds the same forest and reports the same accuracy.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="indexedLabel",
    numTrees=100,
    seed=42,
)
rf_model = rf.fit(train_data)

# Apply the model to the test data
predictions = rf_model.transform(test_data)

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"모델 정확도 (Random Forest, ph Z-score 이상치 제거): {accuracy}")

# -----------------------------------------------------------

# 3. Per-crop accuracy analysis

# `predictions` still carries the original string `label` column, so accuracy
# per crop is one grouped aggregation: the mean of "prediction equals the
# indexed label" (cast to 0.0/1.0). This avoids refitting the StringIndexer
# and running several Spark jobs per crop, and it cannot fail on a crop name
# that exists in `df` but was entirely removed by the outlier filter.
per_crop_rows = (
    predictions
    .groupBy("label")
    .agg(
        avg((col("indexedLabel") == col("prediction")).cast("double")).alias("accuracy")
    )
    .collect()
)

# Print in alphabetical crop order; only crops present in the test split appear
print("\n=== 작물별 정확도 ===")
for crop_row in sorted(per_crop_rows, key=lambda row: row["label"]):
    crop_label = crop_row["label"]
    accuracy_crop = crop_row["accuracy"]
    print(f"작물 '{crop_label}': {accuracy_crop}")

ph 평균: 6.469480065256369, ph 표준편차: 0.7739376880298732
ph 상한선: 8.791293129345988, ph 하한선: 4.14766700116675
원래 데이터 수: 2200, 'ph' 이상치 제거 후 데이터 수: 2170
모델 정확도 (Random Forest, ph Z-score 이상치 제거): 0.9712793733681462

=== 작물별 정확도 ===
작물 'apple': 1.0
작물 'banana': 1.0
작물 'blackgram': 1.0
작물 'chickpea': 1.0
작물 'coconut': 1.0
작물 'coffee': 1.0
작물 'cotton': 1.0
작물 'grapes': 1.0
작물 'jute': 1.0
작물 'kidneybeans': 1.0
작물 'lentil': 1.0
작물 'maize': 1.0
작물 'mango': 1.0
작물 'mothbeans': 0.6363636363636364
작물 'mungbean': 1.0
작물 'muskmelon': 1.0
작물 'orange': 1.0
작물 'papaya': 1.0
작물 'pigeonpeas': 0.75
작물 'pomegranate': 1.0
작물 'rice': 0.8571428571428571
작물 'watermelon': 1.0


In [248]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()