In [1]:
from pyspark.sql import SparkSession

# Obtain a Spark session registered under the application name "miniproject1"
# (getOrCreate returns the session built from this builder configuration).
spark = (
    SparkSession.builder
    .appName("miniproject1")
    .getOrCreate()
)

In [62]:
import os

# Locate the crop-recommendation CSV under the current working directory.
cwd = os.getcwd()
trip_data_path = os.path.join(cwd, 'learning_spark_data/Crop_recommendation.csv')

# Build a file:// URI for Spark. Separators are normalised to '/', and leading
# slashes are stripped before the 'file:///' prefix is added: an absolute POSIX
# path already starts with '/', so plain concatenation produced
# 'file:////home/...' (four slashes).
file_path = "file:///" + trip_data_path.replace(os.sep, '/').lstrip('/')
file_path

'file:////home/jovyan/work/start_spark/learning_spark_data/Crop_recommendation.csv'

In [80]:
# Read the CSV: the first line is treated as the header and column types are
# inferred from the data. Then print the resulting schema.
df = spark.read.csv(path=file_path, header=True, inferSchema=True)
df.printSchema()

root
 |-- N: integer (nullable = true)
 |-- P: integer (nullable = true)
 |-- K: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- ph: double (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- label: string (nullable = true)



In [81]:
# Preview the first five rows.
df.show(n=5)

+---+---+---+-----------+-----------+-----------------+-----------+-----+
|  N|  P|  K|temperature|   humidity|               ph|   rainfall|label|
+---+---+---+-----------+-----------+-----------------+-----------+-----+
| 90| 42| 43|20.87974371|82.00274423|6.502985292000001|202.9355362| rice|
| 85| 58| 41|21.77046169|80.31964408|      7.038096361|226.6555374| rice|
| 60| 55| 44|23.00445915| 82.3207629|      7.840207144|263.9642476| rice|
| 74| 35| 40|26.49109635|80.15836264|      6.980400905|242.8640342| rice|
| 78| 42| 42|20.13017482|81.60487287|      7.628472891|262.7173405| rice|
+---+---+---+-----------+-----------+-----------------+-----------+-----+
only showing top 5 rows



In [82]:
from pyspark.sql.functions import col, sum, when, isnan

# For every column, count the rows whose value is NULL or NaN.
# Each aggregate keeps the source column's name, so the result is one row
# with the same column layout as df.
null_counts = df.select(*(
    sum(when(isnan(name) | col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in df.columns
))
null_counts.show()

+---+---+---+-----------+--------+---+--------+-----+
|  N|  P|  K|temperature|humidity| ph|rainfall|label|
+---+---+---+-----------+--------+---+--------+-----+
|  0|  0|  0|          0|       0|  0|       0|    0|
+---+---+---+-----------+--------+---+--------+-----+



In [83]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler

In [85]:
# Empty container for the pipeline stages; the following cells add to it.
stages = list()

In [187]:
# Random split of the full dataset with weights 0.8 / 0.2, seeded with 64.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=64)

In [88]:
# Index the string column 'label' into the numeric column 'labelIndexer'
# and register the indexer as a pipeline stage.
labelIndexer = StringIndexer(inputCol='label', outputCol='labelIndexer')
stages.append(labelIndexer)

In [89]:
num_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
for num in num_features:
    # Each numeric column gets its own assembler ('<name>_vector') followed by
    # a scaler that reads that column and writes '<name>_scaled'.
    num_assembler = VectorAssembler(inputCols=[num], outputCol=f'{num}_vector')
    num_scaler = StandardScaler(
        inputCol=num_assembler.getOutputCol(),
        outputCol=f'{num}_scaled',
    )
    stages.extend([num_assembler, num_scaler])
stages

[StringIndexer_ec9f2ef1b4ad,
 VectorAssembler_4b4cb17fe71a,
 StandardScaler_56fb7376332d,
 VectorAssembler_d5ebcb6a1ffe,
 StandardScaler_737fd4a97883,
 VectorAssembler_d65173a6dbd6,
 StandardScaler_5c47917eff3a,
 VectorAssembler_83906c6a6bf2,
 StandardScaler_b811d6da7a52,
 VectorAssembler_3317a0069223,
 StandardScaler_3932966b61fd,
 VectorAssembler_78ac2345bca2,
 StandardScaler_07fcd7806962,
 VectorAssembler_c98b326aa56b,
 StandardScaler_3ee5cadf4cdb]

In [90]:
# Names of the scaled columns, one per numeric feature, in feature order.
assembler_input = [f'{name}_scaled' for name in num_features]
assembler_input

['N_scaled',
 'P_scaled',
 'K_scaled',
 'temperature_scaled',
 'humidity_scaled',
 'ph_scaled',
 'rainfall_scaled']

In [91]:
# Combine all scaled columns into the single 'feature_vector' column and
# register the assembler as a pipeline stage.
assembler = VectorAssembler(inputCols=assembler_input, outputCol='feature_vector')
stages.append(assembler)
stages

[StringIndexer_ec9f2ef1b4ad,
 VectorAssembler_4b4cb17fe71a,
 StandardScaler_56fb7376332d,
 VectorAssembler_d5ebcb6a1ffe,
 StandardScaler_737fd4a97883,
 VectorAssembler_d65173a6dbd6,
 StandardScaler_5c47917eff3a,
 VectorAssembler_83906c6a6bf2,
 StandardScaler_b811d6da7a52,
 VectorAssembler_3317a0069223,
 StandardScaler_3932966b61fd,
 VectorAssembler_78ac2345bca2,
 StandardScaler_07fcd7806962,
 VectorAssembler_c98b326aa56b,
 StandardScaler_3ee5cadf4cdb,
 VectorAssembler_ee2ec5dfc6c7]

In [229]:
from pyspark.ml import Pipeline

# Fit the collected stages on the training split, apply the fitted pipeline
# to that same split, and print the schema of the transformed frame.
pipeline = Pipeline(stages=stages)
fitted_transform = pipeline.fit(dataset=train_data)
vtrain_data = fitted_transform.transform(dataset=train_data)
vtrain_data.printSchema()

root
 |-- N: integer (nullable = true)
 |-- P: integer (nullable = true)
 |-- K: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- ph: double (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- label: string (nullable = true)
 |-- labelIndexer: double (nullable = false)
 |-- N_vector: vector (nullable = true)
 |-- N_scaled: vector (nullable = true)
 |-- P_vector: vector (nullable = true)
 |-- P_scaled: vector (nullable = true)
 |-- K_vector: vector (nullable = true)
 |-- K_scaled: vector (nullable = true)
 |-- temperature_vector: vector (nullable = true)
 |-- temperature_scaled: vector (nullable = true)
 |-- humidity_vector: vector (nullable = true)
 |-- humidity_scaled: vector (nullable = true)
 |-- ph_vector: vector (nullable = true)
 |-- ph_scaled: vector (nullable = true)
 |-- rainfall_vector: vector (nullable = true)
 |-- rainfall_scaled: vector (nullable = true)
 |-- feature_vector: vector (nullable = true)

In [135]:
# Peek at the assembled features next to the indexed label (first 3 rows).
vtrain_data.select(['feature_vector', 'labelIndexer']).show(n=3)

+--------------------+------------+
|      feature_vector|labelIndexer|
+--------------------+------------+
|[0.0,0.1494661232...|        17.0|
|[0.0,0.3587186958...|         0.0|
|[0.0,0.5081848190...|         5.0|
+--------------------+------------+
only showing top 3 rows



In [136]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [230]:
# Logistic regression reading 'feature_vector' as features and 'labelIndexer'
# as the label, fitted on the transformed training split.
lr = LogisticRegression(labelCol='labelIndexer', featuresCol='feature_vector')
lr_model = lr.fit(dataset=vtrain_data)

In [231]:
# Transform the test split with the pipeline that was fitted on the training split
vtest_data = fitted_transform.transform(dataset=test_data)
# Predict on the transformed test split
pred = lr_model.transform(dataset=vtest_data)

In [191]:
# Compare the true label index with the predicted one (first 5 rows).
pred.select(['labelIndexer', 'prediction']).show(n=5)

+------------+----------+
|labelIndexer|prediction|
+------------+----------+
|         4.0|       4.0|
|         7.0|       7.0|
|         7.0|       7.0|
|        15.0|      15.0|
|        20.0|      20.0|
+------------+----------+
only showing top 5 rows



In [232]:
from pyspark.sql.functions import expr

# 'correct' is 1 where the prediction equals the label index, otherwise 0.
correct_flag = expr('case when labelIndexer = prediction then 1 else 0 end')
comp = pred.withColumn('correct', correct_flag)

# Number of misclassified test rows.
comp.filter('correct=0').count()

5

In [204]:
# Show the misclassified test rows: raw features, the crop name, and both indices.
(
    pred
    .select(
        'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall',
        'label', 'labelIndexer', 'prediction',
    )
    .where(col('labelIndexer') != col('prediction'))
    .show()
)

+---+---+---+-----------+-----------+-----------------+------------------+---------+------------+----------+
|  N|  P|  K|temperature|   humidity|               ph|          rainfall|    label|labelIndexer|prediction|
+---+---+---+-----------+-----------+-----------------+------------------+---------+------------+----------+
| 20| 68| 23|25.54960633|63.95425534|      7.707332484|        63.1830529|blackgram|        16.0|       0.0|
| 31| 70| 77|20.88818675|14.32313811|      6.492546046|       90.46228334| chickpea|        13.0|      15.0|
| 32| 73| 81|20.45078582|15.40312102|5.988992796000002|       92.68373702| chickpea|        13.0|      15.0|
| 36| 58| 25|28.66024187| 59.3189118|8.399135957999999|       36.92629678|mothbeans|        21.0|       0.0|
| 68| 52| 49|24.42561272|92.27749066|      6.577192175|       63.35298768|   papaya|        10.0|      19.0|
| 78| 43| 42|21.32376327|83.00320459|7.283736617000001|       192.3197536|     rice|        11.0|       8.0|
| 82| 48| 36|25.793

In [200]:
# Overall accuracy = mean of the 0/1 'correct' flag over all test rows.
accuracy_rows = comp.selectExpr('avg(correct) as accuracy').collect()
accuracy_rows[0]['accuracy']

0.9803063457330415

In [224]:
# Crop name carried by the training rows whose label index is 0.
vtrain_data.where(col('labelIndexer') == 0).select('label').collect()[0][0]

'lentil'

In [233]:
# NOTE(review): this import shadows Python's built-in max for the rest of the
# notebook. It is no longer used in this cell and is kept only so the kernel
# namespace stays as it was for any later cell that expects Spark's max.
from pyspark.sql.functions import max

# Per-crop accuracy on the test predictions.
# A single grouped aggregation replaces the earlier loop, which ran two
# separate Spark jobs (one on comp, one on vtrain_data) for every class index.
# The crop name is read from comp's own 'label' column, so the lookup into
# vtrain_data is no longer needed. Rows are printed in label-index order;
# a class with no rows in the test split is simply not listed (the loop
# version printed it with accuracy None).
per_class_rows = (
    comp.groupBy('labelIndexer', 'label')
    .avg('correct')
    .withColumnRenamed('avg(correct)', 'accuracy')
    .orderBy('labelIndexer')
    .collect()
)
for row in per_class_rows:
    crop_name = row['label']
    acc = row['accuracy']
    print(f'{crop_name} : {acc}')

lentil : 1.0
maize : 1.0
grapes : 1.0
kidneybeans : 1.0
rice : 0.9444444444444444
coconut : 1.0
mango : 1.0
apple : 1.0
banana : 1.0
blackgram : 0.9523809523809523
pomegranate : 1.0
watermelon : 1.0
cotton : 1.0
papaya : 1.0
coffee : 1.0
jute : 0.875
mungbean : 1.0
muskmelon : 1.0
orange : 1.0
pigeonpeas : 1.0
chickpea : 1.0
mothbeans : 1.0


In [152]:
# 1. Approximate 25th and 75th percentiles of rainfall (relative error 0.01)
quantiles = df.approxQuantile("rainfall", [0.25, 0.75], 0.01)
Q1, Q3 = quantiles[0], quantiles[1]

# 2. Interquartile range
IQR = Q3 - Q1

# 3. Bounds placed 1.5 * IQR beyond each quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

rainfall = col("rainfall")
# Rows strictly outside the bounds are the ones that get dropped ...
drop_count = df.where((rainfall < lower_bound) | (rainfall > upper_bound)).count()
# ... and rows on or inside the bounds are kept.
df_no_outliers = df.where(rainfall.between(lower_bound, upper_bound))
drop_count

112

In [158]:
from pyspark.sql.functions import col, avg, stddev

# 1. Mean and standard deviation of rainfall
mean_val = df.select(avg("rainfall")).collect()[0][0]
stddev_val = df.select(stddev("rainfall")).collect()[0][0]

# 2. Z-score threshold
threshold = 3.0

# Band of +/- threshold standard deviations around the mean
z_low = mean_val - threshold * stddev_val
z_high = mean_val + threshold * stddev_val
rainfall_col = col("rainfall")

# Count of rows strictly outside the band
drop_count = df.where((rainfall_col < z_low) | (rainfall_col > z_high)).count()

# 3. Remove outliers: keep rows on or inside the band
df_no_outliers_zscore = df.where(rainfall_col.between(z_low, z_high))
drop_count

22

In [226]:
# 1. Mean and standard deviation of ph
mean_val = df.select(avg("ph")).collect()[0][0]
stddev_val = df.select(stddev("ph")).collect()[0][0]

# 2. Z-score threshold
threshold = 3.0

# Band of +/- threshold standard deviations around the mean
ph_low = mean_val - threshold * stddev_val
ph_high = mean_val + threshold * stddev_val
ph_col = col("ph")

# Count of rows strictly outside the band
drop_count = df.where((ph_col < ph_low) | (ph_col > ph_high)).count()

# 3. Remove outliers: keep rows on or inside the band
#    (this rebinds df_no_outliers_zscore to the ph-filtered frame)
df_no_outliers_zscore = df.where(ph_col.between(ph_low, ph_high))
drop_count

30

In [228]:
# Re-split using the outlier-filtered frame (weights 0.8 / 0.2, seed 64);
# this rebinds train_data and test_data.
train_data, test_data = df_no_outliers_zscore.randomSplit(weights=[0.8, 0.2], seed=64)

In [175]:
num_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

# Z-score threshold applied to every feature
threshold = 3.0

# Filters accumulate: each feature's mean/stddev is computed on the frame
# that earlier features have already filtered.
df_no_outliers_zscore = df
for num in num_features:
    mean_val = df_no_outliers_zscore.select(avg(num)).collect()[0][0]
    stddev_val = df_no_outliers_zscore.select(stddev(num)).collect()[0][0]

    # Remove outliers: keep rows within +/- threshold standard deviations of the mean
    df_no_outliers_zscore = df_no_outliers_zscore.where(
        col(num).between(
            mean_val - threshold * stddev_val,
            mean_val + threshold * stddev_val,
        )
    )

# Total number of rows removed across all features
df.count() - df_no_outliers_zscore.count()

170

In [186]:
# For each numeric feature in turn: drop that feature's z-score outliers from
# the full dataset, retrain the pipeline + logistic regression on a fresh
# 80/20 split, and print the resulting test accuracy.
#
# NOTE(review): the loop rebinds the notebook-level names train_data,
# test_data, fitted_transform, vtrain_data, lr_model, vtest_data, pred and
# comp on every pass, so after this cell they hold the results of the last
# feature in num_features.
num_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

# Z-score threshold (same for every feature)
threshold = 3.0

# The estimators do not depend on the loop variable, so build them once.
pipeline = Pipeline(stages=stages)
lr = LogisticRegression(featuresCol='feature_vector', labelCol='labelIndexer')

for num in num_features:
    # Mean and standard deviation of this feature over the full dataset,
    # fetched in one aggregation instead of two separate jobs.
    mean_val, stddev_val = df.select(avg(num), stddev(num)).collect()[0]

    # Remove outliers for this feature only (always filtering the original df)
    df_no_outliers_zscore = df.filter(
        (col(num) >= (mean_val - threshold * stddev_val)) &
        (col(num) <= (mean_val + threshold * stddev_val))
    )
    train_data, test_data = df_no_outliers_zscore.randomSplit([0.8, 0.2], seed=64)

    fitted_transform = pipeline.fit(train_data)
    vtrain_data = fitted_transform.transform(train_data)
    lr_model = lr.fit(vtrain_data)

    # Transform the test split with the fitted pipeline, then predict on it
    vtest_data = fitted_transform.transform(test_data)
    pred = lr_model.transform(vtest_data)

    # Accuracy = mean of the 0/1 'correct' flag. The former
    # comp.where('correct=0').count() call was removed: its result was
    # discarded, so it only triggered an extra Spark job per iteration.
    comp = pred.withColumn('correct', expr('case when labelIndexer = prediction then 1 else 0 end'))
    acc = comp.selectExpr('avg(correct) as accuracy').collect()[0]['accuracy']
    print(f'{num} : {acc}')

N : 0.9803063457330415
P : 0.9803063457330415
K : 0.9839080459770115
temperature : 0.9844789356984479
humidity : 0.9803063457330415
ph : 0.9889380530973452
rainfall : 0.9845474613686535


In [176]:
num_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

# Filters accumulate: each feature's quartiles are computed on the frame that
# earlier features have already filtered.
df_no_outliers = df
for num in num_features:
    # 1. Approximate 25th and 75th percentiles (relative error 0.01)
    quantiles = df_no_outliers.approxQuantile(num, [0.25, 0.75], 0.01)
    Q1, Q3 = quantiles[0], quantiles[1]

    # 2. Interquartile range
    IQR = Q3 - Q1

    # 3. Bounds placed 1.5 * IQR beyond each quartile; keep rows on or inside them
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_no_outliers = df_no_outliers.where(col(num).between(lower_bound, upper_bound))

# Total number of rows removed across all features
df.count() - df_no_outliers.count()

398