In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("miniproject1")
    .getOrCreate()
)

In [2]:
import os

# Build a file:// URI for the CSV stored under the notebook's working directory.
cwd = os.getcwd()
# Despite its name, `trip_data_path` holds the crop recommendation CSV path.
trip_data_path = os.path.join(cwd, 'learning_spark_data/Crop_recommendation.csv')
# Normalise separators to "/", then drop the leading "/" of an absolute POSIX
# path before appending it to "file:///"; otherwise the URI ends up with four
# slashes ("file:////home/...").
posix_style_path = trip_data_path.replace(os.sep, '/')
file_path = f"file:///{posix_style_path.lstrip('/')}"
file_path

'file:///home/jovyan/work/start_spark/learning_spark_data/Crop_recommendation.csv'

In [3]:
# Load the CSV with a header row and let Spark infer the column types,
# then print the resulting schema.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(file_path)
df.printSchema()

root
 |-- N: integer (nullable = true)
 |-- P: integer (nullable = true)
 |-- K: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- ph: double (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- label: string (nullable = true)



In [4]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+---+---+---+-----------+-----------+-----------------+-----------+-----+
|  N|  P|  K|temperature|   humidity|               ph|   rainfall|label|
+---+---+---+-----------+-----------+-----------------+-----------+-----+
| 90| 42| 43|20.87974371|82.00274423|6.502985292000001|202.9355362| rice|
| 85| 58| 41|21.77046169|80.31964408|      7.038096361|226.6555374| rice|
| 60| 55| 44|23.00445915| 82.3207629|      7.840207144|263.9642476| rice|
| 74| 35| 40|26.49109635|80.15836264|      6.980400905|242.8640342| rice|
| 78| 42| 42|20.13017482|81.60487287|      7.628472891|262.7173405| rice|
+---+---+---+-----------+-----------+-----------------+-----------+-----+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import col, sum, when, isnan
# Count missing values per column.
# NaN only exists for floating-point types, so isnan() is applied to the
# float/double columns alone. Calling it on the string `label` column relies on
# an implicit string-to-double cast (NOTE(review): this may raise when ANSI SQL
# mode is enabled -- confirm for the Spark version in use).
floating_columns = {name for name, dtype in df.dtypes if dtype in ('float', 'double')}
null_counts = df.select(
    [
        sum(
            when(
                (col(c).isNull() | isnan(c)) if c in floating_columns else col(c).isNull(),
                1,
            ).otherwise(0)
        ).alias(c)
        for c in df.columns
    ]
)
null_counts.show()

+---+---+---+-----------+--------+---+--------+-----+
|  N|  P|  K|temperature|humidity| ph|rainfall|label|
+---+---+---+-----------+--------+---+--------+-----+
|  0|  0|  0|          0|       0|  0|       0|    0|
+---+---+---+-----------+--------+---+--------+-----+



In [20]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler

In [8]:
# 80/20 train/test split with a fixed seed.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=64)

In [12]:
# Ordered list of pipeline stages; the following cells append to it.
stages = list()

In [13]:
# Map the string crop label to a numeric index column used as the training target.
labelIndexer = StringIndexer(
    inputCol='label',
    outputCol='labelIndexer',
)
stages.append(labelIndexer)

In [14]:
# Each numeric feature is wrapped in its own one-element vector and then scaled
# separately, because StandardScaler takes a vector column as input.
num_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
for num in num_features:
    vector_col = f'{num}_vector'
    scaled_col = f'{num}_scaled'
    num_assembler = VectorAssembler(inputCols=[num], outputCol=vector_col)
    num_scaler = StandardScaler(inputCol=vector_col, outputCol=scaled_col)
    stages.extend([num_assembler, num_scaler])

In [16]:
# Names of the scaled columns produced by the per-feature scalers.
assembler_input = [f'{feature}_scaled' for feature in num_features]

In [17]:
# Combine all scaled feature columns into the single vector column the model reads.
assembler = VectorAssembler(inputCols=assembler_input, outputCol='feature_vector')
stages.append(assembler)

In [18]:
from pyspark.ml import Pipeline
# Fit the preprocessing pipeline on the training split only, then apply it to
# that same split to obtain the model's training input.
pipeline = Pipeline().setStages(stages)
fitted_transform = pipeline.fit(dataset=train_data)
vtrain_data = fitted_transform.transform(dataset=train_data)

In [21]:
# Logistic regression on the assembled feature vector against the indexed label.
lr = LogisticRegression(
    featuresCol='feature_vector',
    labelCol='labelIndexer',
)
lr_model = lr.fit(dataset=vtrain_data)

In [22]:
# Apply the preprocessing fitted on the training split to the test split, then predict.
vtest_data = fitted_transform.transform(dataset=test_data)
pred = lr_model.transform(dataset=vtest_data)

In [23]:
# Show the true label index next to the predicted one for a few rows.
pred.select(['labelIndexer', 'prediction']).show(n=5)

+------------+----------+
|labelIndexer|prediction|
+------------+----------+
|         4.0|       4.0|
|         7.0|       7.0|
|         7.0|       7.0|
|        15.0|      15.0|
|        20.0|      20.0|
+------------+----------+
only showing top 5 rows



In [24]:
from pyspark.sql.functions import expr
# Flag each test row as 1 when the prediction matches the label, else 0,
# then count the misclassified rows.
is_correct = when(col('labelIndexer') == col('prediction'), 1).otherwise(0)
comp = pred.withColumn('correct', is_correct)
comp.filter(col('correct') == 0).count()

9

In [25]:
# Overall accuracy: the mean of the 0/1 `correct` flag.
comp.agg(expr('avg(correct) as accuracy')).first()['accuracy']

0.9803063457330415

In [26]:
from pyspark.sql.functions import max
# Per-crop accuracy of the model on the test split.
# The index -> crop-name mapping and the per-class accuracy are each collected
# once, instead of launching two Spark jobs per class inside the loop.
label_names = {
    row['labelIndexer']: row['label']
    for row in vtrain_data.select('labelIndexer', 'label').distinct().collect()
}
class_accuracy = {
    row['labelIndexer']: row['accuracy']
    for row in comp.groupBy('labelIndexer').agg(expr('avg(correct) as accuracy')).collect()
}
# Largest label index seen in the training data; kept as a variable because a
# later cell reads `max_index`. sorted() is used rather than max() because
# `max` is rebound to the Spark SQL function by the import in this cell.
max_index = sorted(label_names)[-1]
for i in range(int(max_index) + 1):
    # .get() yields None for a class with no rows in the test split, matching
    # what avg() over an empty selection produced before.
    print(f'{label_names[float(i)]} : {class_accuracy.get(float(i))}')

lentil : 1.0
maize : 1.0
orange : 1.0
grapes : 1.0
mango : 1.0
muskmelon : 1.0
pomegranate : 1.0
coconut : 1.0
jute : 0.9
cotton : 1.0
papaya : 0.9523809523809523
rice : 0.9047619047619048
banana : 1.0
chickpea : 0.9090909090909091
pigeonpeas : 1.0
kidneybeans : 1.0
blackgram : 0.9583333333333334
coffee : 1.0
mungbean : 1.0
watermelon : 1.0
apple : 1.0
mothbeans : 0.9642857142857143


In [28]:
from pyspark.sql.functions import col, avg, stddev
# 1. Mean and standard deviation of `ph`, computed in a single aggregation pass.
ph_stats = df.select(avg("ph").alias("mean"), stddev("ph").alias("stddev")).first()
mean_val = ph_stats["mean"]
stddev_val = ph_stats["stddev"]

# 2. Set the z-score threshold: rows whose `ph` lies more than this many
#    standard deviations from the mean are treated as outliers.
threshold = 3.0
lower_bound = mean_val - threshold * stddev_val
upper_bound = mean_val + threshold * stddev_val

# Number of rows that fall outside the bounds (reported as the cell output).
drop_count = df.filter(
    (col("ph") < lower_bound) | (col("ph") > upper_bound)
).count()

# 3. Remove the outliers (both bounds are inclusive).
df_no_outliers_zscore = df.filter(col("ph").between(lower_bound, upper_bound))
drop_count

30

In [29]:
# Same 80/20 split and seed as before, applied to the outlier-filtered data.
ztrain_data, ztest_data = df_no_outliers_zscore.randomSplit(weights=[0.8, 0.2], seed=64)

In [30]:
# Refit the same preprocessing stages on the outlier-filtered training split.
# Note that `pipeline` and `fitted_transform` are rebound here.
pipeline = Pipeline().setStages(stages)
fitted_transform = pipeline.fit(dataset=ztrain_data)
vztrain_data = fitted_transform.transform(dataset=ztrain_data)

In [31]:
# Train a fresh logistic regression on the outlier-filtered training data.
# Note that `lr` and `lr_model` are rebound here.
lr = LogisticRegression(
    featuresCol='feature_vector',
    labelCol='labelIndexer',
)
lr_model = lr.fit(dataset=vztrain_data)

In [32]:
# Preprocess the outlier-filtered test split with the refitted pipeline, then predict.
vztest_data = fitted_transform.transform(dataset=ztest_data)
zpred = lr_model.transform(dataset=vztest_data)

In [33]:
# Show the true label index next to the predicted one for a few rows.
zpred.select(['labelIndexer', 'prediction']).show(n=5)

+------------+----------+
|labelIndexer|prediction|
+------------+----------+
|         6.0|       6.0|
|         5.0|       5.0|
|         5.0|       5.0|
|         3.0|       3.0|
|         7.0|       7.0|
+------------+----------+
only showing top 5 rows



In [34]:
from pyspark.sql.functions import expr
# Flag each test row as 1 when the prediction matches the label, else 0,
# then count the misclassified rows.
z_is_correct = when(col('labelIndexer') == col('prediction'), 1).otherwise(0)
zcomp = zpred.withColumn('correct', z_is_correct)
zcomp.filter(col('correct') == 0).count()

5

In [35]:
# Overall accuracy after outlier removal: the mean of the 0/1 `correct` flag.
zcomp.agg(expr('avg(correct) as accuracy')).first()['accuracy']

0.9889380530973452

In [36]:
# Per-crop accuracy of the model trained on the outlier-filtered data.
# The set of label indices is taken from vztrain_data itself rather than from
# a `max_index` computed earlier on a different training split, so the loop
# cannot go out of step with this model's label indexing.
zlabel_names = {
    row['labelIndexer']: row['label']
    for row in vztrain_data.select('labelIndexer', 'label').distinct().collect()
}
# One grouped aggregation replaces the two Spark jobs per class that the loop
# previously launched.
zclass_accuracy = {
    row['labelIndexer']: row['accuracy']
    for row in zcomp.groupBy('labelIndexer').agg(expr('avg(correct) as accuracy')).collect()
}
for i in sorted(zlabel_names):
    # .get() yields None for a class with no rows in the test split.
    print(f'{zlabel_names[i]} : {zclass_accuracy.get(i)}')

lentil : 1.0
maize : 1.0
grapes : 1.0
kidneybeans : 1.0
rice : 0.9444444444444444
coconut : 1.0
mango : 1.0
apple : 1.0
banana : 1.0
blackgram : 0.9523809523809523
pomegranate : 1.0
watermelon : 1.0
cotton : 1.0
papaya : 1.0
coffee : 1.0
jute : 0.875
mungbean : 1.0
muskmelon : 1.0
orange : 1.0
pigeonpeas : 1.0
chickpea : 1.0
mothbeans : 1.0
