<a href="https://colab.research.google.com/github/PhuriphatSei/Crop_yield_Prediction/blob/main/src_CPY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=eb8bd6c872d89961c2031846b9ee7573fa4d74a67139d40769a1f648dfc11544
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.ml.regression import LinearRegression
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Read CSV").getOrCreate()

In [None]:
df = spark.read.options(delimiter=",").csv("data.csv", header=True, inferSchema=True)
df.show()

+----------+------------+--------------+-------------+--------------+----------+--------------+
|Fertilizer|Nitrogen (N)|Phosphorus (P)|Potassium (K)|Rain Fall (mm)|Temperatue|Yeild (Q/acre)|
+----------+------------+--------------+-------------+--------------+----------+--------------+
|        50|          59|            19|           15|           410|        37|           6.0|
|        50|          60|            18|           15|           400|        39|           6.0|
|        50|          60|            18|           15|           400|        40|           6.0|
|        50|          60|            18|           15|           450|        37|           6.0|
|        50|          65|            18|           19|           425|        37|           9.0|
|        50|          65|            18|           19|           450|        37|           9.0|
|        50|          65|            18|           19|           500|        37|           9.0|
|        52|          62|            19|

In [None]:
selected_columns = ['Fertilizer', 'Nitrogen (N)', 'Phosphorus (P)', 'Potassium (K)', 'Rain Fall (mm)', 'Temperatue', 'Yeild (Q/acre)']
df_selected = df.select(*selected_columns)
assembler = VectorAssembler(inputCols=selected_columns, outputCol='features')
df_assembled = assembler.transform(df_selected)


In [None]:
correlation_matrix = Correlation.corr(df_assembled, 'features').collect()[0][0]
correlation_matrix_df = pd.DataFrame(correlation_matrix.toArray(), columns=selected_columns, index=selected_columns)
correlation_matrix_df

Unnamed: 0,Fertilizer,Nitrogen (N),Phosphorus (P),Potassium (K),Rain Fall (mm),Temperatue,Yeild (Q/acre)
Fertilizer,1.0,0.904937,0.816096,0.751365,0.8854,-0.863135,0.824444
Nitrogen (N),0.904937,1.0,0.779447,0.800065,0.871885,-0.84832,0.87273
Phosphorus (P),0.816096,0.779447,1.0,0.747676,0.772468,-0.776384,0.79207
Potassium (K),0.751365,0.800065,0.747676,1.0,0.804093,-0.775561,0.85134
Rain Fall (mm),0.8854,0.871885,0.772468,0.804093,1.0,-0.967736,0.882383
Temperatue,-0.863135,-0.84832,-0.776384,-0.775561,-0.967736,1.0,-0.893005
Yeild (Q/acre),0.824444,0.87273,0.79207,0.85134,0.882383,-0.893005,1.0


LinearRegression

In [None]:
label_column = 'Yeild (Q/acre)'

lr = LinearRegression(featuresCol='features', labelCol=label_column, maxIter=10, regParam=0.3, elasticNetParam=0.8)
param_grid = (ParamGridBuilder()
              .addGrid(lr.regParam, [0.01, 0.1, 0.5])
              .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
              .build())


evaluator = RegressionEvaluator(labelCol=label_column, predictionCol='prediction', metricName='rmse')

cross_validator = CrossValidator(estimator=lr,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator,
                                 numFolds=5,
                                 seed=42)
cv_model = cross_validator.fit(df_assembled)

best_lr_model = cv_model.bestModel

predictions = best_lr_model.transform(df_assembled)

rmse = evaluator.evaluate(predictions)
r2_linear = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})


In [None]:

print("Best Model Parameters: regParam={}, elasticNetParam={}".format(best_lr_model.getRegParam(), best_lr_model.getElasticNetParam()))
print("RMSE: %.3f" % rmse)
print("R2: %.3f" % r2_linear)

Best Model Parameters: regParam=0.01, elasticNetParam=0.0
RMSE: 0.027
R2: 1.000


RandomForestRegressor

In [None]:
rf = RandomForestRegressor(featuresCol='features', labelCol=label_column, numTrees=100, maxDepth=5, seed=42)

rfModel = rf.fit(df_assembled)

param_grid = (ParamGridBuilder()
              .addGrid(rf.numTrees, [50, 100, 150])
              .addGrid(rf.maxDepth, [5, 10, 15])
              .build())

evaluator = RegressionEvaluator(labelCol=label_column, predictionCol='prediction', metricName='rmse')

cross_validator = CrossValidator(estimator=rf,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator,
                                 numFolds=5,
                                 seed=42)

cv_model = cross_validator.fit(df_assembled)

best_rf_model = cv_model.bestModel

predictions = best_rf_model.transform(df_assembled)

rmse = evaluator.evaluate(predictions)
r2_RF = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})


In [None]:
print("Best Model Parameters: numTrees={}, maxDepth={}".format(best_rf_model.getNumTrees, best_rf_model.getMaxDepth))
print("RMSE: %.3f" % rmse)
print("R2: %.3f" % r2_RF)

Best Model Parameters: numTrees=50, maxDepth=<bound method _DecisionTreeParams.getMaxDepth of RandomForestRegressionModel: uid=RandomForestRegressor_aa9fa2a4fcb0, numTrees=50, numFeatures=7>
RMSE: 0.094
R2: 0.998


KNeighborsRegressor

In [None]:
pandas_df = df_selected.toPandas()

X = pandas_df.drop(columns=[label_column])
y = pandas_df[label_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn_regressor = KNeighborsRegressor(n_neighbors=5)

knn_regressor.fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2_KNR = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE): %.3f" % mse)
print("R-squared (R2): %.3f" % r2_KNR)

Mean Squared Error (MSE): 0.842
R-squared (R2): 0.746


In [None]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

knn_regressor = KNeighborsRegressor()

grid_search = GridSearchCV(estimator=knn_regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Estimator:", best_estimator)

Best Parameters: {'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best Estimator: KNeighborsRegressor(n_neighbors=3, p=1)


In [None]:
print("LinearRegression")
print("RMSE: %.3f" % rmse)
print("R2: %.3f" % r2_linear)

print("-----------------------------------")
print("RandomForestRegressor")
print("RMSE: %.3f" % rmse)
print("R2: %.3f" % r2_RF)

print("-----------------------------------")
print("KNeighborsRegressor")
print("Mean Squared Error (MSE): %.3f" % mse)
print("R-squared (R2): %.3f" % r2_KNR)


LinearRegression
RMSE: 0.094
R2: 1.000
-----------------------------------
RandomForestRegressor
RMSE: 0.094
R2: 0.998
-----------------------------------
KNeighborsRegressor
Mean Squared Error (MSE): 0.842
R-squared (R2): 0.746
