<a href="https://colab.research.google.com/github/Sujanasri24/NPK-Prediction-using-ML/blob/main/Randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Read the crop-recommendation CSV and discard the 'label' column
# (the crop name), which is not used for prediction in this notebook.
df = pd.read_csv('Crop_recommendation.csv').drop(columns='label')

In [None]:
# Coerce every column to a numeric dtype
df = df.apply(pd.to_numeric)

# Hold out 25% of the rows for testing. The N, P and K columns are the
# targets; every remaining column is used as a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['N', 'P', 'K']),
    df[['N', 'P', 'K']],
    test_size=0.25,
    random_state=42,
)


In [None]:
# Fit the scaler on the training split only, so no statistics from the
# test split leak into the preprocessing
scaler = StandardScaler().fit(X_train)

# Apply the training-derived scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline random forest: 100 trees, depth capped at 5, fixed seed
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)

# Train on the scaled training features against the N, P, K targets
model.fit(X_train_scaled, y_train)


In [None]:
# Predict the targets for the held-out rows
y_pred = model.predict(X_test_scaled)

# Per-target mean squared error. The column-wise reduction is requested
# explicitly with DataFrame.mean(axis=0): np.mean(DataFrame) is
# pandas-version dependent (it emits a warning on older releases and
# collapses to a single scalar on newer ones).
mse = ((y_test - y_pred) ** 2).mean(axis=0)

# Show the MSE of each target column
print('Mean squared error:', mse)

Mean squared error: N    438.210653
P    443.593747
K    742.392407
dtype: float64


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [None]:
# One unseen sample, with values in the same order as the columns of X_train
# (presumably temperature, humidity, ph, rainfall -- TODO confirm against the CSV)
unknown_input = np.array([25, 60, 7.5, 100])

# Shape it as a single row: one sample, four features
unknown_input = np.reshape(unknown_input, (1, 4))

# The model was fitted on standardized features, so the sample has to go
# through the same fitted scaler before prediction; feeding raw values would
# place it on a completely different scale than the training data.
# The DataFrame wrapper reuses the training column names the scaler was fitted with.
unknown_input_scaled = scaler.transform(
    pd.DataFrame(unknown_input, columns=X_train.columns)
)

# Predict N, P and K for the scaled sample
prediction = model.predict(unknown_input_scaled)

# Show the prediction
print('Predicted NPK values:', prediction)



Predicted NPK values: [[40.99265525 37.89812388 36.79811817]]


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the scaled training split. The metric is named
# explicitly: for a regressor the default score is R^2, not an accuracy.
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')

# Average the per-fold R^2 values
avg_r2 = np.mean(cv_scores)

# Report the mean cross-validated R^2
print('Average cross-validated R^2:', avg_r2)


Average accuracy: 0.6494948029878109


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Candidate values for each hyperparameter (3 x 3 = 9 combinations)
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 20]
}

# Base estimator for the search. The seed is fixed so that repeated runs of
# the search evaluate identical forests and select the same hyperparameters.
model = RandomForestRegressor(random_state=42)

# 5-fold grid search scored by negated MSE (higher is better)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Run the search on the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Keep the winning combination for the next cell
best_params = grid_search.best_params_

# Print the best hyperparameters
print('Best hyperparameters:', best_params)


Best hyperparameters: {'max_depth': 20, 'n_estimators': 200}


In [None]:
# Build the final forest from the hyperparameters selected by the grid search
# (best_params) instead of hard-coding them, so this cell cannot drift out of
# sync with the search result. The seed is fixed for reproducibility.
model = RandomForestRegressor(**best_params, random_state=42)

# Fit the tuned model to the scaled training data
model.fit(X_train_scaled, y_train)


In [None]:
# Predict the targets for the held-out rows with the tuned model
y_pred = model.predict(X_test_scaled)

# Per-target mean squared error, reduced column-wise via DataFrame.mean(axis=0).
# np.mean(DataFrame) is avoided because its result depends on the pandas
# version (warning on older releases, a single scalar on newer ones).
mse = ((y_test - y_pred) ** 2).mean(axis=0)

# Show the MSE of each target column
print('Mean squared error:', mse)

Mean squared error: N    217.246443
P    222.959679
K    346.481096
dtype: float64


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned model. The scaled training features are
# used, matching the data the model is fitted and evaluated on everywhere else
# in the notebook. The metric is named explicitly: a regressor's default score
# is R^2, not an accuracy.
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')

# Average the per-fold R^2 values
avg_r2 = np.mean(cv_scores)

# Report the mean cross-validated R^2
print('Average cross-validated R^2:', avg_r2)


Average accuracy: 0.8205849470234403


In [None]:
from sklearn.metrics import r2_score

# R^2 on the held-out test set. The result gets its own name so that the
# imported r2_score function is not overwritten by a float.
r2 = r2_score(y_test, y_pred)

# Print the coefficient of determination
print('Coefficient of determination:', r2)


Coefficient of determination: 0.8253155930991539


In [None]:
from sklearn.metrics import median_absolute_error

# Median of the absolute prediction errors on the test split
mae = median_absolute_error(y_true=y_test, y_pred=y_pred)

# Report the value
print('Median absolute error:', mae)


Median absolute error: 7.000773809523811


In [None]:
from sklearn.metrics import mean_squared_log_error

# RMSLE: square root of the mean squared logarithmic error on the test split
rmsle = np.sqrt(
    mean_squared_log_error(y_true=y_test, y_pred=y_pred)
)

# Report the value
print('Root mean squared logarithmic error:', rmsle)


Root mean squared logarithmic error: 0.46659915608089325


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R^2 of the current model. scoring='r2' is the plain
# coefficient of determination; no adjustment for the number of predictors is
# applied, so the result is not labelled "adjusted".
r2_cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')

# Print the mean of the per-fold R^2 values
print('Average cross-validated R-squared:', np.mean(r2_cv_scores))


Average adjusted R-squared: 0.8205849470234403


In [None]:
from sklearn.metrics import r2_score

# R^2 over all target columns, combined with equal weight per target
multi_output_r2_score = r2_score(
    y_true=y_test,
    y_pred=y_pred,
    multioutput='uniform_average',
)

# Report the value
print('Multi-output R-squared:', multi_output_r2_score)


Multi-output R-squared: 0.8253155930991539


In [None]:
import numpy as np

def remove_missing_values(data):
    """Return ``data`` without the rows that contain at least one NaN.

    ``data`` is a 2-D numeric array or DataFrame; the result is the same kind
    of object, selected with a boolean row mask.
    """
    # True for every row in which no element is NaN
    complete_rows = ~np.isnan(data).any(axis=1)
    return data[complete_rows]

def _remove_missing_pairs(X, y):
    """Drop every sample that has a NaN in either its features or its targets.

    X and y are filtered together on one combined frame: filtering them
    separately would remove different rows from each and break the row-wise
    pairing between a sample and its targets.
    """
    combined = remove_missing_values(pd.concat([X, y], axis=1))
    return combined[X.columns], combined[y.columns]

# Remove incomplete samples from the training data
X_train_clean, y_train_clean = _remove_missing_pairs(X_train, y_train)

# Remove incomplete samples from the test data
X_test_clean, y_test_clean = _remove_missing_pairs(X_test, y_test)


In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation across all feature and target columns of the
# cleaned training data; both results are square matrices over those columns
spearman_r, spearman_p = spearmanr(a=X_train_clean, b=y_train_clean)

# Show the correlation matrix, then the matching p-values
print("Spearman's rank correlation coefficient:", spearman_r)
print("Spearman's p-value:", spearman_p)


Spearman's rank correlation coefficient: [[ 1.          0.12305039  0.02094176 -0.15422018  0.03409756 -0.14329781
  -0.07140671]
 [ 0.12305039  1.         -0.01391603  0.09504539  0.04986414 -0.32098804
   0.27591986]
 [ 0.02094176 -0.01391603  1.         -0.1741467   0.14214602 -0.13345639
  -0.18278366]
 [-0.15422018  0.09504539 -0.1741467   1.          0.00957292 -0.03581847
   0.06911869]
 [ 0.03409756  0.04986414  0.14214602  0.00957292  1.         -0.16326224
   0.20276502]
 [-0.14329781 -0.32098804 -0.13345639 -0.03581847 -0.16326224  1.
   0.1943813 ]
 [-0.07140671  0.27591986 -0.18278366  0.06911869  0.20276502  0.1943813
   1.        ]]
Spearman's p-value: [[0.00000000e+00 5.34192169e-07 3.95265647e-01 3.02649972e-10
  1.66235293e-01 5.01811127e-09 3.70716163e-03]
 [5.34192169e-07 0.00000000e+00 5.72161769e-01 1.10366774e-04
  4.28452096e-02 7.45287984e-41 3.25194149e-30]
 [3.95265647e-01 5.72161769e-01 0.00000000e+00 1.05599111e-12
  6.66748287e-09 5.29013259e-08 7.30893426

In [None]:
import numpy as np

def remove_duplicate_values(data):
    """Return the distinct rows of ``data`` as a NumPy array.

    Rows come back in sorted order (the ordering produced by ``np.unique``),
    not in their original order.
    """
    distinct_rows = np.unique(data, axis=0)
    return distinct_rows

def _remove_duplicate_pairs(X, y):
    """Drop duplicated (features, targets) samples while keeping X and y paired.

    X and y are stacked side by side and de-duplicated as whole rows.
    De-duplicating them separately would sort and shrink each one
    independently, so row i of X would no longer belong to row i of y.
    Returns two NumPy arrays with the same number of rows; because of the
    stacking they share one common dtype.
    """
    n_features = np.shape(X)[1]
    stacked = np.hstack([np.asarray(X), np.asarray(y)])
    unique_rows = remove_duplicate_values(stacked)
    return unique_rows[:, :n_features], unique_rows[:, n_features:]

# Remove duplicate samples from the training data
X_train_clean, y_train_clean = _remove_duplicate_pairs(X_train, y_train)

# Remove duplicate samples from the test data
X_test_clean, y_test_clean = _remove_duplicate_pairs(X_test, y_test)

In [None]:
def resize_arrays(X, y):
    """Force ``X`` and ``y`` into the common shape ``(X.shape[0], y.shape[1])``.

    Returns the pair ``(X_resized, y_resized)`` as NumPy arrays.

    NOTE(review): np.resize refills the new shape from the flattened input,
    truncating or repeating values as needed. When X has a different number of
    columns than y, the rows of X_resized therefore no longer correspond to
    the original samples -- confirm this is what downstream code expects.
    """
    target_shape = (X.shape[0], y.shape[1])
    return tuple(np.resize(arr, target_shape) for arr in (X, y))

# Bring the training features and targets to a common shape
X_train_resized, y_train_resized = resize_arrays(X=X_train, y=y_train)

# Same for the test split
X_test_resized, y_test_resized = resize_arrays(X=X_test, y=y_test)

In [None]:
from scipy.stats import kendalltau

# Kendall's tau for every (feature, target) pair, computed on the original,
# row-aligned training columns. Passing whole 2-D arrays to kendalltau pools
# all columns into one flattened sequence, which yields a single number that
# mixes unrelated variables; one test per column pair keeps each result
# interpretable.
kendall_tau = pd.DataFrame(index=X_train.columns, columns=y_train.columns, dtype=float)
kendall_p = pd.DataFrame(index=X_train.columns, columns=y_train.columns, dtype=float)
for feature in X_train.columns:
    for target in y_train.columns:
        tau, p_value = kendalltau(X_train[feature], y_train[target])
        kendall_tau.loc[feature, target] = tau
        kendall_p.loc[feature, target] = p_value

# Print the coefficients and p-values (rows: features, columns: targets)
print('Kendall\'s tau rank correlation coefficient:', kendall_tau, sep='\n')
print('Kendall\'s p-value:', kendall_p, sep='\n')


Kendall's tau rank correlation coefficient: 0.020517166906105028
Kendall's p-value: 0.03131911984319361
