In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pandas as pd
import numpy as np
import json
import sys
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [17]:
df = pd.read_csv("C:/Users/User/Downloads/crop-recommendation-system-based-on-machine-learning-using-python-master/Data/yield_df.csv")

In [18]:
# Check the number of rows and columns in the dataframe
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

Number of rows: 28242
Number of columns: 6


In [19]:
print(df.columns)

Index(['Area', 'Item', 'hg/ha_yield', 'average_rain_fall_mm_per_year',
       'pesticides_tonnes', 'avg_temp'],
      dtype='object')


In [20]:
df = df.dropna()
df

Unnamed: 0,Area,Item,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,Albania,Maize,36613,1485,121.00,16.37
1,Albania,Potatoes,66667,1485,121.00,16.37
2,Albania,"Rice, paddy",23333,1485,121.00,16.37
3,Albania,Sorghum,12500,1485,121.00,16.37
4,Albania,Soybeans,7000,1485,121.00,16.37
...,...,...,...,...,...,...
28237,Zimbabwe,"Rice, paddy",22581,657,2550.07,19.76
28238,Zimbabwe,Sorghum,3066,657,2550.07,19.76
28239,Zimbabwe,Soybeans,13142,657,2550.07,19.76
28240,Zimbabwe,Sweet potatoes,22222,657,2550.07,19.76


In [21]:
# Separate the features and target variables
X = df.drop(['hg/ha_yield'], axis=1)
y = df['hg/ha_yield']


In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
print(X_train.shape)


(19769, 5)


In [24]:
# Categorical columns for one-hot encoding
categorical_cols = ['Area', 'Item']

In [25]:
# One-hot encode the categorical columns
#ohe = OneHotEncoder(handle_unknown='ignore')
#ohe.fit(X_train[categorical_cols])
# Convert categorical columns to one-hot encoding
X_train_categorical = pd.get_dummies(X_train[categorical_cols])
X_test_categorical = pd.get_dummies(X_test[categorical_cols])
# Combine the one-hot encoded categorical columns and numerical columns
X_train_final = pd.concat([X_train_categorical, X_train.drop(categorical_cols, axis=1)], axis=1)
X_test_final = pd.concat([X_test_categorical, X_test.drop(categorical_cols, axis=1)], axis=1)


In [26]:
#import joblib
#joblib.dump(ohe, 'ohe.joblib')

In [27]:
# Convert categorical columns to one-hot encoding
#X_train_categorical = ohe.transform(X_train[categorical_cols])
#X_test_categorical = ohe.transform(X_test[categorical_cols])

In [28]:
# Combine the one-hot encoded categorical columns and numerical columns
#X_train_final = np.hstack((X_train_categorical.toarray(), X_train.drop(categorical_cols, axis=1)))
#X_test_final = np.hstack((X_test_categorical.toarray(), X_test.drop(categorical_cols, axis=1)))


In [29]:
print(X_train_final.shape)

(19769, 114)


In [30]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
rf = RandomForestRegressor()

# Define the hyperparameter search space for randomized search
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Perform randomized search with cross-validation
rf_random = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=25, cv=5, n_jobs=-1)
rf_random.fit(X_train_final, y_train)

# Get the best hyperparameter values
best_params = rf_random.best_params_

# Train the random forest model with the best hyperparameter values
rf_best = RandomForestRegressor(**best_params)
rf_best.fit(X_train_final, y_train)

# Make predictions on the test set
y_pred = rf_best.predict(X_test_final)


In [31]:
y_pred

array([71083.10576105, 23796.30167189, 54058.8610119 , ...,
       22702.53960234, 69209.47928529, 23604.22348761])

In [32]:
best_params

{'bootstrap': False,
 'max_depth': 43,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 8,
 'n_estimators': 112}

In [33]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Calculate mean squared error, mean absolute error, and R-squared
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 220568844.24443772
Mean Absolute Error: 6722.620683595852
R-squared: 0.9702422471746419


In [34]:
model_cols = list(X_train_categorical.columns) + list(X_train.select_dtypes(include=['float64', 'int64']).columns)
print(model_cols)

['Area_Albania', 'Area_Algeria', 'Area_Angola', 'Area_Argentina', 'Area_Armenia', 'Area_Australia', 'Area_Austria', 'Area_Azerbaijan', 'Area_Bahamas', 'Area_Bahrain', 'Area_Bangladesh', 'Area_Belarus', 'Area_Belgium', 'Area_Botswana', 'Area_Brazil', 'Area_Bulgaria', 'Area_Burkina Faso', 'Area_Burundi', 'Area_Cameroon', 'Area_Canada', 'Area_Central African Republic', 'Area_Chile', 'Area_Colombia', 'Area_Croatia', 'Area_Denmark', 'Area_Dominican Republic', 'Area_Ecuador', 'Area_Egypt', 'Area_El Salvador', 'Area_Eritrea', 'Area_Estonia', 'Area_Finland', 'Area_France', 'Area_Germany', 'Area_Ghana', 'Area_Greece', 'Area_Guatemala', 'Area_Guinea', 'Area_Guyana', 'Area_Haiti', 'Area_Honduras', 'Area_Hungary', 'Area_India', 'Area_Indonesia', 'Area_Iraq', 'Area_Ireland', 'Area_Italy', 'Area_Jamaica', 'Area_Japan', 'Area_Kazakhstan', 'Area_Kenya', 'Area_Latvia', 'Area_Lebanon', 'Area_Lesotho', 'Area_Libya', 'Area_Lithuania', 'Area_Madagascar', 'Area_Malawi', 'Area_Malaysia', 'Area_Mali', 'Area_M

In [35]:
# Create a sample input
sample_input_dict = {
    'Area': 'India',
    'Item': 'Maize',
    'Area harvested': 1485,
    'Yield': 121,
    'Production': 16.37
}

# Convert the dictionary to a DataFrame
sample_input_df = pd.DataFrame(sample_input_dict, index=[0])

# Convert the categorical columns to one-hot encoding
sample_input_df = pd.get_dummies(sample_input_df, columns=['Area', 'Item'])

# Reorder the columns to match the training data
sample_input_df = sample_input_df.reindex(columns=model_cols, fill_value=0)

# Make the prediction
prediction = rf_random.predict(sample_input_df)

# Print the prediction
print(prediction[0])

40588.885607993216


In [36]:
import joblib

joblib.dump(rf_best, 'RandomForestR.joblib')

['RandomForestR.joblib']