In [51]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

In [52]:
df = pd.read_csv('data/raw.csv')

In [53]:
df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


In [55]:
df['Season'].unique()

array(['Whole Year ', 'Kharif     ', 'Rabi       ', 'Autumn     ',
       'Summer     ', 'Winter     '], dtype=object)

In [56]:
df['Season'] = df['Season'].replace({
    'Whole Year ': 'Whole Year',
    'Kharif     ': 'Kharif',
    'Rabi       ': 'Rabi',
    'Autumn     ': 'Autumn',
    'Summer     ': 'Summer',
    'Winter     ': 'Winter'
})

In [57]:
# Drop unnecessary columns
features = df.drop(['Yield', 'Crop_Year'], axis=1)
target = df['Yield']

In [58]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=34)

In [59]:
# Define numerical and categorical features
numerical_features = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_features = ['Crop', 'Season', 'State']

In [60]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [61]:
# Models to evaluate
models = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor())
]

# Evaluate models using cross-validation and hyperparameter tuning
results = []
for name, model in models:
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # Perform cross-validation with mean squared error and R-squared as scoring metrics
    mse_scorer = make_scorer(mean_squared_error)
    r2_scorer = make_scorer(r2_score)
    cv_results_mse = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=mse_scorer)
    cv_results_r2 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=r2_scorer)
    
    # Store the results for comparison
    results.append({
        'model': name,
        'cv_results_mse': cv_results_mse,
        'mean_cv_mse': np.mean(cv_results_mse),
        'cv_results_r2': cv_results_r2,
        'mean_cv_r2': np.mean(cv_results_r2)
    })

# Display results
for result in results:
    print(f"Model: {result['model']}")
    print(f"Mean Cross-Validation MSE: {result['mean_cv_mse']:.4f}")
    print(f"Mean Cross-Validation R2: {result['mean_cv_r2']:.4f}")
    print("------")

# Identify the best model based on mean cross-validation MSE
best_model = min(results, key=lambda x: x['mean_cv_mse'])
print(f"Best Model: {best_model['model']}")
print(f"Best Model R2: {best_model['mean_cv_r2']:.4f}")

Model: Linear Regression
Mean Cross-Validation MSE: 125920.1939
Mean Cross-Validation R2: 0.8256
------
Model: Decision Tree
Mean Cross-Validation MSE: 51655.1187
Mean Cross-Validation R2: 0.9363
------
Model: Random Forest
Mean Cross-Validation MSE: 32891.3650
Mean Cross-Validation R2: 0.9615
------
Model: Gradient Boosting
Mean Cross-Validation MSE: 33534.4210
Mean Cross-Validation R2: 0.9493
------
Best Model: Random Forest
Best Model R2: 0.9615


In [62]:
# Define the parameter grid for Random Forest
param_grid = {
    'regressor__n_estimators': [50, 100, 150],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest model with the preprocessor in the pipeline
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_rf_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_predictions = best_rf_model.predict(X_test)
best_mse = mean_squared_error(y_test, best_predictions)

print(f"Best Model (Random Forest) Mean Squared Error on Test Set: {best_mse}")
print("Best Hyperparameters:", grid_search.best_params_)


Best Model (Random Forest) Mean Squared Error on Test Set: 20338.149971812607
Best Hyperparameters: {'regressor__max_depth': 20, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 50}


In [63]:
# Train the best model on the full training dataset
best_rf_model.fit(X_train, y_train)

In [64]:
# Make predictions on the test set using the best model
test_predictions = best_rf_model.predict(X_test)

# Evaluate the best model on the test set
test_mse = mean_squared_error(y_test, test_predictions)
test_r2 = r2_score(y_test, test_predictions)

print(f"Best Model (Random Forest) Mean Squared Error on Test Set: {test_mse:.4f}")
print(f"Best Model (Random Forest) R-squared on Test Set: {test_r2:.4f}")

Best Model (Random Forest) Mean Squared Error on Test Set: 20903.3153
Best Model (Random Forest) R-squared on Test Set: 0.9759


In [65]:
from joblib import dump
import os

preprocessor_filepath = os.path.join('preprocessor.joblib')
dump(preprocessor, 'preprocessor.joblib')

model_filepath = os.path.join('model.joblib')
dump(best_rf_model, model_filepath)
print(f"Best Model (Random Forest) saved to {model_filepath}")

Best Model (Random Forest) saved to model.joblib


In [67]:
from joblib import load

loaded_rf_model = load('model.joblib')

# Assuming new_data is your new dataset for prediction (replace it with your actual new data)
# Ensure that the new_data has the same features as the training data

test_data = pd.read_csv('data/test.csv')
test_data['Season'] = test_data['Season'].replace({
    'Whole Year ': 'Whole Year',
    'Kharif     ': 'Kharif',
    'Rabi       ': 'Rabi',
    'Autumn     ': 'Autumn',
    'Summer     ': 'Summer',
    'Winter     ': 'Winter'
})

# Make predictions using the loaded model
new_predictions = loaded_rf_model.predict(test_data.head())

# Display the predictions
print("Predicted Crop Yields:")
print(new_predictions)

Predicted Crop Yields:
[6.63050206 0.99068102 2.11431287 1.29342057 0.86387154]


In [68]:
df['Crop'].unique()

array(['Arecanut', 'Arhar/Tur', 'Castor seed', 'Coconut ', 'Cotton(lint)',
       'Dry chillies', 'Gram', 'Jute', 'Linseed', 'Maize', 'Mesta',
       'Niger seed', 'Onion', 'Other  Rabi pulses', 'Potato',
       'Rapeseed &Mustard', 'Rice', 'Sesamum', 'Small millets',
       'Sugarcane', 'Sweet potato', 'Tapioca', 'Tobacco', 'Turmeric',
       'Wheat', 'Bajra', 'Black pepper', 'Cardamom', 'Coriander',
       'Garlic', 'Ginger', 'Groundnut', 'Horse-gram', 'Jowar', 'Ragi',
       'Cashewnut', 'Banana', 'Soyabean', 'Barley', 'Khesari', 'Masoor',
       'Moong(Green Gram)', 'Other Kharif pulses', 'Safflower',
       'Sannhamp', 'Sunflower', 'Urad', 'Peas & beans (Pulses)',
       'other oilseeds', 'Other Cereals', 'Cowpea(Lobia)',
       'Oilseeds total', 'Guar seed', 'Other Summer Pulses', 'Moth'],
      dtype=object)