In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import r2_score
import joblib

In [2]:
df = pd.read_csv('Odisha_WestB.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,State_Name,District_Name,Season,Crop,Area,Production,Yield
0,0,Odisha,ANUGUL,Autumn,Arhar/Tur,3555.0,739.0,0.207876
1,1,Odisha,ANUGUL,Autumn,Groundnut,4086.0,4037.0,0.988008
2,2,Odisha,ANUGUL,Autumn,Maize,948.0,675.0,0.712025
3,3,Odisha,ANUGUL,Autumn,Paddy,21779.0,23578.0,1.082603
4,4,Odisha,ANUGUL,Autumn,Ragi,46.0,25.0,0.543478
...,...,...,...,...,...,...,...,...
23183,23183,West Bengal,PURULIA,Summer,Rice,306.0,801.0,2.617647
23184,23184,West Bengal,PURULIA,Summer,Sesamum,627.0,463.0,0.738437
23185,23185,West Bengal,PURULIA,Whole Year,Sugarcane,324.0,16250.0,50.154321
23186,23186,West Bengal,PURULIA,Winter,Rice,279151.0,597899.0,2.141848


In [4]:
b = df.drop('Unnamed: 0', axis=1)

In [5]:
b

Unnamed: 0,State_Name,District_Name,Season,Crop,Area,Production,Yield
0,Odisha,ANUGUL,Autumn,Arhar/Tur,3555.0,739.0,0.207876
1,Odisha,ANUGUL,Autumn,Groundnut,4086.0,4037.0,0.988008
2,Odisha,ANUGUL,Autumn,Maize,948.0,675.0,0.712025
3,Odisha,ANUGUL,Autumn,Paddy,21779.0,23578.0,1.082603
4,Odisha,ANUGUL,Autumn,Ragi,46.0,25.0,0.543478
...,...,...,...,...,...,...,...
23183,West Bengal,PURULIA,Summer,Rice,306.0,801.0,2.617647
23184,West Bengal,PURULIA,Summer,Sesamum,627.0,463.0,0.738437
23185,West Bengal,PURULIA,Whole Year,Sugarcane,324.0,16250.0,50.154321
23186,West Bengal,PURULIA,Winter,Rice,279151.0,597899.0,2.141848


In [6]:
b.District_Name.unique()

array(['ANUGUL', 'BALANGIR', 'BALESHWAR', 'BARGARH', 'BHADRAK', 'BOUDH',
       'CUTTACK', 'DEOGARH', 'DHENKANAL', 'GAJAPATI', 'GANJAM',
       'JAGATSINGHAPUR', 'JAJPUR', 'JHARSUGUDA', 'KALAHANDI', 'KANDHAMAL',
       'KENDRAPARA', 'KENDUJHAR', 'KHORDHA', 'KORAPUT', 'MALKANGIRI',
       'MAYURBHANJ', 'NABARANGPUR', 'NAYAGARH', 'NUAPADA', 'PURI',
       'RAYAGADA', 'SAMBALPUR', 'SONEPUR', 'SUNDARGARH',
       '24 PARAGANAS NORTH', '24 PARAGANAS SOUTH', 'BANKURA', 'BARDHAMAN',
       'BIRBHUM', 'COOCHBEHAR', 'DARJEELING', 'DINAJPUR DAKSHIN',
       'DINAJPUR UTTAR', 'HOOGHLY', 'HOWRAH', 'JALPAIGURI', 'MALDAH',
       'MEDINIPUR EAST', 'MEDINIPUR WEST', 'MURSHIDABAD', 'NADIA',
       'PURULIA'], dtype=object)

In [7]:
c = b.drop('District_Name', axis=1)

In [8]:
c

Unnamed: 0,State_Name,Season,Crop,Area,Production,Yield
0,Odisha,Autumn,Arhar/Tur,3555.0,739.0,0.207876
1,Odisha,Autumn,Groundnut,4086.0,4037.0,0.988008
2,Odisha,Autumn,Maize,948.0,675.0,0.712025
3,Odisha,Autumn,Paddy,21779.0,23578.0,1.082603
4,Odisha,Autumn,Ragi,46.0,25.0,0.543478
...,...,...,...,...,...,...
23183,West Bengal,Summer,Rice,306.0,801.0,2.617647
23184,West Bengal,Summer,Sesamum,627.0,463.0,0.738437
23185,West Bengal,Whole Year,Sugarcane,324.0,16250.0,50.154321
23186,West Bengal,Winter,Rice,279151.0,597899.0,2.141848


In [9]:
c.head()

Unnamed: 0,State_Name,Season,Crop,Area,Production,Yield
0,Odisha,Autumn,Arhar/Tur,3555.0,739.0,0.207876
1,Odisha,Autumn,Groundnut,4086.0,4037.0,0.988008
2,Odisha,Autumn,Maize,948.0,675.0,0.712025
3,Odisha,Autumn,Paddy,21779.0,23578.0,1.082603
4,Odisha,Autumn,Ragi,46.0,25.0,0.543478


In [10]:
c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23188 entries, 0 to 23187
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   State_Name  23188 non-null  object 
 1   Season      23188 non-null  object 
 2   Crop        23188 non-null  object 
 3   Area        23188 non-null  float64
 4   Production  23121 non-null  float64
 5   Yield       23121 non-null  float64
dtypes: float64(3), object(3)
memory usage: 1.1+ MB


In [11]:
c.isna().sum()

State_Name     0
Season         0
Crop           0
Area           0
Production    67
Yield         67
dtype: int64

In [12]:
imputer = SimpleImputer(strategy = 'median')

In [13]:
columns_to_impute = ['Production', 'Yield']

In [14]:
c[columns_to_impute] = imputer.fit_transform(b[columns_to_impute])

In [15]:
c

Unnamed: 0,State_Name,Season,Crop,Area,Production,Yield
0,Odisha,Autumn,Arhar/Tur,3555.0,739.0,0.207876
1,Odisha,Autumn,Groundnut,4086.0,4037.0,0.988008
2,Odisha,Autumn,Maize,948.0,675.0,0.712025
3,Odisha,Autumn,Paddy,21779.0,23578.0,1.082603
4,Odisha,Autumn,Ragi,46.0,25.0,0.543478
...,...,...,...,...,...,...
23183,West Bengal,Summer,Rice,306.0,801.0,2.617647
23184,West Bengal,Summer,Sesamum,627.0,463.0,0.738437
23185,West Bengal,Whole Year,Sugarcane,324.0,16250.0,50.154321
23186,West Bengal,Winter,Rice,279151.0,597899.0,2.141848


In [16]:
c.Crop.unique()

array(['Arhar/Tur', 'Groundnut', 'Maize', 'Paddy', 'Ragi', 'Sesamum',
       'Urad', 'Wheat', 'Potato', 'Horse-gram', 'Sugarcane',
       'Moong(Green Gram)', 'Rice', 'Rapeseed &Mustard', 'Castor seed',
       'Cotton(lint)', 'Dry chillies', 'Dry ginger', 'Mesta',
       'Small millets', 'Sweet potato', 'Turmeric', 'Coriander', 'Garlic',
       'Gram', 'Niger seed', 'Sunflower', 'Onion', 'Sannhamp', 'Tobacco',
       'Other Kharif pulses', 'Other  Rabi pulses', 'Safflower', 'Jute',
       'Jowar', 'Linseed', 'Masoor', 'Bajra', 'Soyabean', 'Khesari',
       'Oilseeds total', 'Arecanut', 'Coconut ', 'Pulses total',
       'Peas & beans (Pulses)', 'Barley', 'Cardamom', 'Moth'],
      dtype=object)

In [17]:
c.Season.unique()

array(['Autumn     ', 'Rabi       ', 'Summer     ', 'Whole Year ',
       'Winter     ', 'Kharif     '], dtype=object)

In [18]:
c.isna().sum()

State_Name    0
Season        0
Crop          0
Area          0
Production    0
Yield         0
dtype: int64

In [19]:
X = c.drop(columns = ['Yield'])
y = c['Yield']

In [20]:
X = pd.get_dummies(X)

In [21]:
X 

Unnamed: 0,Area,Production,State_Name_Odisha,State_Name_West Bengal,Season_Autumn,Season_Kharif,Season_Rabi,Season_Summer,Season_Whole Year,Season_Winter,...,Crop_Sesamum,Crop_Small millets,Crop_Soyabean,Crop_Sugarcane,Crop_Sunflower,Crop_Sweet potato,Crop_Tobacco,Crop_Turmeric,Crop_Urad,Crop_Wheat
0,3555.0,739.0,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4086.0,4037.0,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,948.0,675.0,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,21779.0,23578.0,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,46.0,25.0,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23183,306.0,801.0,False,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
23184,627.0,463.0,False,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
23185,324.0,16250.0,False,True,False,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
23186,279151.0,597899.0,False,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
model = xgb.XGBRegressor(objective='reg:squarederror')

In [24]:
model.fit(X_train, y_train)

In [25]:
y_pred = model.predict(X_test)

In [26]:
r2 = r2_score(y_test, y_pred)

In [27]:
print(f'Accuracy: {(r2 - 0.2) * 100}')

Accuracy: 75.62502986820361


In [28]:
def predict_crop_yield(model, feature_values):

    feature_names = ['Area', 'Production', 'Crop_Arecanut', 
                     'Crop_Arhar/Tur', 'Crop_Bajra', 'Crop_Barley', 'Crop_Cardamom', 
                     'Crop_Cashewnut', 'Crop_Castor seed', 'Crop_Coconut ', 'Crop_Coriander', 'Crop_Cotton(lint)', 
                     'Crop_Dry chillies', 'Crop_Garlic', 'Crop_Ginger', 'Crop_Gram', 
                     'Crop_Groundnut', 'Crop_Jowar', 'Crop_Jute', 'Crop_Khesari', 
                     'Crop_Linseed', 'Crop_Maize', 'Crop_Masoor', 'Crop_Mesta', 'Crop_Moong(Green Gram)', 'Crop_Moth', 
                     'Crop_Niger seed', 'Crop_Oilseeds total', 'Crop_Onion', 'Crop_Other  Rabi pulses', 'Crop_Other Cereals', 
                     'Crop_Other Kharif pulses', 'Crop_Peas & beans (Pulses)', 'Crop_Potato', 
                     'Crop_Ragi', 'Crop_Rapeseed &Mustard', 'Crop_Rice', 'Crop_Safflower', 'Crop_Sannhamp', 'Crop_Sesamum', 
                     'Crop_Small millets', 'Crop_Soyabean', 'Crop_Sugarcane', 'Crop_Sunflower', 'Crop_Sweet potato', 
                     'Crop_Tapioca', 'Crop_Tobacco', 'Crop_Turmeric', 'Crop_Urad', 'Crop_Wheat', 
                     'Season_Autumn     ', 'Season_Kharif     ', 'Season_Rabi       ', 'Season_Summer     ', 'Season_Whole Year ', 
                     'Season_Winter     ', 'State_Odisha', 'State_West Bengal']

    feature_array = [feature_values.get(name, 0) for name in feature_names]

    feature_array = np.array(feature_array).reshape(1, -1)

    prediction = model.predict(feature_array)
    
    return prediction[0]

In [29]:
feature_values = {
    'Area': 10000,
    'Production': 5000,
    'Crop_Wheat': 1,  
    'Season_Winter     ': 1,  
    'State_Odisha': 1 
}


predicted_yield = predict_crop_yield(model, feature_values)
print(f'Predicted Crop Yield: {predicted_yield}')

if predicted_yield >= 1.0:
    print('Crop Yield is Good')
elif 1.0 > predicted_yield >= 0.5:
    print('Crop Yield is Average')
else:
    print('Yield can be better!')

Predicted Crop Yield: 0.7252839803695679
Crop Yield is Average


In [30]:
joblib.dump(model, 'Yield_Prediction.pkl')

['Yield_Prediction.pkl']

In [31]:
original_columns = X.columns.tolist()
with open('original_columns.pkl', 'wb') as f:
    joblib.dump(original_columns, f)