In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


In [3]:
# Load the raw crop-production records from the CSV file.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact location exists.
csv_path = "C:/Users/user/Informatics/Project/crop_production.csv"
df = pd.read_csv(csv_path)

## Data cleaning 

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [5]:
# Summarise column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172377 entries, 0 to 172376
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     172377 non-null  object 
 1   District_Name  172377 non-null  object 
 2   Crop_Year      172377 non-null  int64  
 3   Season         172377 non-null  object 
 4   Crop           172377 non-null  object 
 5   Area           172377 non-null  float64
 6   Production     172377 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 9.2+ MB


In [6]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [7]:
# Number of rows per state; used by the next cell to drop rarely seen states.
state_counts= df['State_Name'].value_counts()

In [8]:
# Keep only the states that occur in at least 1000 rows.
well_covered_states = state_counts[state_counts >= 1000].index
state_mask = df['State_Name'].isin(well_covered_states)
df = df.loc[state_mask]

In [9]:
# Show the per-state row counts that remain after the state filter.
df['State_Name'].value_counts()

State_Name
Uttar Pradesh        22707
Madhya Pradesh       14834
Karnataka            14236
Assam                14088
Bihar                13643
Andhra Pradesh        9460
Maharashtra           9228
Chhattisgarh          9077
Tamil Nadu            8081
Rajasthan             8076
West Bengal           6338
Gujarat               6255
Odisha                5653
Uttarakhand           3769
Telangana             3380
Nagaland              3367
Haryana               2916
Arunachal Pradesh     2545
Meghalaya             2228
Punjab                2195
Kerala                2012
Himachal Pradesh      1958
Tripura               1240
Manipur               1080
Name: count, dtype: int64

In [10]:
# Number of rows per crop year; used by the next cell to drop sparse years.
year_counts= df['Crop_Year'].value_counts()

In [11]:
# Keep only the crop years that occur in at least 100 rows.
well_covered_years = year_counts[year_counts >= 100].index
year_mask = df['Crop_Year'].isin(well_covered_years)
df = df.loc[year_mask]

In [12]:
# Show the per-year row counts that remain after the year filter.
df['Crop_Year'].value_counts()

Crop_Year
2002    10958
2003    10914
2008     9915
2009     9833
2011     9808
2007     9805
2013     9711
2006     9685
2012     9582
2004     9559
2000     9541
2010     9482
2001     9314
2005     9257
1999     8735
1998     8106
2014     7834
1997     6228
Name: count, dtype: int64

In [13]:
# Number of rows per distinct Area value; used by the next cell's filter.
area_counts=df['Area'].value_counts()

In [14]:
# Keep only rows whose exact Area value occurs at least 12 times.
# NOTE(review): Area is a float64 measurement, so this keeps only values that
# happen to repeat exactly rather than a range of areas - confirm that
# discarding the remaining rows is intended.
repeated_areas = area_counts[area_counts >= 12].index
area_mask = df['Area'].isin(repeated_areas)
df = df.loc[area_mask]

In [15]:
# Show how often each remaining Area value occurs after the area filter.
df['Area'].value_counts()

Area
1.0       2930
2.0       2712
100.0     2477
3.0       2169
4.0       1888
          ... 
1453.0      12
1277.0      12
1827.0      12
1929.0      12
1684.0      12
Name: count, Length: 1785, dtype: int64

In [16]:
# Number of rows per crop; used by the next cell to drop rarely seen crops.
crop_counts=df['Crop'].value_counts()

In [17]:
# Keep only the crops that occur in at least 1000 rows.
well_covered_crops = crop_counts[crop_counts >= 1000].index
crop_mask = df['Crop'].isin(well_covered_crops)
df = df.loc[crop_mask]

In [18]:
# Catch-all crop labels that the next cell removes from the data.
# The double space in "Other  Rabi pulses" is part of the label being matched.
exclude_crops = [
    "Other  Rabi pulses",
    "Other Kharif pulses",
]


In [19]:
# Drop every row whose crop is one of the excluded catch-all labels.
is_excluded_crop = df['Crop'].isin(exclude_crops)
df = df.loc[~is_excluded_crop]

In [20]:
# Show the per-crop row counts that remain after the crop filters.
df['Crop'].value_counts()

Crop
Moong(Green Gram)        6901
Maize                    6765
Sesamum                  5913
Urad                     5724
Dry chillies             5201
Groundnut                4785
Onion                    4729
Sunflower                4369
Arhar/Tur                4323
Gram                     3789
Turmeric                 3697
Sweet potato             3593
Jowar                    3576
Linseed                  3542
Small millets            3467
Rapeseed &Mustard        3325
Rice                     3305
Peas & beans (Pulses)    3297
Coriander                3001
Bajra                    2962
Garlic                   2818
Castor seed              2766
Potato                   2700
Dry ginger               2561
Ragi                     2532
Barley                   2488
Masoor                   2477
Sugarcane                2307
Horse-gram               2273
Tobacco                  2177
Cotton(lint)             2053
Sannhamp                 2030
Soyabean                 1716
Niger

In [21]:
# Number of rows per district; used by the next cell to drop sparse districts.
district_counts=df['District_Name'].value_counts()

In [22]:
# Keep only the districts that occur in at least 200 rows.
well_covered_districts = district_counts[district_counts >= 200].index
district_mask = df['District_Name'].isin(well_covered_districts)
df = df.loc[district_mask]

In [23]:
# Show the per-district row counts that remain after the district filter.
df['District_Name'].value_counts()

District_Name
BILASPUR            536
AURANGABAD          517
SHIMOGA             494
BENGALURU URBAN     492
BANGALORE RURAL     489
                   ... 
SHIVPURI            202
MADURAI             201
DATIA               201
BETUL               200
WEST KHASI HILLS    200
Name: count, Length: 321, dtype: int64

In [24]:
# Number of rows per season; used by the next cell to drop rare seasons.
season_counts= df['Season'].value_counts()

In [25]:
# Keep only the seasons that occur in at least 1000 rows.
well_covered_seasons = season_counts[season_counts >= 1000].index
season_mask = df['Season'].isin(well_covered_seasons)
df = df.loc[season_mask]

In [26]:
# Show the per-season row counts that remain after the season filter.
df['Season'].value_counts()

Season
Kharif         37589
Rabi           26458
Whole Year     18799
Summer          7391
Name: count, dtype: int64

In [27]:
# Re-check row count, dtypes and non-null counts after all filters above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90237 entries, 202 to 172375
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   State_Name     90237 non-null  object 
 1   District_Name  90237 non-null  object 
 2   Crop_Year      90237 non-null  int64  
 3   Season         90237 non-null  object 
 4   Crop           90237 non-null  object 
 5   Area           90237 non-null  float64
 6   Production     90237 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 5.5+ MB


In [28]:
# Display the filtered frame (the notebook shows a truncated head/tail view).
df

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
202,Andhra Pradesh,ANANTAPUR,1997,Kharif,Bajra,1400.0,500.0
203,Andhra Pradesh,ANANTAPUR,1997,Kharif,Castor seed,1000.0,100.0
204,Andhra Pradesh,ANANTAPUR,1997,Kharif,Cotton(lint),7300.0,9400.0
205,Andhra Pradesh,ANANTAPUR,1997,Kharif,Dry chillies,3700.0,7100.0
207,Andhra Pradesh,ANANTAPUR,1997,Kharif,Horse-gram,3300.0,1000.0
...,...,...,...,...,...,...,...
172369,West Bengal,PURULIA,2014,Rabi,Peas & beans (Pulses),12.0,12.0
172372,West Bengal,PURULIA,2014,Rabi,Urad,220.0,113.0
172373,West Bengal,PURULIA,2014,Summer,Rice,306.0,801.0
172374,West Bengal,PURULIA,2014,Summer,Sesamum,627.0,463.0


## Encoding

#### Converting categorical columns to numerical codes

In [29]:
# Encode State_Name as integer codes and keep the fitted encoder for reuse.
le_State_Name=LabelEncoder()
# Guard against re-running this cell: once the column holds integer codes,
# fitting again would replace the label-to-code mapping with code-to-code,
# and the encoder saved later could no longer translate state names.
if pd.api.types.is_numeric_dtype(df['State_Name']):
    raise RuntimeError(
        "State_Name is already encoded; rebuild df from the loading cell before re-running this cell."
    )
# df is the result of boolean-mask filtering, so build a new frame with
# assign() instead of writing a column into it (avoids chained-assignment
# warnings). The column keeps its original position.
# Labels are encoded verbatim, including any trailing whitespace they carry.
df = df.assign(State_Name=le_State_Name.fit_transform(df['State_Name']))
state_name_mapping = dict(zip(le_State_Name.classes_, le_State_Name.transform(le_State_Name.classes_)))
print("State_Name mapping:", state_name_mapping)

State_Name mapping: {'Andhra Pradesh': 0, 'Assam': 1, 'Bihar': 2, 'Chhattisgarh': 3, 'Gujarat': 4, 'Himachal Pradesh': 5, 'Karnataka': 6, 'Madhya Pradesh': 7, 'Maharashtra': 8, 'Meghalaya': 9, 'Nagaland': 10, 'Rajasthan': 11, 'Tamil Nadu': 12, 'Telangana ': 13, 'Tripura': 14, 'Uttar Pradesh': 15, 'Uttarakhand': 16, 'West Bengal': 17}


In [30]:
# Encode District_Name as integer codes and keep the fitted encoder for reuse.
le_District_Name=LabelEncoder()
# Guard against re-running this cell: once the column holds integer codes,
# fitting again would replace the label-to-code mapping with code-to-code,
# and the encoder saved later could no longer translate district names.
if pd.api.types.is_numeric_dtype(df['District_Name']):
    raise RuntimeError(
        "District_Name is already encoded; rebuild df from the loading cell before re-running this cell."
    )
# df is the result of boolean-mask filtering, so build a new frame with
# assign() instead of writing a column into it (avoids chained-assignment
# warnings). The column keeps its original position.
df = df.assign(District_Name=le_District_Name.fit_transform(df['District_Name']))
district_name_mapping = dict(zip(le_District_Name.classes_, le_District_Name.transform(le_District_Name.classes_)))
print("District_Name mapping:", district_name_mapping)

District_Name mapping: {'24 PARAGANAS SOUTH': 0, 'ADILABAD': 1, 'AGRA': 2, 'AHMADABAD': 3, 'AHMEDNAGAR': 4, 'ALIGARH': 5, 'ALLAHABAD': 6, 'AMBEDKAR NAGAR': 7, 'AMRAVATI': 8, 'AMRELI': 9, 'AMROHA': 10, 'ANANTAPUR': 11, 'ARARIA': 12, 'ARWAL': 13, 'AURAIYA': 14, 'AURANGABAD': 15, 'AZAMGARH': 16, 'BAGALKOT': 17, 'BAGESHWAR': 18, 'BAGHPAT': 19, 'BAHRAICH': 20, 'BAKSA': 21, 'BALAGHAT': 22, 'BALLIA': 23, 'BALRAMPUR': 24, 'BANDA': 25, 'BANGALORE RURAL': 26, 'BANKA': 27, 'BANKURA': 28, 'BANSWARA': 29, 'BARABANKI': 30, 'BARAN': 31, 'BARDHAMAN': 32, 'BAREILLY': 33, 'BARPETA': 34, 'BARWANI': 35, 'BASTAR': 36, 'BASTI': 37, 'BEED': 38, 'BEGUSARAI': 39, 'BELGAUM': 40, 'BELLARY': 41, 'BENGALURU URBAN': 42, 'BETUL': 43, 'BHAGALPUR': 44, 'BHANDARA': 45, 'BHARATPUR': 46, 'BHARUCH': 47, 'BHAVNAGAR': 48, 'BHOJPUR': 49, 'BHOPAL': 50, 'BIDAR': 51, 'BIJAPUR': 52, 'BIJNOR': 53, 'BILASPUR': 54, 'BIRBHUM': 55, 'BONGAIGAON': 56, 'BUDAUN': 57, 'BULANDSHAHR': 58, 'BUXAR': 59, 'CACHAR': 60, 'CHAMARAJANAGAR': 61, 'CH

In [31]:
# Encode Season as integer codes and keep the fitted encoder for reuse.
le_Season=LabelEncoder()
# Guard against re-running this cell: once the column holds integer codes,
# fitting again would replace the label-to-code mapping with code-to-code,
# and the encoder saved later could no longer translate season names.
if pd.api.types.is_numeric_dtype(df['Season']):
    raise RuntimeError(
        "Season is already encoded; rebuild df from the loading cell before re-running this cell."
    )
# df is the result of boolean-mask filtering, so build a new frame with
# assign() instead of writing a column into it (avoids chained-assignment
# warnings). The column keeps its original position.
# Labels are encoded verbatim: the Season values are padded with trailing
# spaces, so inputs passed to le_Season.transform must carry the same padding.
df = df.assign(Season=le_Season.fit_transform(df['Season']))
season_mapping = dict(zip(le_Season.classes_, le_Season.transform(le_Season.classes_)))
print("Season mapping:", season_mapping)

Season mapping: {'Kharif     ': 0, 'Rabi       ': 1, 'Summer     ': 2, 'Whole Year ': 3}


In [32]:
# Encode Crop as integer codes and keep the fitted encoder for reuse.
le_Crop=LabelEncoder()
# Guard against re-running this cell: once the column holds integer codes,
# fitting again would replace the label-to-code mapping with code-to-code,
# and the encoder saved later could no longer translate crop names.
if pd.api.types.is_numeric_dtype(df['Crop']):
    raise RuntimeError(
        "Crop is already encoded; rebuild df from the loading cell before re-running this cell."
    )
# df is the result of boolean-mask filtering, so build a new frame with
# assign() instead of writing a column into it (avoids chained-assignment
# warnings). The column keeps its original position.
df = df.assign(Crop=le_Crop.fit_transform(df['Crop']))
crop_mapping = dict(zip(le_Crop.classes_, le_Crop.transform(le_Crop.classes_)))
print("Crop mapping:", crop_mapping)

Crop mapping: {'Arhar/Tur': 0, 'Bajra': 1, 'Banana': 2, 'Barley': 3, 'Castor seed': 4, 'Coriander': 5, 'Cotton(lint)': 6, 'Dry chillies': 7, 'Dry ginger': 8, 'Garlic': 9, 'Gram': 10, 'Groundnut': 11, 'Horse-gram': 12, 'Jowar': 13, 'Linseed': 14, 'Maize': 15, 'Masoor': 16, 'Mesta': 17, 'Moong(Green Gram)': 18, 'Niger seed': 19, 'Onion': 20, 'Peas & beans (Pulses)': 21, 'Potato': 22, 'Ragi': 23, 'Rapeseed &Mustard': 24, 'Rice': 25, 'Sannhamp': 26, 'Sesamum': 27, 'Small millets': 28, 'Soyabean': 29, 'Sugarcane': 30, 'Sunflower': 31, 'Sweet potato': 32, 'Tobacco': 33, 'Turmeric': 34, 'Urad': 35, 'Wheat': 36}


In [33]:
# Preview the first five rows now that the categorical columns hold codes.
df.head(5)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
202,0,11,1997,0,1,1400.0,500.0
203,0,11,1997,0,4,1000.0,100.0
204,0,11,1997,0,6,7300.0,9400.0
205,0,11,1997,0,7,3700.0,7100.0
207,0,11,1997,0,12,3300.0,1000.0


In [34]:
# Persist each fitted label encoder to "<name>.joblib" in the working
# directory so the same label-to-code mappings can be reloaded later.
label_encoders = dict(
    le_State_Name=le_State_Name,
    le_District_Name=le_District_Name,
    le_Season=le_Season,
    le_Crop=le_Crop,
)

for encoder_name, encoder in label_encoders.items():
    target_file = f"{encoder_name}.joblib"
    joblib.dump(encoder, target_file)

## Modeling

In [35]:

# Production is the regression target; every other column is a feature.
y = df['Production']
x = df.drop(columns=['Production'])


In [36]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [37]:
# Fit a random forest regressor on the training split; fit() returns the
# estimator itself, so it can be bound in the same statement.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
# Leave the fitted estimator as the cell's displayed result.
model


In [38]:
# Persist the fitted regressor; joblib.dump returns the list of files written,
# which the cell displays.
model_file = "crop_production_model.joblib"
joblib.dump(model, model_file)

['crop_production_model.joblib']

In [39]:
# Score the held-out rows: predict on the test split and report R^2.
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R^2 Score: {r2}')

R^2 Score: 0.8105704942933495


In [40]:
# New data point, built as a one-row DataFrame with the same column names and
# order as the training features. The model was fitted on a DataFrame, so
# predicting from a bare NumPy array drops the feature names (scikit-learn
# warns about this), and a mixed-type np.array silently turns every value
# into a string.
# Categorical labels must match the fitted encoder classes exactly; the Season
# classes are padded with trailing spaces, hence "Kharif     ".
# NOTE(review): Crop_Year 2023 lies outside the 1997-2014 range present in the
# training data - confirm that predicting for unseen years is meaningful.
x_pred = pd.DataFrame(
    [["Andhra Pradesh", "KADAPA", 2023, "Kharif     ", "Sunflower", 56100]],
    columns=["State_Name", "District_Name", "Crop_Year", "Season", "Crop", "Area"],
)

# Applying label encoding to the new data with the encoders fitted above
x_pred_encoded = x_pred.copy()
column_encoders = {
    "State_Name": le_State_Name,
    "District_Name": le_District_Name,
    "Season": le_Season,
    "Crop": le_Crop,
}
for column_name, column_encoder in column_encoders.items():
    # transform() raises ValueError for a label the encoder has never seen.
    x_pred_encoded[column_name] = column_encoder.transform(x_pred[column_name])

# Converting the encoded data to numeric values
x_pred_encoded = x_pred_encoded.astype(float)

# Making predictions using the trained model
# (this rebinds y_pred, which previously held the test-set predictions)
y_pred = model.predict(x_pred_encoded)

print(f'Predicted Production: {y_pred[0]}')


Predicted Production: 52971.12




In [None]:
def predict_production(state, district, year, season, crop, area):
    """Predict Production for one record with the fitted encoders and model.

    Parameters
    ----------
    state, district, season, crop : str
        Categorical labels. Each is matched against the classes of the
        corresponding fitted encoder, exactly first and otherwise ignoring
        surrounding whitespace (some fitted classes carry trailing spaces).
    year : int
        Value for the Crop_Year feature.
    area : float
        Value for the Area feature.

    Returns
    -------
    float
        The model's prediction for the single record.

    Raises
    ------
    ValueError
        If a label matches no encoder class, or matches more than one class
        once whitespace is ignored.
    """

    def encode(encoder, label):
        # A label's code is its position in the encoder's classes_ array.
        known = list(encoder.classes_)
        if label in known:
            return known.index(label)
        trimmed = [str(item).strip() for item in known]
        wanted = str(label).strip()
        if trimmed.count(wanted) == 1:
            return trimmed.index(wanted)
        raise ValueError(f"Unknown or ambiguous label: {label!r}")

    # Named columns, in the same order as the training features, so the model
    # receives the feature names it was fitted with.
    features = pd.DataFrame(
        [[
            encode(le_State_Name, state),
            encode(le_District_Name, district),
            year,
            encode(le_Season, season),
            encode(le_Crop, crop),
            area,
        ]],
        columns=["State_Name", "District_Name", "Crop_Year", "Season", "Crop", "Area"],
    ).astype(float)
    return model.predict(features)[0]


# New data point
predicted_production = predict_production(
    "Andhra Pradesh", "KADAPA", 2023, "Kharif     ", "Sunflower", 56100
)

print(f'Predicted Production: {predicted_production}')
