In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

data = pd.read_csv("crop_yield.csv")
data.head()


Unnamed: 0,Crop,Crop_Year,Season,State,Area,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,2051.4,165500.63,539.09,0.420909


In [2]:
data.shape

(19689, 9)

In [3]:
data.duplicated().sum()

0

In [4]:
data.isnull().sum()

Crop               0
Crop_Year          0
Season             0
State              0
Area               0
Annual_Rainfall    0
Fertilizer         0
Pesticide          0
Yield              0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Annual_Rainfall  19689 non-null  float64
 6   Fertilizer       19689 non-null  float64
 7   Pesticide        19689 non-null  float64
 8   Yield            19689 non-null  float64
dtypes: float64(5), int64(1), object(3)
memory usage: 1.4+ MB


In [6]:
State = data['State'].unique()
yield_per_State = []
for st in State:
    yield_per_State.append(data[data['State']==st]['Yield'].sum())

In [7]:
data['Yield'].sum()

1574214.485762277

In [8]:
col = ['Crop','Crop_Year','Season','State','Area','Annual_Rainfall','Fertilizer','Pesticide','Yield']
data = data[col]
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0, shuffle=True)

In [10]:
X_train.head(1)

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Annual_Rainfall,Fertilizer,Pesticide
1554,Other Rabi pulses,2004,Rabi,Karnataka,23981.0,1117.7,2598101.54,5036.01


In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
ohe = OneHotEncoder(handle_unknown='ignore')
scale = StandardScaler()

preprocesser = ColumnTransformer(
        transformers = [
            ('StandardScale', scale, [1, 4, 5, 6, 7]),
            ('OHE', ohe, [0, 2, 3]),
        ],
        remainder='passthrough'
)

In [12]:
X_train1 = preprocesser.fit_transform(X_train)
X_test1 = preprocesser.transform(X_test)

In [13]:
preprocesser.get_feature_names_out(col[:-1])

array(['StandardScale__Crop_Year', 'StandardScale__Area',
       'StandardScale__Annual_Rainfall', 'StandardScale__Fertilizer',
       'StandardScale__Pesticide', 'OHE__Crop_Arecanut',
       'OHE__Crop_Arhar/Tur', 'OHE__Crop_Bajra', 'OHE__Crop_Banana',
       'OHE__Crop_Barley', 'OHE__Crop_Black pepper', 'OHE__Crop_Cardamom',
       'OHE__Crop_Cashewnut', 'OHE__Crop_Castor seed',
       'OHE__Crop_Coconut ', 'OHE__Crop_Coriander',
       'OHE__Crop_Cotton(lint)', 'OHE__Crop_Cowpea(Lobia)',
       'OHE__Crop_Dry chillies', 'OHE__Crop_Garlic', 'OHE__Crop_Ginger',
       'OHE__Crop_Gram', 'OHE__Crop_Groundnut', 'OHE__Crop_Guar seed',
       'OHE__Crop_Horse-gram', 'OHE__Crop_Jowar', 'OHE__Crop_Jute',
       'OHE__Crop_Khesari', 'OHE__Crop_Linseed', 'OHE__Crop_Maize',
       'OHE__Crop_Masoor', 'OHE__Crop_Mesta',
       'OHE__Crop_Moong(Green Gram)', 'OHE__Crop_Moth',
       'OHE__Crop_Niger seed', 'OHE__Crop_Oilseeds total',
       'OHE__Crop_Onion', 'OHE__Crop_Other  Rabi pulses',
     

In [14]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,r2_score


models = {
    'lr':LinearRegression(),
    'lss':Lasso(),
    'Rid':Ridge(),
    'Dtr':DecisionTreeRegressor()
}
for name, md in models.items():
    md.fit(X_train1,y_train)
    y_pred = md.predict(X_test1)
    
    print(f"{name} : mean_ab_err : {mean_absolute_error(y_test,y_pred)} score : {r2_score(y_test,y_pred)}")

lr : mean_ab_err : 58.127035195428405 score : 0.8469334337119425
lss : mean_ab_err : 47.907274791183895 score : 0.844255159182976
Rid : mean_ab_err : 58.45077835908863 score : 0.8459004611167156
Dtr : mean_ab_err : 7.4450039461709 score : 0.9716294260529328


In [15]:
# since in DecisionTreeRegressor we have hi accuracy and low mean absolute error we choose that

dtr = DecisionTreeRegressor()
dtr.fit(X_train1,y_train)
dtr.predict(X_test1)

array([38.00357143,  5.51481482,  0.54615385, ...,  3.32695652,
        0.75      ,  1.85538461])

In [16]:
def prediction(Crop,Crop_Year,Season,State,Area,Annual_Rainfall,Fertilizer,Pesticide):
    
    features = np.array([[Crop,Crop_Year,Season,State,Area,Annual_Rainfall,Fertilizer,Pesticide]],dtype=object)
    trans = preprocesser.transform(features)
    predicted_yeild=dtr.predict(trans)
    
    return predicted_yeild[0]

In [19]:

Crop,Crop_Year,Season,State,Area,Annual_Rainfall,Fertilizer,Pesticide='Coconut',20234,'Whole Year','Kerala',73814.0,2051.4,7024878.38,22882.34
res = prediction(Crop,Crop_Year,Season,State,Area,Annual_Rainfall,Fertilizer,Pesticide)
print(res*10,"quintal's per hector")

7.47619048 quintal's per hector




In [18]:
import pickle
pickle.dump(dtr,open('dtr.pkl','wb'))
pickle.dump(preprocesser,open('preprocessor.pkl','wb'))