In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# load dataset
data = pd.read_csv('yield_df.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [3]:
# remove the unnamed column
data.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
data.head()

Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [5]:
data['Area'].unique()

array(['Albania', 'Algeria', 'Angola', 'Argentina', 'Armenia',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Belarus', 'Belgium', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cameroon', 'Canada',
       'Central African Republic', 'Chile', 'Colombia', 'Croatia',
       'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Eritrea', 'Estonia', 'Finland', 'France', 'Germany', 'Ghana',
       'Greece', 'Guatemala', 'Guinea', 'Guyana', 'Haiti', 'Honduras',
       'Hungary', 'India', 'Indonesia', 'Iraq', 'Ireland', 'Italy',
       'Jamaica', 'Japan', 'Kazakhstan', 'Kenya', 'Latvia', 'Lebanon',
       'Lesotho', 'Libya', 'Lithuania', 'Madagascar', 'Malawi',
       'Malaysia', 'Mali', 'Mauritania', 'Mauritius', 'Mexico',
       'Montenegro', 'Morocco', 'Mozambique', 'Namibia', 'Nepal',
       'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Norway',
       'Pakistan', 'Papua New Guinea', 'Peru', 'Pola

In [6]:
data['Item'].unique()

array(['Maize', 'Potatoes', 'Rice, paddy', 'Sorghum', 'Soybeans', 'Wheat',
       'Cassava', 'Sweet potatoes', 'Plantains and others', 'Yams'],
      dtype=object)

In [7]:
# shape
data.shape

(28242, 7)

In [8]:
data.isna().sum()

Area                             0
Item                             0
Year                             0
hg/ha_yield                      0
average_rain_fall_mm_per_year    0
pesticides_tonnes                0
avg_temp                         0
dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28242 entries, 0 to 28241
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Area                           28242 non-null  object 
 1   Item                           28242 non-null  object 
 2   Year                           28242 non-null  int64  
 3   hg/ha_yield                    28242 non-null  int64  
 4   average_rain_fall_mm_per_year  28242 non-null  float64
 5   pesticides_tonnes              28242 non-null  float64
 6   avg_temp                       28242 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.5+ MB


In [10]:
data.describe()

Unnamed: 0,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
count,28242.0,28242.0,28242.0,28242.0,28242.0
mean,2001.544296,77053.332094,1149.05598,37076.909344,20.542627
std,7.051905,84956.612897,709.81215,59958.784665,6.312051
min,1990.0,50.0,51.0,0.04,1.3
25%,1995.0,19919.25,593.0,1702.0,16.7025
50%,2001.0,38295.0,1083.0,17529.44,21.51
75%,2008.0,104676.75,1668.0,48687.88,26.0
max,2013.0,501412.0,3240.0,367778.0,30.65


In [11]:
# check for duplicate
data.duplicated().sum()

np.int64(2310)

In [12]:
# drop the duplicates
data.drop_duplicates(keep='first',inplace=True)

In [13]:
data.shape

(25932, 7)

In [14]:
data['average_rain_fall_mm_per_year'].unique()

array([1485.,   89., 1010.,  591.,  562.,  534., 1110.,  447., 1292.,
         83., 2666.,  618.,  847.,  416., 1761.,  608.,  748., 1274.,
       1604.,  537., 1342., 1522., 3240., 1113.,  703., 1410., 2274.,
         51., 1784.,  383.,  626.,  536.,  867.,  700., 1187.,  652.,
       1996., 1651., 2387., 1440., 1976.,  589., 1083., 2702.,  216.,
       1118.,  832., 2051., 1668.,  250.,  630.,  641.,  661.,  788.,
         56.,  656., 1513., 1181., 2875.,  282.,   92., 2041.,  758.,
        241.,  346., 1032.,  285., 1500.,  778., 1732., 2280.,  151.,
       1414.,  494., 3142., 1738.,  600.,  854.,   74.,  637., 1212.,
         59.,  686., 1162.,  495.,  636., 1712., 2331.,  624., 1537.,
        691., 1622.,  207.,  593., 1180.,  565., 1220., 1300., 1020.,
        657.])

In [15]:
data['average_rain_fall_mm_per_year'].dtype

dtype('float64')

In [16]:
def isStr(obj):
    try:
        float(obj)
        return False
    except:
        return True

In [17]:
to_drop = data[data['average_rain_fall_mm_per_year'].apply(isStr)].index

In [18]:
to_drop

Index([], dtype='int64')

In [19]:
data = data.drop(to_drop,axis=0)

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25932 entries, 0 to 28241
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Area                           25932 non-null  object 
 1   Item                           25932 non-null  object 
 2   Year                           25932 non-null  int64  
 3   hg/ha_yield                    25932 non-null  int64  
 4   average_rain_fall_mm_per_year  25932 non-null  float64
 5   pesticides_tonnes              25932 non-null  float64
 6   avg_temp                       25932 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.6+ MB


In [21]:
fig = px.histogram(
    data,
    y='Area'
)
fig.update_layout(
    height=1280,
)
fig.show()

In [22]:
# yield per country
yeild_country = data.pivot_table(
    values='hg/ha_yield',
    index='Area',
    aggfunc='sum'
).sort_values(by='hg/ha_yield',ascending=False)
yeild_country

Unnamed: 0_level_0,hg/ha_yield
Area,Unnamed: 1_level_1
India,274219558
Brazil,136340329
Mexico,113698872
Australia,109111062
Japan,100924145
...,...
Namibia,3165475
Sudan,1896346
Montenegro,1645100
Eritrea,1452416


In [23]:
# rearrange the column names
col = ['Year','average_rain_fall_mm_per_year','pesticides_tonnes','avg_temp','Area','Item','hg/ha_yield']
data = data[col]

In [24]:
data.head()

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item,hg/ha_yield
0,1990,1485.0,121.0,16.37,Albania,Maize,36613
1,1990,1485.0,121.0,16.37,Albania,Potatoes,66667
2,1990,1485.0,121.0,16.37,Albania,"Rice, paddy",23333
3,1990,1485.0,121.0,16.37,Albania,Sorghum,12500
4,1990,1485.0,121.0,16.37,Albania,Soybeans,7000


In [25]:
data.to_csv('yield_data.csv',index=True)

In [26]:
data.head(1)

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item,hg/ha_yield
0,1990,1485.0,121.0,16.37,Albania,Maize,36613


### Train/Test Split

In [27]:
X = data.drop('hg/ha_yield',axis=1)
y = data['hg/ha_yield']

In [28]:
X

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item
0,1990,1485.0,121.00,16.37,Albania,Maize
1,1990,1485.0,121.00,16.37,Albania,Potatoes
2,1990,1485.0,121.00,16.37,Albania,"Rice, paddy"
3,1990,1485.0,121.00,16.37,Albania,Sorghum
4,1990,1485.0,121.00,16.37,Albania,Soybeans
...,...,...,...,...,...,...
28237,2013,657.0,2550.07,19.76,Zimbabwe,"Rice, paddy"
28238,2013,657.0,2550.07,19.76,Zimbabwe,Sorghum
28239,2013,657.0,2550.07,19.76,Zimbabwe,Soybeans
28240,2013,657.0,2550.07,19.76,Zimbabwe,Sweet potatoes


In [29]:
y

0        36613
1        66667
2        23333
3        12500
4         7000
         ...  
28237    22581
28238     3066
28239    13142
28240    22222
28241    22888
Name: hg/ha_yield, Length: 25932, dtype: int64

In [30]:
# model to split
from sklearn.model_selection import train_test_split

In [31]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [32]:
X_train

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item
24234,2000,59.0,3024.11,26.55,Saudi Arabia,Sorghum
9468,2012,652.0,8002.20,18.82,Greece,Sweet potatoes
6793,2006,3240.0,98328.63,27.51,Colombia,Maize
28212,2010,657.0,3305.17,21.17,Zimbabwe,Potatoes
7358,2007,1410.0,5689.80,27.08,Dominican Republic,Sweet potatoes
...,...,...,...,...,...,...
23678,2004,854.0,16942.00,16.31,Portugal,Sweet potatoes
5960,2006,537.0,36572.75,7.85,Canada,Wheat
860,1991,534.0,17866.00,18.73,Australia,Potatoes
17223,1998,250.0,6416.14,6.94,Kazakhstan,Potatoes


### Converting Categorical to Numerical & Scaling the values

In [33]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [34]:
ohe = OneHotEncoder(drop='first')
scaler = StandardScaler()

In [35]:
X_train.head(1)

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item
24234,2000,59.0,3024.11,26.55,Saudi Arabia,Sorghum


In [36]:
preprocessor = ColumnTransformer(
    transformers=[
        ('onehotencoder',ohe,[4,5]),
        ('scaler',scaler,[0,1,2,3]),
    ],
    remainder='passthrough'
)

In [37]:
preprocessor

In [38]:
X_train_dummy = preprocessor.fit_transform(X_train)
X_test_dummy = preprocessor.transform(X_test)

In [39]:
X_train_dummy

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 122877 stored elements and shape (20745, 113)>

In [40]:
X_test_dummy

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 30727 stored elements and shape (5187, 113)>

### Train the models

In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor


### Hyperparameter Tuning

In [42]:
# import hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [43]:
# setting parameters for the models
params = {
    'kernel':['linear','poly','rbf','sigmoid'],
    'C':[1,10,100,1000],
}

In [44]:
# initialize the model
lr = LinearRegression()
ls = Lasso()
rd = Ridge()
en = ElasticNet()
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()
sv = SVR()
dr = DecisionTreeRegressor()
kn = KNeighborsRegressor()


In [45]:
# Parameter grid
params = {
    'fit_intercept': [True, False],
    'positive': [True, False]
}

# GridSearchCV
grid_search = GridSearchCV(lr, param_grid=params, cv=5, verbose=1)
grid_search.fit(X_train_dummy, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'fit_intercept': False, 'positive': False}




10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\linear_model\_base.py", line 601, in fit
    X, y = validate_data(
          

In [46]:
X_train_dummy

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 122877 stored elements and shape (20745, 113)>

In [47]:
y_train

24234     13384
9468     199034
6793      25368
28212    166825
7358      75696
          ...  
23678     81356
5960      26096
860      285359
17223     76679
25918    135039
Name: hg/ha_yield, Length: 20745, dtype: int64

In [48]:
# laso
# Parameter grid
params = {
    'alpha': [0.01, 0.1, 1, 10],
    'max_iter': [1000, 2000],
    'selection': ['cyclic', 'random']
}

# GridSearchCV
grid_search = GridSearchCV(ls, param_grid=params, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: 
All the 80 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 982, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Uganda'

--------------------------------------------------------------------------------
64 fits failed with the following error:
Traceback (most recent call last):
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py", line 982, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\2025\Data Science Courses\GDAP ML\Crop Yield\.venv\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Saudi Arabia'


In [None]:
lr = LinearRegression()
lr.fit(X_train_dummy,y_train)

ls = Lasso()
ls.fit(X_train_dummy,y_train)

rd = Ridge()
rd.fit(X_train_dummy,y_train)

en = ElasticNet()
en.fit(X_train_dummy,y_train)

rf = RandomForestRegressor()
rf.fit(X_train_dummy,y_train)

gb = GradientBoostingRegressor()
gb.fit(X_train_dummy,y_train)

sv = SVR()
sv.fit(X_train_dummy,y_train)

dr = DecisionTreeRegressor()
dr.fit(X_train_dummy,y_train)

kn = KNeighborsRegressor()
kn.fit(X_train_dummy,y_train)



Objective did not converge. You might want to increase the number of iterations. Duality gap: 43166910886.81641, tolerance: 14819338848.544127



Evaluate

In [None]:
y_pred1 = lr.predict(X_test_dummy)
y_pred2 = ls.predict(X_test_dummy)
y_pred3 = rd.predict(X_test_dummy)
y_pred4 = en.predict(X_test_dummy)
y_pred5 = rf.predict(X_test_dummy)
y_pred6 = gb.predict(X_test_dummy)
y_pred7 = sv.predict(X_test_dummy)
y_pred8 = dr.predict(X_test_dummy)
y_pred9 = kn.predict(X_test_dummy)

In [None]:
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
score1 = r2_score(y_test,y_pred1)
score2 = r2_score(y_test,y_pred2)
score3 = r2_score(y_test,y_pred3)
score4 = r2_score(y_test,y_pred4)
score5 = r2_score(y_test,y_pred5)
score6 = r2_score(y_test,y_pred6)
score7 = r2_score(y_test,y_pred7)
score8 = r2_score(y_test,y_pred8)
score9 = r2_score(y_test,y_pred9)

In [None]:
score1,score2,score3,score4,score5,score6,score7,score8,score9

(0.7486565648695502,
 0.7485842229351424,
 0.748542005215551,
 0.23849511353135855,
 0.9846017299378411,
 0.7591769185465316,
 -0.2099498636919488,
 0.9767161772641906,
 0.9823106927202787)

In [None]:
# visualize the results
final_result = pd.DataFrame({
    'Model':['Linear Regression','Lasso','Ridge','ElasticNet','RandomForest','GradientBoosting','SVR','DecisionTree','KNeighbors'],
    'R2 Score':[score1,score2,score3,score4,score5,score6,score7,score8,score9]
})

final_result

Unnamed: 0,Model,R2 Score
0,Linear Regression,0.748657
1,Lasso,0.748584
2,Ridge,0.748542
3,ElasticNet,0.238495
4,RandomForest,0.984602
5,GradientBoosting,0.759177
6,SVR,-0.20995
7,DecisionTree,0.976716
8,KNeighbors,0.982311


In [None]:
fig = px.bar(
    final_result,
    y='Model',
    x='R2 Score',
    color='Model',
    text_auto=',.0%',
    title='R2 Score of Different Models'
)
fig.show()

## Model Selection

In [None]:
rf = RandomForestRegressor()
rf.fit(X_train_dummy,y_train)
rf.predict(X_test_dummy)

array([127044.69,  15816.69,  73112.97, ...,  48499.33,   9626.04,
       134646.58], shape=(5187,))

In [None]:
X_train.head(1)

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item
24234,2000,59.0,3024.11,26.55,Saudi Arabia,Sorghum


In [None]:
Year = 2000
average_rain_fall_mm_per_year = 59.0
pesticides_tonnes = 3024.11
avg_temp = 26.55
Area = 'Saudi Arabia'
Item = 'Sorghum'

input_feature = np.array([[Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item]])

transformed_features = preprocessor.transform(input_feature).reshape(1,-1)

predicted_value = rf.predict(transformed_features)
predicted_value


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but StandardScaler was fitted with feature names



array([13525.22])

In [None]:
data[(data['Area']=='Saudi Arabia') & (data['Year']==2000) & (data['Item']=='Sorghum')]

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item,hg/ha_yield
24233,2000,59.0,3024.11,26.62,Saudi Arabia,Sorghum,13384
24234,2000,59.0,3024.11,26.55,Saudi Arabia,Sorghum,13384


In [None]:
dr = DecisionTreeRegressor()
dr.fit(X_train_dummy,y_train)
dr.predict(X_test_dummy)


array([154330.,  15838.,  72614., ...,  52692.,   9621., 132600.],
      shape=(5187,))

In [None]:
Year = 2000
average_rain_fall_mm_per_year = 59.0
pesticides_tonnes = 3024.11
avg_temp = 26.55
Area = 'Saudi Arabia'
Item = 'Sorghum'

input_feature = np.array([[Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item]])

transformed_features = preprocessor.transform(input_feature).reshape(1,-1)

predicted_value = dr.predict(transformed_features)
predicted_value[0]


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but StandardScaler was fitted with feature names



np.float64(13384.0)

In [None]:
# save the model
import joblib

In [None]:
joblib.dump(dr,'yield_model') # save the model
joblib.dump(preprocessor,'preprocessor') # save the preprocessor

['preprocessor']

In [None]:
model = joblib.load('yield_model')

In [None]:
data.head(2)

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item,hg/ha_yield
0,1990,1485.0,121.0,16.37,Albania,Maize,36613
1,1990,1485.0,121.0,16.37,Albania,Potatoes,66667


In [None]:
Year = 1990
average_rain_fall_mm_per_year = 1485.0
pesticides_tonnes = 121.0
avg_temp = 16.37
Area = 'Albania'
Item = 'Potatoes'

In [None]:
features = np.array([[Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item]])

transformed = preprocessor.transform(features).reshape(1,-1)
model.predict(transformed)[0]


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but StandardScaler was fitted with feature names



np.float64(66667.0)

In [None]:
data['Year'].unique()

array([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013])

In [None]:
data.tail(1)

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area,Item,hg/ha_yield
28241,2013,657.0,2550.07,19.76,Zimbabwe,Wheat,22888
