In [54]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
import numpy as np

In [6]:
temp = pd.read_csv("temp.csv")
rainfall = pd.read_csv("rainfall.csv")
pesticides = pd.read_csv("pesticides.csv")
yeild = pd.read_csv("yield.csv")
temp.head()

Unnamed: 0,year,country,avg_temp
0,1849,Côte D'Ivoire,25.58
1,1850,Côte D'Ivoire,25.52
2,1851,Côte D'Ivoire,25.67
3,1852,Côte D'Ivoire,
4,1853,Côte D'Ivoire,


In [7]:
temp = temp.loc[temp["year"] >= 1990].copy()
temp.rename(columns={"year":"Year","country":"Country", "avg_temp":"Average Temperature"}, inplace=True)

In [10]:
first_merge = pd.merge(temp, yeild, how="inner", on=["Country", "Year"])
second_merge = pd.merge(pesticides, first_merge, how="inner", on=["Country", "Year"])
final_data = pd.merge(rainfall, second_merge, how="inner", on=["Country", "Year"])

In [12]:
final_data.head()

Unnamed: 0,Country,Year,Average Rainfall (mm/year),Pesticides (Tonnes),Average Temperature,Item,Yield (hg/ha)
0,Albania,1990,1485,121.0,16.37,Maize,36613
1,Albania,1990,1485,121.0,16.37,Potatoes,66667
2,Albania,1990,1485,121.0,16.37,"Rice, paddy",23333
3,Albania,1990,1485,121.0,16.37,Sorghum,12500
4,Albania,1990,1485,121.0,16.37,Soybeans,7000


In [14]:
country_count = dict(final_data.Country.value_counts())
country_count

{'India': 4048,
 'Brazil': 2277,
 'Mexico': 1472,
 'Pakistan': 1449,
 'Venezuela': 1150,
 'Russia': 1134,
 'Australia': 966,
 'Japan': 966,
 'Indonesia': 828,
 'South Africa': 644,
 'Turkey': 625,
 'Ecuador': 621,
 'Canada': 552,
 'Spain': 483,
 'Colombia': 414,
 'Argentina': 368,
 'Dominican Republic': 368,
 'Iran': 345,
 'Italy': 322,
 'Morocco': 322,
 'Egypt': 322,
 'Bangladesh': 294,
 'Iraq': 276,
 'Vietnam': 276,
 'Germany': 270,
 'Kazakhstan': 252,
 'Tanzania': 230,
 'United Kingdom': 230,
 'Cameroon': 230,
 'Kenya': 230,
 'Chile': 230,
 'Mali': 207,
 'Burundi': 207,
 'Uganda': 207,
 'Bolivia': 207,
 'Rwanda': 207,
 'Peru': 207,
 'Guatemala': 204,
 'Honduras': 204,
 'Sri Lanka': 184,
 'Madagascar': 184,
 'Saudi Arabia': 184,
 'Zimbabwe': 184,
 'Haiti': 184,
 'Burkina Faso': 184,
 'El Salvador': 184,
 'Zambia': 184,
 'Nicaragua': 183,
 'Malawi': 171,
 'Guinea': 171,
 'Angola': 164,
 'Thailand': 161,
 'Mozambique': 161,
 'Uruguay': 161,
 'Central African Republic': 161,
 'Mauritani

In [15]:
def country_modifier(x, val_counts):
    """Modify a Country."""
    if val_counts[x] >= 600:
        return x
    else:
        return "Other"

final_data["Country_New"] = final_data["Country"].apply(
    country_modifier, val_counts=country_count
)

In [17]:
features_data = final_data[
    [
        "Country_New",
        "Average Rainfall (mm/year)",
        "Pesticides (Tonnes)",
        "Average Temperature",
        "Item",
    ]
]
response = final_data["Yield (hg/ha)"]

In [18]:
# main test split
X_train, X_test, y_train, y_test = train_test_split(
    features_data,
    response,
    test_size=0.2,
    random_state=42,
    stratify=features_data[["Country_New", "Item"]],
)

In [19]:
categorical_data = X_train[["Country_New", "Item"]]
numerical_data = X_train[
    ["Average Rainfall (mm/year)", "Pesticides (Tonnes)", "Average Temperature"]
]

In [20]:
# One hot encoder
ohe = OneHotEncoder(handle_unknown="error")
ohe.fit(categorical_data)
ohe_data = ohe.transform(categorical_data).toarray()

In [21]:
#Standarzizing data
std = StandardScaler()
std.fit(numerical_data)
num_data = std.transform(numerical_data)

In [22]:
train = np.hstack((num_data, ohe_data))

In [23]:
# validation split
X_train_new, X_val, y_train_new, y_val = train_test_split(
    train, y_train, test_size=0.2, random_state=42
)

In [26]:
LR_model = LinearRegression()
LR_model.fit(X_train_new, y_train_new)
y_pred = LR_model.predict(X_val)
LR_model_mse = mean_squared_error(y_val, y_pred)
LR_model_r2 = r2_score(y_val, y_pred)
LR_model_r2

0.6717178579433188

In [28]:
RF_model = RandomForestRegressor()
RF_model.fit(X_train_new, y_train_new)
y_pred = RF_model.predict(X_val)
RF_model_mse = mean_squared_error(y_val, y_pred)
RF_model_r2 = r2_score(y_val, y_pred)
RF_model_r2

0.9666757743879584

In [43]:
GBM_model = GradientBoostingRegressor(n_estimators=600)
GBM_model.fit(X_train_new, y_train_new)
y_pred = GBM_model.predict(X_val)
GBM_model_mse = mean_squared_error(y_val, y_pred)
GBM_model_r2 = r2_score(y_val, y_pred)
GBM_model_r2

0.9143133661483394

In [44]:
MLP_model = MLPRegressor(max_iter=200)
MLP_model.fit(X_train_new, y_train_new)
y_pred = MLP_model.predict(X_val)
MLP_model_mse = mean_squared_error(y_val, y_pred)
MLP_model_r2 = r2_score(y_val, y_pred)
MLP_model_r2



0.34245361422350495

In [45]:
DTR_model = DecisionTreeRegressor()
DTR_model.fit(X_train_new, y_train_new)
y_pred = DTR_model.predict(X_val)
DTR_model_mse = mean_squared_error(y_val, y_pred)
DTR_model_r2 = r2_score(y_val, y_pred)
DTR_model_r2

0.9463844951038567

In [49]:
Lasso_model = linear_model.Lasso(alpha=0.1,max_iter=50000)
Lasso_model.fit(X_train_new, y_train_new)
y_pred = Lasso_model.predict(X_val)
Lasso_model_mse = mean_squared_error(y_val, y_pred)
Lasso_model_r2 = r2_score(y_val, y_pred)
Lasso_model_r2

0.671718026316422

In [55]:
# Define the model

RF_model = RandomForestRegressor()

# Define the grid

param_grid = {
'n_estimators': [50, 100, 200, 300],
'min_samples_leaf': [1, 5, 10],
'max_depth': [2, 4, 6, 8, 10],
'max_features': ['auto', 'sqrt'],
'bootstrap': [True, False]}

# Instantiate GridSearchCV

model_gridsearch = GridSearchCV(
estimator=RF_model,
param_grid=param_grid,
scoring='r2',
n_jobs=-1,
cv=2,
refit=True,
return_train_score=True)

In [56]:
model_gridsearch.fit(train, y_train)

GridSearchCV(cv=2, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [2, 4, 6, 8, 10],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 5, 10],
                         'n_estimators': [50, 100, 200, 300]},
             return_train_score=True, scoring='r2')

In [57]:
print(model_gridsearch.best_estimator_)


RandomForestRegressor(max_depth=10, n_estimators=200)


In [58]:
final_RF_model = RandomForestRegressor(max_depth=10, n_estimators=200)
final_RF_model.fit(train, y_train)

RandomForestRegressor(max_depth=10, n_estimators=200)

In [60]:
categorical_test_data = X_test[["Country_New", "Item"]]
numerical_test_data = X_test[
    ["Average Rainfall (mm/year)", "Pesticides (Tonnes)", "Average Temperature"]
]

In [62]:
# One hot encoder
ohe = OneHotEncoder(handle_unknown="error")
ohe.fit(categorical_data)
ohe_test_data = ohe.transform(categorical_test_data).toarray()

#Standarzizing data
std = StandardScaler()
std.fit(numerical_data)
num_test_data = std.transform(numerical_test_data)

In [63]:
test = np.hstack((num_test_data, ohe_test_data))

In [66]:
test_predictions = final_RF_model.predict(test)

In [68]:
print(r2_score(y_test,test_predictions))

0.9408217869742267
