In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler  

In [None]:
df_yield = pd.read_csv('yield.csv')

In [None]:
df_yield.shape

In [None]:
df_yield.head(5)

In [None]:
# Call info() so the cell prints the column/dtype/non-null summary; the bare
# attribute only echoes the bound-method repr.
df_yield.info()

In [None]:
# Give the generic "Value" column a name that says what it holds.
df_yield = df_yield.rename(columns={"Value": "hg/ha_yield"})

In [None]:
df_yield.head(5)

In [None]:
# Remove the code/metadata columns that are not used further down.
df_yield = df_yield.drop(
    columns=[
        'Year Code',
        'Element Code',
        'Element',
        'Area Code',
        'Domain Code',
        'Domain',
        'Unit',
        'Item Code',
    ]
)

In [None]:
df_yield.head(5)

In [None]:
df_yield = df_yield.dropna()

In [None]:
## Load Climate Data

In [None]:
df_rain = pd.read_csv('rainfall.csv')

In [None]:
df_rain.head(5)

In [None]:
df_rain.head()

In [None]:
# Force the rainfall column to a numeric dtype; errors='coerce' turns any
# value that cannot be parsed into NaN.
df_rain = df_rain.assign(
    average_rain_fall_mm_per_year=pd.to_numeric(
        df_rain['average_rain_fall_mm_per_year'],
        errors='coerce',
    )
)

In [None]:
df_rain = df_rain.dropna()

In [None]:
# Combine yield and rainfall on (Year, Area). how='outer' keeps rows that exist in
# only one of the two frames, leaving NaN in the columns that come from the other.
df_main = pd.merge(df_yield, df_rain, on=['Year','Area'], how='outer')

In [None]:
df_main.tail(5)

In [None]:
# dropna() returns a new frame; without the assignment the NaN rows produced by
# the outer merge stay in df_main and the cell only echoes the whole frame.
df_main = df_main.dropna()

In [None]:
dataframe_pesticide = pd.read_csv('pesticides.csv')

In [None]:
dataframe_pesticide.head()  

In [None]:
# Rename the generic "Value" column; index=str additionally maps every row
# label through str().
dataframe_pesticide = dataframe_pesticide.rename(
    index=str,
    columns={"Value": "pesticides_tonnes"},
)

In [None]:
# Keep only the columns needed for the merge and the pesticide amount.
dataframe_pesticide = dataframe_pesticide.drop(
    columns=['Element', 'Domain', 'Unit', 'Item']
)

In [None]:
dataframe_pesticide.head() 

In [None]:
# Inner join: only (Year, Area) pairs present in both frames survive.
df_main = df_main.merge(
    dataframe_pesticide,
    on=['Year', 'Area'],
    how='inner',
)

In [None]:
df_main.head(5)

In [None]:
dataframe_temp= pd.read_csv('temp.csv')  

In [None]:
dataframe_temp.head(5)  

In [None]:
dataframe_temp =dataframe_temp.dropna()

In [None]:
# Align the key column names with the other frames so they can be merged on
# ['Year', 'Area']; index=str additionally maps every row label through str().
dataframe_temp = dataframe_temp.rename(
    index=str,
    columns={"year": "Year", "country": 'Area'},
)

In [None]:
dataframe_temp.head()

In [None]:
# Inner join with the temperature data on the shared (Year, Area) keys.
df_main = df_main.merge(
    dataframe_temp,
    on=['Year', 'Area'],
    how='inner',
)

In [None]:
df_main.head(5)

In [None]:
df_main.isnull().sum()

In [None]:
# Assign the result: dropna() does not modify df_main in place, so the bare call
# removed nothing and merely rendered the entire frame as cell output.
df_main = df_main.dropna()

In [None]:
df_main.isnull().sum()

In [None]:
df_main=df_main.dropna()  

In [None]:
df_main.isnull().sum()

In [None]:
df_main.groupby('Item').count()  

In [None]:
# One-hot encode the two categorical columns; the resulting columns are named
# "Country_<area>" and "Item_<item>".
df_main_onehot = pd.get_dummies(
    df_main,
    columns=['Area', "Item"],
    prefix=['Country', "Item"],
)

In [None]:
# Everything except the target column is a candidate feature.
features = df_main_onehot.drop(columns=['hg/ha_yield'], errors='ignore')

In [None]:
# Target vector: the yield column of the merged frame.
label = df_main.loc[:, 'hg/ha_yield']
features.head(n=5)

In [None]:
# The year is not used as a model input.
features = features.drop(columns=['Year'])

In [None]:
features.head(3)

In [None]:
scaler = MinMaxScaler()  

In [None]:
# Rescale every feature column with the MinMaxScaler created above.
# fit_transform returns a NumPy array, so `features` no longer carries column names.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split that follows, so test-set minima/maxima influence the
# transform — consider fitting on the training portion only.
features = scaler.fit_transform(features)

In [None]:
from sklearn.model_selection import train_test_split  
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
(
    train_data,
    test_data,
    train_labels,
    test_labels,
) = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import r2_score  
def compare_models(model):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the notebook-level ``train_data``, ``train_labels``, ``test_data``
    and ``test_labels``.

    Returns:
        A two-item list ``[class_name, r2]`` holding the estimator's class
        name and its R² score on the test split.
    """
    estimator_name = type(model).__name__
    fitted = model.fit(train_data, train_labels)
    predictions = fitted.predict(test_data)
    return [estimator_name, r2_score(test_labels, predictions)]

In [None]:
from sklearn.ensemble import RandomForestRegressor  
from sklearn.ensemble import GradientBoostingRegressor  
from sklearn import svm  
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Candidate regressors to benchmark. Every estimator that accepts a seed gets
# random_state=0 so repeated runs produce the same scores; the decision tree
# was previously unseeded, unlike the two ensembles.
models = [
    GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=0),
    RandomForestRegressor(n_estimators=200, max_depth=3, random_state=0),
    svm.SVR(),
    DecisionTreeRegressor(random_state=0),
]

In [None]:
# Fit and score each candidate; every entry is [model name, R² score].
model_train = [compare_models(candidate) for candidate in models]

In [None]:
print(*model_train, sep = "\n")

In [None]:
df_main_onehot.head()

In [None]:
# Put column names back on the scaled test matrix. The names must match the
# columns that were actually fed to the scaler: the target 'hg/ha_yield' was
# excluded and 'Year' was dropped from `features`, so both are left out here
# (keeping 'Year' makes the column count disagree with test_data).
dataframe_test = pd.DataFrame(
    test_data,
    columns=[col for col in df_main_onehot.columns if col not in ('hg/ha_yield', 'Year')],
)

# Recover the original category of every row from its one-hot columns:
# stack() turns the dummy columns into (row, column) pairs, and the pair whose
# value is > 0 is the active category for that row.
country_stack = dataframe_test[
    [col for col in dataframe_test.columns if col.startswith('Country_')]
].stack()
cntry = country_stack[country_stack > 0]
cntrylist = list(cntry.index.get_level_values(1))
# maxsplit=1 strips only the "Country_" prefix, so names that themselves
# contain an underscore are kept whole.
countries = [name.split("_", 1)[1] for name in cntrylist]

item_stack = dataframe_test[
    [col for col in dataframe_test.columns if col.startswith('Item_')]
].stack()
itm = item_stack[item_stack > 0]
itmlist = list(itm.index.get_level_values(1))
items = [name.split("_", 1)[1] for name in itmlist]

dataframe_test.head()

In [None]:
# Remove the one-hot 'Item' and 'Country' columns in a single pass.
dataframe_test = dataframe_test.drop(
    columns=[
        col
        for col in dataframe_test.columns
        if 'Item' in col or 'Country' in col
    ]
)
dataframe_test.head()

In [None]:
# Attach the recovered category names as ordinary columns.
dataframe_test = dataframe_test.assign(Country=countries, Item=items)
dataframe_test.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
# Refit a decision tree for a closer look at its predictions. The seed makes
# the fitted tree, and therefore the plot, reproducible across runs.
clf = DecisionTreeRegressor(random_state=0)
model = clf.fit(train_data, train_labels)

# Store predictions next to the true values; test_labels is already a Series,
# so it can be turned into a list directly.
dataframe_test["yield_predicted"] = model.predict(test_data)
dataframe_test["yield_actual"] = test_labels.tolist()
test_group = dataframe_test.groupby("Item")

# Compare the actual values with the model's predictions.
fig, ax = plt.subplots()

ax.scatter(
    dataframe_test["yield_actual"],
    dataframe_test["yield_predicted"],
    edgecolors=(0, 0, 0),
)

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title("Actual vs Predicted")
plt.show()

In [None]:
import seaborn as sns
# Pair every importance with the name of the feature it belongs to. The model
# was trained without the target 'hg/ha_yield' and without 'Year', so both
# must be excluded from the names or the two sequences differ in length.
varimp = {
    'imp': model.feature_importances_,
    'names': [col for col in df_main_onehot.columns if col not in ("hg/ha_yield", "Year")],
}

a4_dims = (8.27, 16.7)
fig, ax = plt.subplots(figsize=a4_dims)
df = pd.DataFrame.from_dict(varimp)
df = df.sort_values(by="imp", ascending=False)
dfdf = df.dropna()
# hue mirrors y only to apply the palette; the legend would just repeat the axis labels.
sns.barplot(x="imp", y="names", hue='names', palette="vlag", data=dfdf, orient="h", ax=ax, legend=False);

In [None]:
# 7 most important factors that affect crops
a4_dims = (16.7, 8.27)

fig, ax = plt.subplots(figsize=a4_dims)
# Importances are paired with the columns the model was trained on (the target
# 'hg/ha_yield' and 'Year' were both removed before fitting).
df = pd.DataFrame(
    {
        "imp": model.feature_importances_,
        "names": [col for col in df_main_onehot.columns if col not in ("hg/ha_yield", "Year")],
    }
)
df = df.sort_values(by="imp", ascending=False)
dfdf = df.dropna().nlargest(7, 'imp')
# Plot the top-7 frame, not the full one, so the chart matches its heading.
sns.barplot(x="imp", y="names", hue='names', palette="vlag", data=dfdf, orient="h", ax=ax, legend=False);

In [None]:
# Boxplot that shows yield for each item
a4_dims = (16.7, 8.27)

fig, ax = plt.subplots(figsize=a4_dims)
# hue must be the categorical variable: using the continuous yield as hue
# creates one hue level per distinct value. The legend would only repeat the
# x-axis labels, so it is turned off.
sns.boxplot(x="Item", y="hg/ha_yield", hue="Item", palette="vlag", data=df_yield, ax=ax, legend=False);