In [5]:
# import libraries
import boto3
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
my_region = 'us-east-1' # set the region of the instance

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + xgboost_container + " container for your SageMaker endpoint.")

ValueError: Must setup local AWS configuration with a region supported by SageMaker.

In [10]:
bucket_name = 'dsde-sireesha-chimbili' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
      s3.create_bucket(Bucket=bucket_name)
    else: 
      s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

In [6]:
import numpy as np 
import pandas as pd 

In [10]:
df_yield = pd.read_csv('s3://dsde-sireesha-chimbili/yield.csv')
df_yield.shape

PermissionError: Forbidden

In [11]:
df_yield.head()

NameError: name 'df_yield' is not defined

In [None]:
# rename columns.
df_yield = df_yield.rename(index=str, columns={"Value": "hg/ha_yield"})
df_yield.head()

In [None]:
# drop unwanted columns.
df_yield = df_yield.drop(['Year Code','Element Code','Element','Year Code','Area Code','Domain Code','Domain','Unit','Item Code'], axis=1)
df_yield.head()

In [None]:
df_yield.describe()

In [None]:
df_yield.info()

In [None]:
df_rain = pd.read_csv('s3://dsde-sireesha-chimbili/rainfall.csv')

df_rain.head()

In [None]:
df_rain = df_rain.rename(index=str, columns={" Area": 'Area'})

In [None]:
# check data types 
df_rain.info()

In [None]:
# convert average_rain_fall_mm_per_year from object to float
df_rain['average_rain_fall_mm_per_year'] = pd.to_numeric(df_rain['average_rain_fall_mm_per_year'],errors = 'coerce')
df_rain.info()

In [None]:
df_rain = df_rain.dropna()

In [None]:
df_rain.describe()

In [None]:
# merge yield dataframe with rain dataframe by year and area columns 
yield_df_updated = pd.merge(df_yield, df_rain, on=['Year','Area'])

In [None]:
yield_df_updated.shape

In [None]:
yield_df_updated.head()

In [None]:
yield_df_updated.describe()

In [None]:
df_pes = pd.read_csv('s3://dsde-sireesha-chimbili/pesticides.csv')
df_pes.head()

In [None]:
df_pes = df_pes.rename(index=str, columns={"Value": "pesticides_tonnes"})
df_pes = df_pes.drop(['Element','Domain','Unit','Item'], axis=1)
df_pes.head()

In [None]:
df_pes.describe()

In [None]:
df_pes.info()

In [None]:
# merge Pesticides dataframe with yield dataframe 
yield_df_updated = pd.merge(yield_df_updated, df_pes, on=['Year','Area'])
yield_df_updated.shape

In [None]:
yield_df_updated.head()

In [None]:
avg_temp=  pd.read_csv('s3://dsde-sireesha-chimbili/temp.csv')

In [None]:
avg_temp.head()

In [None]:
avg_temp.describe()

In [None]:
avg_temp = avg_temp.rename(index=str, columns={"year": "Year", "country":'Area'})
avg_temp.head()

In [None]:
yield_df_updated = pd.merge(yield_df_updated,avg_temp, on=['Area','Year'])
yield_df_updated.head()

In [None]:
yield_df_updated.shape

In [None]:
yield_df_updated.describe()

In [None]:
yield_df_updated.isnull().sum()

In [None]:
yield_df_updated.groupby('Item').count()

In [None]:
yield_df_updated.describe()

In [None]:
yield_df_updated['Area'].nunique()

In [None]:
yield_df_updated.groupby(['Area'],sort=True)['hg/ha_yield'].sum().nlargest(10)

In [None]:
yield_df_updated.groupby(['Item','Area'],sort=True)['hg/ha_yield'].sum().nlargest(10)

In [None]:
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
correlation_data=yield_df_updated.select_dtypes(include=[np.number]).corr()

mask = np.zeros_like(correlation_data, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.palette="vlag"

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(correlation_data, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

In [None]:
#Data Prepocessing

In [None]:
yield_df_updated.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

yield_df_onehot = pd.get_dummies(yield_df_updated, columns=['Area',"Item"], prefix = ['Country',"Item"])
features=yield_df_onehot.loc[:, yield_df_onehot.columns != 'hg/ha_yield']
label=yield_df_updated['hg/ha_yield']
features.head()

In [None]:
features = features.drop(['Year'], axis=1)

In [None]:
features.info()

In [None]:
features.head()

In [None]:
# Scaling
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
features=scaler.fit_transform(features)
features

In [None]:
#write final df to csv file 
#yield_df_updated.to_csv('C:/Users/Sireesha Chimbili/Documents/Crop Data/yield_df_updated_me.csv')

In [None]:
#Training
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(features, label, test_size=0.25, random_state=42)

In [None]:
#Modeling
from sklearn.metrics import r2_score
def compare_models(model):
    model_name = model.__class__.__name__
    fit=model.fit(train_data,train_labels)
    y_pred=fit.predict(test_data)
    r2=r2_score(test_labels,y_pred)
    return([model_name,r2])

In [None]:
# import necessary libraries

# import boto3
# import pandas as pd
# import numpy as np
# import sagemaker
# from sagemaker import get_execution_role
# from sagemaker.amazon.amazon_estimator import get_image_uri

# set up the SageMaker session and role
# sagemaker_session = sagemaker.Session()
# role = sagemaker.get_execution_role()

# # set up the training data location
# #train_data = 's3://path/to/training/data'

# container = get_image_uri(sagemaker_session.boto_region_name, 'decision-trees')

# dt = sagemaker.estimator.Estimator(container,
#                                    role,
#                                    train_instance_count=1,
#                                    train_instance_type='ml.m4.xlarge',
#                                    output_path='s3://dsde-sireesha-chimbili/output',
#                                    sagemaker_session=sagemaker_session)

# dt.set_hyperparameters(max_depth=5,
#                        min_samples_split=5,
#                        min_samples_leaf=5)

# dt.fit({'train': train_data, 'validation': test_data})




# create a decision tree regressor estimator
# dt_estimator = sagemaker.estimator.Estimator(
#     image_uri=sagemaker.image_uris.retrieve("decision-tree-regressor", 'us-east-1'),
#     role=role,
#     instance_count=1,
#     instance_type='ml.m4.xlarge',
#     output_path='s3://dsde-sireesha-chimbili/output',
#     sagemaker_session=sagemaker_session,
#     base_job_name='decision-tree-regressor'
# )

# # set the hyperparameters
# dt_estimator.set_hyperparameters(
#     max_leaf_nodes=30
# )

# # fit the model
# dt_estimator.fit({'train': train_data})

# # deploy the model to an endpoint
# predictor = dt_estimator.deploy(
#     initial_instance_count=1,
#     instance_type='ml.m4.xlarge',
#     endpoint_name='decision-tree-regressor-endpoint'
# )


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor

models = [
    GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=0),
     RandomForestRegressor(n_estimators=200, max_depth=3, random_state=0),
    svm.SVR(),
   DecisionTreeRegressor()
]

In [None]:
model_train=list(map(compare_models,models)) 

In [None]:
print(*model_train, sep = "\n")

In [None]:
yield_df_onehot = yield_df_onehot.drop(['Year'], axis=1)

In [None]:
yield_df_onehot.head()

In [None]:
#setting test data to columns from dataframe and excluding 'hg/ha_yield' values where ML model should be predicting 

test_df=pd.DataFrame(test_data,columns=yield_df_onehot.loc[:, yield_df_onehot.columns != 'hg/ha_yield'].columns) 

# using stack function to return a reshaped DataFrame by pivoting the columns of the current dataframe

cntry=test_df[[col for col in test_df.columns if 'Country' in col]].stack()[test_df[[col for col in test_df.columns if 'Country' in col]].stack()>0]
cntrylist=list(pd.DataFrame(cntry).index.get_level_values(1))
countries=[i.split("_")[1] for i in cntrylist]
itm=test_df[[col for col in test_df.columns if 'Item' in col]].stack()[test_df[[col for col in test_df.columns if 'Item' in col]].stack()>0]
itmlist=list(pd.DataFrame(itm).index.get_level_values(1))
items=[i.split("_")[1] for i in itmlist]

In [None]:
test_df.head()

In [None]:
test_df.drop([col for col in test_df.columns if 'Item' in col],axis=1,inplace=True)
test_df.drop([col for col in test_df.columns if 'Country' in col],axis=1,inplace=True)
test_df.head()

In [None]:
test_df['Country']=countries
test_df['Item']=items
test_df.head()

In [None]:
dtree=DecisionTreeRegressor()
dtreemodel=dtree.fit(train_data,train_labels)

test_df["yield_predicted"]= dtreemodel.predict(test_data)
test_df["yield_actual"]=pd.DataFrame(test_labels)["hg/ha_yield"].tolist()
test_group=test_df.groupby("Item")
test_group.apply(lambda x: r2_score(x.yield_actual,x.yield_predicted))

In [None]:
# def model_assess(model, name='Default'):
#     model.fit(X_train, y_train)
#     preds = model.predict(X_test)
#     print('---', name, '---', '\n',
#           confusion_matrix(y_test, preds), '\n',print(y_test,preds),'/n',
#           'Accuracy:', round(accuracy_score(y_test, preds), 5), '\n')
    
# # Random Forest
# dtree = DecisionTreeRegressor()
# model_assess(dtree, 'Decision Tree Regressor')



In [None]:
import pickle
pickle.dump(dtreemodel,open('dtree_model.pkl','wb'))

In [None]:
# So let's run the model actual values against the predicted ones 

fig, ax = plt.subplots() 

ax.scatter(test_df["yield_actual"], test_df["yield_predicted"],edgecolors=(0, 0, 0))

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title("Actual vs Predicted")
plt.show()

In [None]:
def adjusted_r_squared(y,yhat,x):
    score=1- (((1-(r2_score(y,yhat)))*(len(y)-1))/(len(y)-x.shape[1]-2))
    return score

test_group.apply(lambda x: adjusted_r_squared(x.yield_actual,x.yield_predicted,x))

In [None]:
varimp= {'imp':dtreemodel.feature_importances_,'names':yield_df_onehot.columns[yield_df_onehot.columns!="hg/ha_yield"]}

In [None]:
a4_dims = (8.27,16.7)

fig, ax = plt.subplots(figsize=a4_dims)
df=pd.DataFrame.from_dict(varimp)
df.sort_values(ascending=False,by=["imp"],inplace=True)
df=df.dropna()
sns.barplot(x="imp",y="names",palette="vlag",data=df,orient="h",ax=ax);

In [None]:
#7 most important factors that affect crops 
a4_dims = (16.7, 8.27)

fig, ax = plt.subplots(figsize=a4_dims)
df=pd.DataFrame.from_dict(varimp)
df.sort_values(ascending=False,by=["imp"],inplace=True)
df=df.dropna()
df=df.nlargest(7, 'imp')
sns.barplot(x="imp",y="names",palette="vlag",data=df,orient="h",ax=ax);

In [None]:
#Boxplot that shows yield for each item 
a4_dims = (16.7, 8.27)

fig, ax = plt.subplots(figsize=a4_dims)
sns.boxplot(x="Item",y="hg/ha_yield",palette="vlag",data=yield_df_updated,ax=ax);

In [None]:
# try:
#   urllib.request.urlretrieve ("https://www.kaggle.com/code/mdabidalam/climate-change-impact-on-yield-eda-viz-20349/input?select=yield_df.csv", "yield_df.csv")
#   print('Success: downloaded yield_df.csv.')
# except Exception as e:
#   print('Data load error: ',e)

# try:
#   sage_yield_df = pd.read_csv('s3://dsde-sireesha-chimbili/yield_df.csv')
#   print('Success: Data loaded into dataframe.')
# except Exception as e:
#     print('Data load error: ',e)