In [1]:
import json                                              # Will be needed for saving preprocessing details
import numpy as np                                       # For data manipulation
import pandas as pd                                      # For data manipulation
from sklearn.model_selection import train_test_split     # Will be used for data split
from sklearn.preprocessing import LabelEncoder           # For preprocessing
from sklearn.ensemble import RandomForestClassifier      # For training the algorithm
from sklearn.ensemble import ExtraTreesClassifier        # For training the algorithm
import joblib                                            # For saving algorithm and preprocessing objects

In [2]:
# Load the dataset from the raw file url.
df = pd.read_csv('https://raw.githubusercontent.com/SRaiz/Krishi-Karma/master/Final_Crop_Yield_Karnataka.csv', 
                 skipinitialspace=True)

x_cols = [col for col in df.columns if ((col != 'yield_class') & (col != 'key'))]

# Set the input data(matrix) and the target column
X = df[x_cols]
y = df['yield_class']

# Show first 5 rows of data
df.head()

Unnamed: 0,key,state_name,district_name,crop_year,season,crop,area,min_rainfall,max_rainfall,mean_rainfall,annual_rainfall,production,yield,yield_class
0,Karnataka-Bagalkot-Arhar/Tur-2000-Kharif,Karnataka,Bagalkot,2000,Kharif,Arhar/Tur,4984.0,0.0,173.079,71.463,857.558,3693.0,0.741,Good
1,Karnataka-Bagalkot-Bajra-2000-Kharif,Karnataka,Bagalkot,2000,Kharif,Bajra,41232.0,0.0,173.079,71.463,857.558,41300.0,1.002,Very Good
2,Karnataka-Bagalkot-Castor seed-2000-Kharif,Karnataka,Bagalkot,2000,Kharif,Castor seed,52.0,0.0,173.079,71.463,857.558,67.0,1.288,Good
3,Karnataka-Bagalkot-Cotton(lint)-2000-Kharif,Karnataka,Bagalkot,2000,Kharif,Cotton(lint),15767.0,0.0,173.079,71.463,857.558,23002.0,1.459,Good
4,Karnataka-Bagalkot-Groundnut-2000-Kharif,Karnataka,Bagalkot,2000,Kharif,Groundnut,14487.0,0.0,173.079,71.463,857.558,9664.0,0.667,Good


In [3]:
# Before processing our data we will be splitting it into a testing and training data set. We will use 30% for testing

# data split train / test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)

In [4]:
train_mode = dict(X_train.mode().iloc[0])
print(train_mode)

{'state_name': 'Karnataka', 'district_name': 'Davangere', 'crop_year': 2002.0, 'season': 'Kharif     ', 'crop': 'Maize', 'area': 2.0, 'min_rainfall': 0.0, 'max_rainfall': 0.0, 'mean_rainfall': 0.0, 'annual_rainfall': 0.0, 'production': 2.0, 'yield': 1.0}


In [5]:
# Now we will be converting the categorical variables into numbers. I am using LabelEncoder from sklearn package 
# to do so
from sklearn import utils

# convert categoricals
encoders = {}
for column in ['state_name', 'district_name', 'season', 'crop']:
    categorical_convert = LabelEncoder()
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    encoders[column] = X_train[column]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[column] = categorical_convert.fit_transform(X_train[column])


In [6]:
# train the Random Forest algorithm
rf = RandomForestClassifier(n_estimators = 100)
rf = rf.fit(X_train, y_train)

In [7]:
# train the Extra Trees algorithm
et = ExtraTreesClassifier(n_estimators = 100)
et = et.fit(X_train, y_train)

In [8]:
# Now, we need to save the algorithm that we have created. The important thing to notice is that the ML algorithm 
# is not only the rf and et variable (with model weights), but there is also a need to save pre-processing 
# variables i.e. train_mode and encoders as well. For saving, we will use joblib package. 

# save the preprocessing variables and the RF and et algorithm
joblib.dump( train_mode, "./train_mode.joblib", compress=True )
joblib.dump( encoders, "./encoders.joblib", compress=True )
joblib.dump( rf, "./random_forest.joblib", compress=True )
joblib.dump( et, "./extra_trees.joblib", compress=True )

['./extra_trees.joblib']