# Steps
# 1 Import the Libraries
# 2 Load the data
# 3 Preprocessing
* 3.1 OneHotEncoding
* 3.2 Standard Scaling
# 4 Data preparation
# 5 Model Training
# 6 Model Evaluation

# 1 Import the libraries

In [1]:
import warnings

import numpy as np
import pandas as pd
# `plt` must be the pyplot submodule: the top-level `matplotlib` package
# does not expose plot()/subplots()/show().
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# 2 Load the data

In [2]:
# Read both competition splits from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Row/column counts of each split; the training split has one extra column,
# the target 'Fertilizer Name'.
for split in (train_data, test_data):
    print(split.shape)
print(train_data.columns)

(750000, 10)
(250000, 9)
Index(['id', 'Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
       'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name'],
      dtype='object')


In [4]:
# Combine train and test into one frame so that every preprocessing step
# below is applied to both splits identically.
# The test split has no target, so it receives the placeholder label 'x'.
test_data['Fertilizer Name'] = 'x'
df = pd.concat([train_data, test_data], axis=0).set_index('id')
print(df.shape)  # ten lakh (1,000,000) rows in total

(1000000, 9)


In [5]:
# Display the combined frame; test rows are recognisable by the 'x' placeholder target.
df

Unnamed: 0_level_0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,27,69,65,Sandy,Millets,30,6,18,28-28
2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...
999995,26,66,30,Red,Sugarcane,14,7,18,x
999996,33,62,55,Red,Pulses,28,14,7,x
999997,36,53,64,Black,Paddy,28,11,27,x
999998,36,67,26,Clayey,Paddy,33,0,10,x


In [6]:
# Most headers only need lower-casing and spaces swapped for underscores.
col_dict = {
    header: header.lower().replace(' ', '_')
    for header in [
        'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
        'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name',
    ]
}
# The source file misspells this header, so it is mapped explicitly.
col_dict['Temparature'] = 'temperature'

df = df.rename(columns=col_dict)

In [7]:
# Confirm that the rename produced snake_case column names.
df.columns

Index(['temperature', 'humidity', 'moisture', 'soil_type', 'crop_type',
       'nitrogen', 'potassium', 'phosphorous', 'fertilizer_name'],
      dtype='object')

In [8]:
# Column dtypes and non-null counts for the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   temperature      1000000 non-null  int64 
 1   humidity         1000000 non-null  int64 
 2   moisture         1000000 non-null  int64 
 3   soil_type        1000000 non-null  object
 4   crop_type        1000000 non-null  object
 5   nitrogen         1000000 non-null  int64 
 6   potassium        1000000 non-null  int64 
 7   phosphorous      1000000 non-null  int64 
 8   fertilizer_name  1000000 non-null  object
dtypes: int64(6), object(3)
memory usage: 76.3+ MB


In [9]:
# Summary statistics of the numeric columns, rounded to one decimal place.
df.describe().round(1)

Unnamed: 0,temperature,humidity,moisture,nitrogen,potassium,phosphorous
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,31.5,61.0,45.2,23.1,9.5,21.1
std,4.0,6.6,11.8,11.2,5.8,12.4
min,25.0,50.0,25.0,4.0,0.0,0.0
25%,28.0,55.0,35.0,13.0,4.0,10.0
50%,32.0,61.0,45.0,23.0,9.0,21.0
75%,35.0,67.0,55.0,33.0,14.0,32.0
max,38.0,72.0,65.0,42.0,19.0,42.0


In [10]:
# Missing values per column (the recorded output shows none).
df.isnull().sum()

temperature        0
humidity           0
moisture           0
soil_type          0
crop_type          0
nitrogen           0
potassium          0
phosphorous        0
fertilizer_name    0
dtype: int64

In [11]:
# Count fully duplicated rows; the recorded output is 0, so nothing is dropped.
df.duplicated().sum()
# If duplicates ever appear, df.drop_duplicates() would remove them.

np.int64(0)

In [12]:
# Preview the first two rows after renaming.
df.head(2)

Unnamed: 0_level_0,temperature,humidity,moisture,soil_type,crop_type,nitrogen,potassium,phosphorous,fertilizer_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,27,69,65,Sandy,Millets,30,6,18,28-28


In [13]:
# Level frequencies of the two categorical features
# (recorded output: 5 soil types, 11 crop types).
for categorical_col in ('soil_type', 'crop_type'):
    print(df[categorical_col].value_counts())
    print('--------------------------------')
# Target frequencies: 7 fertilizers plus the 'x' placeholder of the test rows.
df['fertilizer_name'].value_counts()

soil_type
Sandy     209033
Black     201103
Clayey    198174
Red       197461
Loamy     194229
Name: count, dtype: int64
--------------------------------
crop_type
Paddy          114086
Pulses         104401
Cotton          92262
Tobacco         90728
Wheat           88722
Millets         87180
Barley          86761
Sugarcane       85836
Oil seeds       85711
Maize           83472
Ground Nuts     80841
Name: count, dtype: int64
--------------------------------


fertilizer_name
x           250000
14-35-14    114436
10-26-26    113887
17-17-17    112453
28-28       111158
20-20       110889
DAP          94860
Urea         92317
Name: count, dtype: int64

In [14]:
# Integer class codes for the target: the position in this list is the code.
# Codes 0-6 are the real fertilizers; 7 is reserved for the 'x' placeholder
# carried by the unlabeled test rows.
fname_dict = {
    label: code
    for code, label in enumerate(
        ['Urea', '14-35-14', '10-26-26', '17-17-17', '28-28', '20-20', 'DAP', 'x']
    )
}

# Replace the text labels with their integer codes, then check the class counts.
df['fertilizer_name'] = df['fertilizer_name'].map(fname_dict)
df['fertilizer_name'].value_counts()

fertilizer_name
7    250000
1    114436
2    113887
3    112453
4    111158
5    110889
6     94860
0     92317
Name: count, dtype: int64

In [15]:
# Inspect the encoded target; the trailing test rows carry the placeholder code 7.
df['fertilizer_name']

id
0         4
1         4
2         3
3         2
4         6
         ..
999995    7
999996    7
999997    7
999998    7
999999    7
Name: fertilizer_name, Length: 1000000, dtype: int64

# 3 Preprocessing

## 3.1 OneHotEncoding with dummy variables

In [16]:
# Every column except the target is a model feature. Selecting by name
# (instead of the positional slice `iloc[:, :8]`) keeps this correct even if
# columns are added or reordered upstream.
feature_columns = df.columns.drop('fertilizer_name')

# Text-valued (object dtype) features; these are one-hot encoded later.
# On the recorded data: ['soil_type', 'crop_type']
category_column_names = [col for col in feature_columns if df[col].dtype == 'object']

# All remaining features are treated as numeric.
# On the recorded data:
# ['temperature', 'humidity', 'moisture', 'nitrogen', 'potassium', 'phosphorous']
numeric_column_names = [col for col in feature_columns if df[col].dtype != 'object']

In [17]:
# One-hot encode the categorical features; dtype=int yields 0/1 integer
# indicator columns (not booleans).
dummy_df = pd.get_dummies(df[category_column_names], dtype=int)

# Swap the original text columns for their indicator columns.
merged_df = pd.concat(
    [df.drop(columns=category_column_names), dummy_df],
    axis=1,
)

In [18]:
# 23 columns in the recorded output: 22 independent features plus the target.
merged_df.shape

(1000000, 23)

In [19]:
# Verify that every column of the encoded frame is numeric and non-null.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 999999
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype
---  ------                 --------------    -----
 0   temperature            1000000 non-null  int64
 1   humidity               1000000 non-null  int64
 2   moisture               1000000 non-null  int64
 3   nitrogen               1000000 non-null  int64
 4   potassium              1000000 non-null  int64
 5   phosphorous            1000000 non-null  int64
 6   fertilizer_name        1000000 non-null  int64
 7   soil_type_Black        1000000 non-null  int64
 8   soil_type_Clayey       1000000 non-null  int64
 9   soil_type_Loamy        1000000 non-null  int64
 10  soil_type_Red          1000000 non-null  int64
 11  soil_type_Sandy        1000000 non-null  int64
 12  crop_type_Barley       1000000 non-null  int64
 13  crop_type_Cotton       1000000 non-null  int64
 14  crop_type_Ground Nuts  1000000 non-null  int64
 15  crop

In [20]:
# Correlation of every column with the encoded target (pandas default method).
# NOTE(review): the target codes come from an arbitrary label-to-integer mapping
# and the frame still contains the test rows carrying the placeholder code 7,
# so these coefficients are not a reliable measure of feature relevance.
merged_df.corr()['fertilizer_name']

temperature             -0.001021
humidity                 0.003147
moisture                 0.002780
nitrogen                 0.001770
potassium               -0.001834
phosphorous             -0.000665
fertilizer_name          1.000000
soil_type_Black          0.000148
soil_type_Clayey         0.001346
soil_type_Loamy         -0.001301
soil_type_Red           -0.002081
soil_type_Sandy          0.001837
crop_type_Barley        -0.002639
crop_type_Cotton         0.000342
crop_type_Ground Nuts   -0.001857
crop_type_Maize         -0.003438
crop_type_Millets       -0.002713
crop_type_Oil seeds      0.001418
crop_type_Paddy          0.000538
crop_type_Pulses        -0.001663
crop_type_Sugarcane      0.002596
crop_type_Tobacco        0.004636
crop_type_Wheat          0.002632
Name: fertilizer_name, dtype: float64

## 3.2 Standard Scaling

In [21]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (the target is excluded) and keep the
# result as a NumPy array in the original row order.
# NOTE(review): the scaler statistics are computed on train and test rows
# together; fitting on the training rows only would avoid using test data here.
scaler_input = merged_df.drop(columns=['fertilizer_name'])
scalar = StandardScaler()
merged_df_t = scalar.fit_transform(scaler_input)
merged_df_t

array([[ 1.3662452 ,  1.34834076, -0.77883206, ...,  3.26345269,
        -0.31588119, -0.31202559],
       [-1.11810167,  1.19784851,  1.67999831, ..., -0.30642393,
        -0.31588119, -0.31202559],
       [-0.6212323 ,  0.29489499, -1.11798108, ..., -0.30642393,
        -0.31588119, -0.31202559],
       ...,
       [ 1.11781051, -1.21002754,  1.59521105, ..., -0.30642393,
        -0.31588119, -0.31202559],
       [ 1.11781051,  0.896864  , -1.6267046 , ..., -0.30642393,
        -0.31588119, -0.31202559],
       [ 0.86937582,  0.29489499, -0.77883206, ..., -0.30642393,
        -0.31588119, -0.31202559]])

# 4 Data Preparation

In [22]:
# The combined frame was built as [train rows, test rows], so the labelled
# rows are the leading len(train_data) rows. Deriving the count from the data
# avoids a hard-coded 750000 that would silently break on a different file.
n_train = len(train_data)

X = merged_df_t[:n_train]
# .iloc makes the positional intent explicit on the id-indexed Series.
y = df['fertilizer_name'].iloc[:n_train].values

# Sanity checks: features and labels line up, and no placeholder label
# from the test rows leaked into the training target.
assert len(X) == len(y) == n_train
assert (y != fname_dict['x']).all()

In [23]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable. (train_test_split is imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# 5 Model Training

In [24]:
# Random forest baseline with default hyperparameters.
# A fixed random_state (same seed as the train/validation split) makes the
# score repeatable; without it each run gives a slightly different accuracy.
model1 = RandomForestClassifier(random_state=42)
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

accuracy_score(y_test, y_pred)  # hold-out accuracy

0.16458666666666666

In [25]:
# XGBoost gradient-boosted trees with library-default hyperparameters.
model2 = XGBClassifier().fit(X_train, y_train)
y_pred = model2.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)  # recorded run: 0.19522

0.19522

In [26]:
# Linear baseline: logistic regression with default settings.
model3 = LogisticRegression().fit(X_train, y_train)
y_pred = model3.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)  # recorded run: 0.15909333333333334

0.15909333333333334

In [27]:
# Single decision tree with default hyperparameters.
# A fixed random_state (same seed as the train/validation split) makes the
# score repeatable; without it each run gives a slightly different accuracy.
model4 = DecisionTreeClassifier(random_state=42)
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)

accuracy_score(y_test, y_pred)  # hold-out accuracy

0.15218666666666666

In [28]:
# LightGBM gradient boosting with library-default hyperparameters.
model5 = LGBMClassifier().fit(X_train, y_train)
y_pred = model5.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)  # recorded run: 0.18944666666666668

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 234
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 22
[LightGBM] [Info] Start training from score -2.091474
[LightGBM] [Info] Start training from score -1.877350
[LightGBM] [Info] Start training from score -1.885565
[LightGBM] [Info] Start training from score -1.900582
[LightGBM] [Info] Start training from score -1.910836
[LightGBM] [Info] Start training from score -1.909654
[LightGBM] [Info] Start training from score -2.069993


0.18944666666666668

In [29]:
# CatBoost gradient boosting; verbose=0 turns off the per-iteration log.
model6 = CatBoostClassifier(verbose=0).fit(X_train, y_train)
y_pred = model6.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)  # recorded run: 0.19248666666666667

0.19248666666666667

#### Insights 
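* Hold-out accuracy ranking from the recorded runs: XGBoost (≈0.195) > CatBoost (≈0.192) > LightGBM (≈0.189) > RandomForest (≈0.165) > LogisticRegression (≈0.159) > DecisionTree (≈0.152).
* With 7 fertilizer classes, uniform random guessing would score about 1/7 ≈ 0.143, so every model is only slightly better than chance.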