In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option("display.max_columns" , None)
pd.set_option("display.max_rows" , None)

# #Read train data

In [2]:
soil_data = pd.read_excel(r"C:\Users\home\Desktop\Skillenza\Stage3\train.xlsx")

In [3]:
soil_data.head()

Unnamed: 0,ID,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season,Crop_status
0,1,188,Feed,clay,1,0,0.0,0,1,0
1,2,209,Feed,clay,1,0,0.0,0,2,1
2,3,257,Feed,clay,1,0,0.0,0,2,1
3,4,257,Feed,silt,1,0,0.0,0,2,1
4,5,342,Feed,clay,1,0,0.0,0,2,1


## #Removing id column

In [4]:
soil_data1 = soil_data.drop(["ID"] , axis = 1)

In [5]:
soil_data1.head(2)

Unnamed: 0,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season,Crop_status
0,188,Feed,clay,1,0,0.0,0,1,0
1,209,Feed,clay,1,0,0.0,0,2,1


## #Checking null counts

In [6]:
soil_data1.isnull().sum()

Insects                          0
Crop                             0
Soil                             0
Category_of_Toxicant             0
Does_count                       0
Number_of_Weeks_Used          8055
Number_Weeks_does_not used       0
Season                           0
Crop_status                      0
dtype: int64

In [7]:
soil_data1.shape

(80000, 9)

## #Replacing nulls with respective values

In [8]:
#soil_data1.Number_of_Weeks_Used.value_counts()

In [9]:
soil_data1.Number_of_Weeks_Used = soil_data1.Number_of_Weeks_Used.fillna(soil_data1.Number_of_Weeks_Used.mean())

In [10]:
soil_data1.isnull().sum()

Insects                       0
Crop                          0
Soil                          0
Category_of_Toxicant          0
Does_count                    0
Number_of_Weeks_Used          0
Number_Weeks_does_not used    0
Season                        0
Crop_status                   0
dtype: int64

In [11]:
soil_data1.Crop.value_counts()

Food    57333
Feed    22667
Name: Crop, dtype: int64

In [12]:
soil_data1.Soil.value_counts()

clay    43556
silt    36444
Name: Soil, dtype: int64

## #Converting objects to integers

In [13]:
soil_data1.Crop.replace({"Feed" : 0 , "Food" : 1} , inplace = True)
soil_data1.Soil.replace({"clay" : 0 , "silt" : 1} , inplace = True)

## #Splitting train data into X and Y

In [14]:
soil_data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Insects                     80000 non-null  int64  
 1   Crop                        80000 non-null  int64  
 2   Soil                        80000 non-null  int64  
 3   Category_of_Toxicant        80000 non-null  int64  
 4   Does_count                  80000 non-null  int64  
 5   Number_of_Weeks_Used        80000 non-null  float64
 6   Number_Weeks_does_not used  80000 non-null  int64  
 7   Season                      80000 non-null  int64  
 8   Crop_status                 80000 non-null  int64  
dtypes: float64(1), int64(8)
memory usage: 5.5 MB


In [15]:
X = soil_data1.iloc[:, [0,1,2,3,4,5,6,7]]
Y = soil_data1.iloc[:, -1]

In [16]:
X.head()

Unnamed: 0,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season
0,188,0,0,1,0,0.0,0,1
1,209,0,0,1,0,0.0,0,2
2,257,0,0,1,0,0.0,0,2
3,257,0,1,1,0,0.0,0,2
4,342,0,0,1,0,0.0,0,2


In [17]:
Y.head()

0    0
1    1
2    1
3    1
4    1
Name: Crop_status, dtype: int64

### #Train test split 

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test , Y_train , Y_test = train_test_split(X , Y , test_size=.2, random_state = 0)

# #Model Building using

# Gradient Boosting---------------

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()


## #Fit the model

In [21]:
gbc.fit(X_train , Y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

## #Prediction

In [22]:
pred_value = gbc.predict(X_test)
pred_value


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## #Confusion Matrix 

In [23]:
from sklearn.metrics import confusion_matrix

In [24]:
cm1 = confusion_matrix(pred_value , Y_test)
cm1

array([[13214,  1916,   330],
       [  143,   308,    88],
       [    1,     0,     0]], dtype=int64)

In [25]:
cm1.diagonal().sum() * 100 / cm1.sum()

84.5125

# #Read test data 

In [26]:
soil_test = pd.read_excel(r"C:\Users\home\Desktop\Skillenza\Stage3\test.xlsx")

In [27]:
soil_test.head()

Unnamed: 0,ID,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season
0,1,188,Feed,silt,1,0,,0,2
1,2,410,Feed,silt,1,0,0.0,0,2
2,3,626,Feed,clay,1,0,0.0,0,2
3,4,731,Feed,clay,1,0,0.0,0,2
4,5,789,Food,clay,1,0,0.0,0,1


In [28]:
soil_test.shape

(35000, 9)

## #Removing ID column

In [29]:
soil_test1 =soil_test.drop(["ID"] , axis = 1)

## #Checking nulls in test data

In [30]:
soil_test1.isnull().sum()

Insects                          0
Crop                             0
Soil                             0
Category_of_Toxicant             0
Does_count                       0
Number_of_Weeks_Used          3542
Number_Weeks_does_not used       0
Season                           0
dtype: int64

## #Replacing nulls with respective values

In [31]:
soil_test1.Number_of_Weeks_Used = soil_test1.Number_of_Weeks_Used.fillna(soil_test1.Number_of_Weeks_Used.mean())

In [32]:
soil_test1.isnull().sum()

Insects                       0
Crop                          0
Soil                          0
Category_of_Toxicant          0
Does_count                    0
Number_of_Weeks_Used          0
Number_Weeks_does_not used    0
Season                        0
dtype: int64

In [33]:
soil_test1.head()

Unnamed: 0,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season
0,188,Feed,silt,1,0,28.775701,0,2
1,410,Feed,silt,1,0,0.0,0,2
2,626,Feed,clay,1,0,0.0,0,2
3,731,Feed,clay,1,0,0.0,0,2
4,789,Food,clay,1,0,0.0,0,1


## #Converting all objects into integers

In [34]:
soil_test1.Crop.replace({"Feed" : 0 , "Food" : 1} , inplace = True)
soil_test1.Soil.replace({"clay" : 0 , "silt" : 1} , inplace = True)

In [35]:
soil_test1.shape

(35000, 8)

In [36]:
soil_test1_x = soil_test1.iloc[: , 0:]

In [37]:
soil_test1_x.head()

Unnamed: 0,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season
0,188,0,1,1,0,28.775701,0,2
1,410,0,1,1,0,0.0,0,2
2,626,0,0,1,0,0.0,0,2
3,731,0,0,1,0,0.0,0,2
4,789,1,0,1,0,0.0,0,1


In [38]:
X1 = soil_data1.iloc[:, [0,1,2,3,4,5,6,7]]
Y1 = soil_data1.iloc[:,-1]

In [39]:
X1.head()

Unnamed: 0,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season
0,188,0,0,1,0,0.0,0,1
1,209,0,0,1,0,0.0,0,2
2,257,0,0,1,0,0.0,0,2
3,257,0,1,1,0,0.0,0,2
4,342,0,0,1,0,0.0,0,2


In [40]:
Y1.head()

0    0
1    1
2    1
3    1
4    1
Name: Crop_status, dtype: int64

## #fit the test model 

In [41]:
gbc.fit(X1, Y1)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

## #Prediction

In [42]:
pred_test = gbc.predict(soil_test1_x)

# #Final DataFrame

In [43]:
df = pd.DataFrame(pred_test)

In [44]:
df = pd.concat([soil_test.ID , df] , axis = 1)

In [45]:
df.head()

Unnamed: 0,ID,0
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1


## #Renaming the column

In [46]:
df.rename(columns = {0 : "Crop_status"} , inplace =True)

In [47]:
df.head()

Unnamed: 0,ID,Crop_status
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1


# #Final output to csv file 

In [50]:
writer =  pd.ExcelWriter("Stage3.xlsx")

In [51]:
df.to_excel(writer)

writer.save()

print("Dataframe has been successfully written to csv file")

Dataframe has been successfully written to csv file
