In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Importing Train dataset

In [23]:
df=pd.read_csv('train.csv')
df

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,F00000001,188,1,0,1,0,0.0,0,1,0
1,F00000003,209,1,0,1,0,0.0,0,2,1
2,F00000004,257,1,0,1,0,0.0,0,2,1
3,F00000005,257,1,1,1,0,0.0,0,2,1
4,F00000006,342,1,0,1,0,0.0,0,2,1
...,...,...,...,...,...,...,...,...,...,...
88853,F00155935,3337,1,0,2,10,12.0,44,3,0
88854,F00155938,3516,1,0,2,10,20.0,38,1,0
88855,F00155939,3516,1,0,2,15,40.0,8,2,0
88856,F00155942,3702,1,0,2,10,25.0,18,3,0


In [24]:
df.isna().sum()

ID                            0
Estimated_Insects_Count       0
Crop_Type                     0
Soil_Type                     0
Pesticide_Use_Category        0
Number_Doses_Week             0
Number_Weeks_Used          9000
Number_Weeks_Quit             0
Season                        0
Crop_Damage                   0
dtype: int64

# Dealing with missing values

In [25]:
df1=df.fillna(df['Number_Weeks_Used'].median())
df1

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,F00000001,188,1,0,1,0,0.0,0,1,0
1,F00000003,209,1,0,1,0,0.0,0,2,1
2,F00000004,257,1,0,1,0,0.0,0,2,1
3,F00000005,257,1,1,1,0,0.0,0,2,1
4,F00000006,342,1,0,1,0,0.0,0,2,1
...,...,...,...,...,...,...,...,...,...,...
88853,F00155935,3337,1,0,2,10,12.0,44,3,0
88854,F00155938,3516,1,0,2,10,20.0,38,1,0
88855,F00155939,3516,1,0,2,15,40.0,8,2,0
88856,F00155942,3702,1,0,2,10,25.0,18,3,0


In [26]:
df1.isna().any()

ID                         False
Estimated_Insects_Count    False
Crop_Type                  False
Soil_Type                  False
Pesticide_Use_Category     False
Number_Doses_Week          False
Number_Weeks_Used          False
Number_Weeks_Quit          False
Season                     False
Crop_Damage                False
dtype: bool

In [27]:
df1['Crop_Damage'].value_counts()

0    74238
1    12307
2     2313
Name: Crop_Damage, dtype: int64

In [28]:
import random

# Dealing with class imbalance

In [29]:
a=df1[df1['Crop_Damage']==0].index
b=df1[df1['Crop_Damage']==1].index
c=df1[df1['Crop_Damage']==2].index


random.seed(0)

a=np.random.choice(a,size=len(c))
b=np.random.choice(b,size=len(c))
c=np.asarray(c)

new_indexes=np.concatenate((a,b,c))

df1_=df1.loc[new_indexes,]
df1_

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
9715,F00017178,410,1,0,2,10,9.0,13,2,0
69291,F00121530,2267,1,1,3,10,39.0,0,3,0
62515,F00109586,577,0,1,2,40,23.0,17,1,0
34501,F00060591,677,0,1,2,5,15.0,14,2,0
20955,F00036825,2139,1,0,3,10,21.0,0,3,0
...,...,...,...,...,...,...,...,...,...,...
88470,F00155221,3165,0,1,2,40,40.0,18,1,2
88479,F00155242,3516,0,1,2,25,50.0,4,3,2
88493,F00155265,3896,0,1,2,60,48.0,10,1,2
88552,F00155357,448,1,0,2,20,20.0,11,2,2


In [30]:
X=df1_.drop(columns=['ID','Crop_Damage'])
X

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
9715,410,1,0,2,10,9.0,13,2
69291,2267,1,1,3,10,39.0,0,3
62515,577,0,1,2,40,23.0,17,1
34501,677,0,1,2,5,15.0,14,2
20955,2139,1,0,3,10,21.0,0,3
...,...,...,...,...,...,...,...,...
88470,3165,0,1,2,40,40.0,18,1
88479,3516,0,1,2,25,50.0,4,3
88493,3896,0,1,2,60,48.0,10,1
88552,448,1,0,2,20,20.0,11,2


In [31]:
y=df1_['Crop_Damage']
y

9715     0
69291    0
62515    0
34501    0
20955    0
        ..
88470    2
88479    2
88493    2
88552    2
88741    2
Name: Crop_Damage, Length: 6939, dtype: int64

In [32]:
X.describe()

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
count,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0
mean,1622.383341,0.280012,0.445165,2.401787,25.657876,32.694625,7.059519,1.908056
std,884.919238,0.449037,0.49702,0.562063,15.00779,13.151675,9.471555,0.696669
min,150.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,916.0,0.0,0.0,2.0,15.0,25.0,0.0,1.0
50%,1478.0,0.0,0.0,2.0,20.0,32.0,1.0,2.0
75%,2139.0,1.0,1.0,3.0,40.0,42.0,13.0,2.0
max,4097.0,1.0,1.0,3.0,95.0,66.0,48.0,3.0


# Scaling X

In [33]:
s=MinMaxScaler()

In [34]:
new_X=pd.DataFrame(s.fit_transform(X))


In [77]:
new_X.columns=X.columns
new_X.index=X.index
new_X

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
9715,0.065873,1.0,0.0,0.5,0.105263,0.136364,0.270833,0.5
69291,0.536357,1.0,1.0,1.0,0.105263,0.590909,0.000000,1.0
62515,0.108183,0.0,1.0,0.5,0.421053,0.348485,0.354167,0.0
34501,0.133519,0.0,1.0,0.5,0.052632,0.227273,0.291667,0.5
20955,0.503927,1.0,0.0,1.0,0.105263,0.318182,0.000000,1.0
...,...,...,...,...,...,...,...,...
88470,0.763871,0.0,1.0,0.5,0.421053,0.606061,0.375000,0.0
88479,0.852800,0.0,1.0,0.5,0.263158,0.757576,0.083333,1.0
88493,0.949075,0.0,1.0,0.5,0.631579,0.727273,0.208333,0.0
88552,0.075500,1.0,0.0,0.5,0.210526,0.303030,0.229167,0.5


In [78]:
new_X.describe()

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
count,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0
mean,0.373039,0.280012,0.445165,0.700894,0.270083,0.495373,0.147073,0.454028
std,0.2242,0.449037,0.49702,0.281032,0.157977,0.199268,0.197324,0.348334
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.194071,0.0,0.0,0.5,0.157895,0.378788,0.0,0.0
50%,0.336458,0.0,0.0,0.5,0.210526,0.484848,0.020833,0.5
75%,0.503927,1.0,1.0,1.0,0.421053,0.636364,0.270833,0.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [79]:
y.describe()

count    6939.000000
mean        1.000000
std         0.816555
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: Crop_Damage, dtype: float64

In [80]:
y.value_counts()

2    2313
1    2313
0    2313
Name: Crop_Damage, dtype: int64

In [81]:
y1=y.map({0:'Low',1:'Medium',2:'High'})
y2=pd.get_dummies(y1)
y2

Unnamed: 0,High,Low,Medium
9715,0,1,0
69291,0,1,0
62515,0,1,0
34501,0,1,0
20955,0,1,0
...,...,...,...
88470,1,0,0
88479,1,0,0
88493,1,0,0
88552,1,0,0


# HIGH

In [82]:
X_train,X_test,y_train,y_test=train_test_split(new_X,y2['High'],test_size=0.2,random_state=21)


In [83]:
X_train

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
62229,0.503674,0.0,0.0,0.5,0.210526,0.530303,0.312500,0.5
54523,0.414239,0.0,0.0,0.5,0.105263,0.212121,0.666667,0.0
52291,0.899924,0.0,0.0,0.5,0.105263,0.469697,0.458333,0.5
30808,0.606030,0.0,0.0,1.0,0.105263,0.606061,0.000000,1.0
27980,0.040790,0.0,1.0,1.0,0.526316,0.318182,0.000000,0.5
...,...,...,...,...,...,...,...,...
71667,0.387129,1.0,0.0,1.0,0.210526,0.742424,0.000000,0.5
23460,0.336458,1.0,0.0,1.0,0.052632,0.727273,0.000000,0.5
32783,0.161895,1.0,0.0,0.5,0.105263,0.348485,0.166667,1.0
50151,0.269065,0.0,1.0,0.5,0.210526,0.257576,0.312500,0.5


In [92]:
y_train

62229    0
54523    0
52291    0
30808    0
27980    0
        ..
71667    1
23460    0
32783    0
50151    1
25959    1
Name: High, Length: 5551, dtype: uint8

In [84]:
lr=LogisticRegression()

In [85]:
lr.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [86]:
predhigh=lr.predict_proba(X_test)
predhigh=predhigh[:,1]
predhigh

array([0.45345544, 0.3988246 , 0.48412285, ..., 0.16909383, 0.29032609,
       0.2354219 ])

# MEDIUM

In [93]:
y_train_med=y2['Medium'][y_train.index]
y_test_med=y2['Medium'][y_test.index]


In [94]:
y_train_med

62229    1
54523    0
52291    1
30808    1
27980    0
        ..
71667    0
23460    0
32783    0
50151    0
25959    0
Name: Medium, Length: 5916, dtype: uint8

In [91]:
X_train

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
62229,0.503674,0.0,0.0,0.5,0.210526,0.530303,0.312500,0.5
54523,0.414239,0.0,0.0,0.5,0.105263,0.212121,0.666667,0.0
52291,0.899924,0.0,0.0,0.5,0.105263,0.469697,0.458333,0.5
30808,0.606030,0.0,0.0,1.0,0.105263,0.606061,0.000000,1.0
27980,0.040790,0.0,1.0,1.0,0.526316,0.318182,0.000000,0.5
...,...,...,...,...,...,...,...,...
71667,0.387129,1.0,0.0,1.0,0.210526,0.742424,0.000000,0.5
23460,0.336458,1.0,0.0,1.0,0.052632,0.727273,0.000000,0.5
32783,0.161895,1.0,0.0,0.5,0.105263,0.348485,0.166667,1.0
50151,0.269065,0.0,1.0,0.5,0.210526,0.257576,0.312500,0.5


In [88]:
lr_medium=LogisticRegression()

In [89]:
lr_medium.fit(X_train,y_train_med)



ValueError: Found input variables with inconsistent numbers of samples: [5551, 5916]

In [51]:
predmedium=lr_medium.predict_proba(X_test)
predmedium=predmedium[:,1]
predmedium

array([0.35682547, 0.38084871, 0.44106824, ..., 0.31500719, 0.39495088,
       0.35475527])

# LOW

In [52]:
X_train,X_test,y_train,y_test=train_test_split(new_X,y2['Low'],test_size=0.2)


In [53]:
lr_low=LogisticRegression()

In [55]:
lr_low.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
predlow=lr_low.predict_proba(X_test)
predlow=predlow[:,1]
predlow

array([0.14790065, 0.13797333, 0.43097792, ..., 0.35557374, 0.6895121 ,
       0.60961405])

In [57]:
outcome=pd.DataFrame({'High':predhigh,'Medium':predmedium,'Low':predlow})
outcome

Unnamed: 0,High,Medium,Low
0,0.132678,0.356825,0.147901
1,0.348257,0.380849,0.137973
2,0.355142,0.441068,0.430978
3,0.056417,0.392059,0.115420
4,0.534650,0.399933,0.338362
...,...,...,...
1383,0.335638,0.272893,0.386886
1384,0.258918,0.286871,0.471587
1385,0.397843,0.315007,0.355574
1386,0.155697,0.394951,0.689512


In [63]:
outcome['result']=outcome.idxmax(axis=1)
outcome

Unnamed: 0,High,Medium,Low,result
0,0.132678,0.356825,0.147901,Medium
1,0.348257,0.380849,0.137973,Medium
2,0.355142,0.441068,0.430978,Medium
3,0.056417,0.392059,0.115420,Medium
4,0.534650,0.399933,0.338362,High
...,...,...,...,...
1383,0.335638,0.272893,0.386886,Low
1384,0.258918,0.286871,0.471587,Low
1385,0.397843,0.315007,0.355574,High
1386,0.155697,0.394951,0.689512,Low


In [64]:
outcome['result']=outcome['result'].map({'Low':0,'Medium':1,'High':2})
outcome

Unnamed: 0,High,Medium,Low,result
0,0.132678,0.356825,0.147901,1
1,0.348257,0.380849,0.137973,1
2,0.355142,0.441068,0.430978,1
3,0.056417,0.392059,0.115420,1
4,0.534650,0.399933,0.338362,2
...,...,...,...,...
1383,0.335638,0.272893,0.386886,0
1384,0.258918,0.286871,0.471587,0
1385,0.397843,0.315007,0.355574,2
1386,0.155697,0.394951,0.689512,0
