In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Importing Train dataset

In [2]:
df=pd.read_csv('train.csv')
df

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,F00000001,188,1,0,1,0,0.0,0,1,0
1,F00000003,209,1,0,1,0,0.0,0,2,1
2,F00000004,257,1,0,1,0,0.0,0,2,1
3,F00000005,257,1,1,1,0,0.0,0,2,1
4,F00000006,342,1,0,1,0,0.0,0,2,1
...,...,...,...,...,...,...,...,...,...,...
88853,F00155935,3337,1,0,2,10,12.0,44,3,0
88854,F00155938,3516,1,0,2,10,20.0,38,1,0
88855,F00155939,3516,1,0,2,15,40.0,8,2,0
88856,F00155942,3702,1,0,2,10,25.0,18,3,0


In [3]:
df.isna().sum()

ID                            0
Estimated_Insects_Count       0
Crop_Type                     0
Soil_Type                     0
Pesticide_Use_Category        0
Number_Doses_Week             0
Number_Weeks_Used          9000
Number_Weeks_Quit             0
Season                        0
Crop_Damage                   0
dtype: int64

# Dealing with missing values

In [4]:
df1=df.fillna(df['Number_Weeks_Used'].median())
df1

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,F00000001,188,1,0,1,0,0.0,0,1,0
1,F00000003,209,1,0,1,0,0.0,0,2,1
2,F00000004,257,1,0,1,0,0.0,0,2,1
3,F00000005,257,1,1,1,0,0.0,0,2,1
4,F00000006,342,1,0,1,0,0.0,0,2,1
...,...,...,...,...,...,...,...,...,...,...
88853,F00155935,3337,1,0,2,10,12.0,44,3,0
88854,F00155938,3516,1,0,2,10,20.0,38,1,0
88855,F00155939,3516,1,0,2,15,40.0,8,2,0
88856,F00155942,3702,1,0,2,10,25.0,18,3,0


In [5]:
df1.isna().any()

ID                         False
Estimated_Insects_Count    False
Crop_Type                  False
Soil_Type                  False
Pesticide_Use_Category     False
Number_Doses_Week          False
Number_Weeks_Used          False
Number_Weeks_Quit          False
Season                     False
Crop_Damage                False
dtype: bool

In [6]:
df1['Crop_Damage'].value_counts()

0    74238
1    12307
2     2313
Name: Crop_Damage, dtype: int64

In [7]:
import random

# Dealing with class imbalance

In [38]:
a=df1[df1['Crop_Damage']==0].index
b=df1[df1['Crop_Damage']==1].index
c=df1[df1['Crop_Damage']==2].index


random.seed(0)

a=np.random.choice(a,size=len(c))
b=np.random.choice(b,size=len(c))
c=np.asarray(c)

new_indexes=np.concatenate((a,b,c))

df1_=df1.loc[new_indexes,]
df1_

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
13501,F00023689,448,0,0,2,30,16.0,0,3,0
10275,F00018150,1132,0,0,3,45,43.0,0,1,0
53883,F00094521,448,1,1,3,5,10.0,0,3,0
52824,F00092685,1678,0,1,2,20,45.0,3,2,0
5094,F00008960,2840,1,0,2,10,28.0,3,2,0
...,...,...,...,...,...,...,...,...,...,...
88470,F00155221,3165,0,1,2,40,40.0,18,1,2
88479,F00155242,3516,0,1,2,25,50.0,4,3,2
88493,F00155265,3896,0,1,2,60,48.0,10,1,2
88552,F00155357,448,1,0,2,20,20.0,11,2,2


In [39]:
X=df1_.drop(columns=['ID','Crop_Damage'])
X

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
13501,448,0,0,2,30,16.0,0,3
10275,1132,0,0,3,45,43.0,0,1
53883,448,1,1,3,5,10.0,0,3
52824,1678,0,1,2,20,45.0,3,2
5094,2840,1,0,2,10,28.0,3,2
...,...,...,...,...,...,...,...,...
88470,3165,0,1,2,40,40.0,18,1
88479,3516,0,1,2,25,50.0,4,3
88493,3896,0,1,2,60,48.0,10,1
88552,448,1,0,2,20,20.0,11,2


In [40]:
y=df1_['Crop_Damage']
y

13501    0
10275    0
53883    0
52824    0
5094     0
        ..
88470    2
88479    2
88493    2
88552    2
88741    2
Name: Crop_Damage, Length: 6939, dtype: int64

In [41]:
X.describe()

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
count,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0
mean,1606.279579,0.279291,0.44113,2.401211,25.43306,32.6426,6.808618,1.896815
std,879.981194,0.448683,0.496558,0.567069,14.960425,13.193909,9.211348,0.696336
min,150.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,915.0,0.0,0.0,2.0,15.0,25.0,0.0,1.0
50%,1478.0,0.0,0.0,2.0,20.0,32.0,1.0,2.0
75%,2139.0,1.0,1.0,3.0,35.0,42.0,12.0,2.0
max,4097.0,1.0,1.0,3.0,95.0,67.0,48.0,3.0


# Scaling X

In [42]:
s=MinMaxScaler()

In [43]:
new_X=pd.DataFrame(s.fit_transform(X))


In [44]:
new_X.columns=X.columns
new_X

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
0,0.075500,0.0,0.0,0.5,0.315789,0.238806,0.000000,1.0
1,0.248797,0.0,0.0,1.0,0.473684,0.641791,0.000000,0.0
2,0.075500,1.0,1.0,1.0,0.052632,0.149254,0.000000,1.0
3,0.387129,0.0,1.0,0.5,0.210526,0.671642,0.062500,0.5
4,0.681530,1.0,0.0,0.5,0.105263,0.417910,0.062500,0.5
...,...,...,...,...,...,...,...,...
6934,0.763871,0.0,1.0,0.5,0.421053,0.597015,0.375000,0.0
6935,0.852800,0.0,1.0,0.5,0.263158,0.746269,0.083333,1.0
6936,0.949075,0.0,1.0,0.5,0.631579,0.716418,0.208333,0.0
6937,0.075500,1.0,0.0,0.5,0.210526,0.298507,0.229167,0.5


In [45]:
new_X.describe()

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
count,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0,6939.0
mean,0.368959,0.279291,0.44113,0.700605,0.267716,0.487203,0.141846,0.448408
std,0.222949,0.448683,0.496558,0.283534,0.157478,0.196924,0.191903,0.348168
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.193818,0.0,0.0,0.5,0.157895,0.373134,0.0,0.0
50%,0.336458,0.0,0.0,0.5,0.210526,0.477612,0.020833,0.5
75%,0.503927,1.0,1.0,1.0,0.368421,0.626866,0.25,0.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
y.describe()

count    6939.000000
mean        1.000000
std         0.816555
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: Crop_Damage, dtype: float64

In [47]:
X_train,X_test,y_train,y_test=train_test_split(new_X,y,test_size=0.2)


# Naive Bayes

In [48]:
from sklearn.naive_bayes import GaussianNB

In [49]:
model=GaussianNB()

In [50]:
model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [51]:
prediction=model.predict(X_test)
prediction=pd.DataFrame({'pred':prediction})
prediction['pred'].value_counts()

2    757
0    473
1    158
Name: pred, dtype: int64

In [52]:
model.score(X_test,y_test)

0.5115273775216138

In [53]:
final_model=GaussianNB()

In [54]:
final_model.fit(new_X,y)

GaussianNB(priors=None, var_smoothing=1e-09)

# Importing test.csv and performing EDA

In [55]:
df_test=pd.read_csv('test.csv')

In [56]:
df_test

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
0,F00000002,188,1,1,1,0,,0,2
1,F00000007,410,1,1,1,0,0.0,0,2
2,F00000011,626,1,0,1,0,0.0,0,2
3,F00000013,731,1,0,1,0,0.0,0,2
4,F00000014,789,0,0,1,0,0.0,0,1
...,...,...,...,...,...,...,...,...,...
59305,F00155937,3337,1,0,2,20,34.0,12,1
59306,F00155940,3516,1,0,2,20,32.0,10,2
59307,F00155941,3702,1,0,2,10,,48,1
59308,F00155943,3702,1,0,2,10,28.0,17,2


In [57]:
df_test.isna().sum()

ID                            0
Estimated_Insects_Count       0
Crop_Type                     0
Soil_Type                     0
Pesticide_Use_Category        0
Number_Doses_Week             0
Number_Weeks_Used          5893
Number_Weeks_Quit             0
Season                        0
dtype: int64

In [58]:
df_test.describe()

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
count,59310.0,59310.0,59310.0,59310.0,59310.0,53417.0,59310.0,59310.0
mean,1397.014129,0.287068,0.455117,2.263227,25.85188,28.705094,9.528376,1.900877
std,849.425808,0.452397,0.497986,0.461733,15.481957,12.421211,9.855341,0.701934
min,150.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,731.0,0.0,0.0,2.0,15.0,20.0,0.0,1.0
50%,1212.0,0.0,0.0,2.0,20.0,28.0,7.0,2.0
75%,1898.0,1.0,1.0,3.0,40.0,38.0,16.0,2.0
max,4097.0,1.0,1.0,3.0,95.0,67.0,50.0,3.0


In [59]:
df_test1=df_test.fillna(df_test['Number_Weeks_Used'].median())
df_test1

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
0,F00000002,188,1,1,1,0,28.0,0,2
1,F00000007,410,1,1,1,0,0.0,0,2
2,F00000011,626,1,0,1,0,0.0,0,2
3,F00000013,731,1,0,1,0,0.0,0,2
4,F00000014,789,0,0,1,0,0.0,0,1
...,...,...,...,...,...,...,...,...,...
59305,F00155937,3337,1,0,2,20,34.0,12,1
59306,F00155940,3516,1,0,2,20,32.0,10,2
59307,F00155941,3702,1,0,2,10,28.0,48,1
59308,F00155943,3702,1,0,2,10,28.0,17,2


In [60]:
df_test1.isna().any()

ID                         False
Estimated_Insects_Count    False
Crop_Type                  False
Soil_Type                  False
Pesticide_Use_Category     False
Number_Doses_Week          False
Number_Weeks_Used          False
Number_Weeks_Quit          False
Season                     False
dtype: bool

In [61]:
df_test1=df_test1.drop(columns=['ID'])
df_test1

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
0,188,1,1,1,0,28.0,0,2
1,410,1,1,1,0,0.0,0,2
2,626,1,0,1,0,0.0,0,2
3,731,1,0,1,0,0.0,0,2
4,789,0,0,1,0,0.0,0,1
...,...,...,...,...,...,...,...,...
59305,3337,1,0,2,20,34.0,12,1
59306,3516,1,0,2,20,32.0,10,2
59307,3702,1,0,2,10,28.0,48,1
59308,3702,1,0,2,10,28.0,17,2


In [62]:
s1=MinMaxScaler()

In [63]:
test_X=pd.DataFrame(s1.fit_transform(df_test1))


In [64]:
test_X.columns=df_test1.columns
test_X

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
0,0.009628,1.0,1.0,0.0,0.000000,0.417910,0.00,0.5
1,0.065873,1.0,1.0,0.0,0.000000,0.000000,0.00,0.5
2,0.120598,1.0,0.0,0.0,0.000000,0.000000,0.00,0.5
3,0.147200,1.0,0.0,0.0,0.000000,0.000000,0.00,0.5
4,0.161895,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0
...,...,...,...,...,...,...,...,...
59305,0.807449,1.0,0.0,0.5,0.210526,0.507463,0.24,0.0
59306,0.852800,1.0,0.0,0.5,0.210526,0.477612,0.20,0.5
59307,0.899924,1.0,0.0,0.5,0.105263,0.417910,0.96,0.0
59308,0.899924,1.0,0.0,0.5,0.105263,0.417910,0.34,0.5


In [65]:
final_predictions=final_model.predict(test_X)
final_predictions

array([2, 0, 2, ..., 1, 1, 2], dtype=int64)

In [66]:
final_df=df_test[['ID']]
final_df

Unnamed: 0,ID
0,F00000002
1,F00000007
2,F00000011
3,F00000013
4,F00000014
...,...
59305,F00155937
59306,F00155940
59307,F00155941
59308,F00155943


In [67]:
final_df['Crop_Damage']=final_predictions
final_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,Crop_Damage
0,F00000002,2
1,F00000007,0
2,F00000011,2
3,F00000013,2
4,F00000014,1
...,...,...
59305,F00155937,1
59306,F00155940,1
59307,F00155941,1
59308,F00155943,1


In [68]:
final_df.to_csv('new_submission.csv',index=False)