In [1]:
# importing Modules
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# importing training file
df = pd.read_excel('train.xlsx')

In [3]:
df.head()

Unnamed: 0,ID,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season,Crop_status
0,1,188,Feed,clay,1,0,0.0,0,1,0
1,2,209,Feed,clay,1,0,0.0,0,2,1
2,3,257,Feed,clay,1,0,0.0,0,2,1
3,4,257,Feed,silt,1,0,0.0,0,2,1
4,5,342,Feed,clay,1,0,0.0,0,2,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          80000 non-null  int64  
 1   Insects                     80000 non-null  int64  
 2   Crop                        80000 non-null  object 
 3   Soil                        80000 non-null  object 
 4   Category_of_Toxicant        80000 non-null  int64  
 5   Does_count                  80000 non-null  int64  
 6   Number_of_Weeks_Used        71945 non-null  float64
 7   Number_Weeks_does_not used  80000 non-null  int64  
 8   Season                      80000 non-null  int64  
 9   Crop_status                 80000 non-null  int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 6.1+ MB


In [5]:
#treating Missing Values
df['Number_of_Weeks_Used'].fillna(df.Number_of_Weeks_Used.median(), inplace=True)

In [6]:
df.isnull().sum()

ID                            0
Insects                       0
Crop                          0
Soil                          0
Category_of_Toxicant          0
Does_count                    0
Number_of_Weeks_Used          0
Number_Weeks_does_not used    0
Season                        0
Crop_status                   0
dtype: int64

In [7]:
#creating dummy variables
df2 = pd.get_dummies(df, columns=['Crop', 'Soil'])

In [8]:
df2.head()

Unnamed: 0,ID,Insects,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season,Crop_status,Crop_Feed,Crop_Food,Soil_clay,Soil_silt
0,1,188,1,0,0.0,0,1,0,1,0,1,0
1,2,209,1,0,0.0,0,2,1,1,0,1,0
2,3,257,1,0,0.0,0,2,1,1,0,1,0
3,4,257,1,0,0.0,0,2,1,1,0,0,1
4,5,342,1,0,0.0,0,2,1,1,0,1,0


In [9]:
ext = df2[df2.Crop_status == 2]

In [10]:
#updampling data
df3 = pd.concat([df2,ext,ext,ext,ext])

In [11]:
df3.Crop_status.value_counts()

0    66716
1    11183
2    10505
Name: Crop_status, dtype: int64

In [12]:
#dounsampling step
from sklearn.utils import resample

# Separating majority and minority classes
df_major = df3[df3.Crop_status==1]
df_minor= df3[df3.Crop_status==2]
 
# Downsample major class
df_majority_downsampled = resample(df_major,replace=False,n_samples=len(df_minor),random_state=42)
 
# Combining minor class with downsampled major class
dff = pd.concat([df_majority_downsampled, df_minor])
 
# Display new class counts
dff.Crop_status.value_counts(normalize = True)
dff.reset_index(inplace = True)
dff.drop(['index'], axis = 1, inplace = True)

#dounsampling step
from sklearn.utils import resample

# Separating majority and minority classes
df_major = df3[df3.Crop_status==0]
df_minor= dff[dff.Crop_status==2]
 
# Downsample major class
df_majority_downsampled = resample(df_major,replace=False,n_samples=len(df_minor),random_state=42)
 
# Combining minor class with downsampled major class
df4 = pd.concat([df_majority_downsampled,dff])
 
# Display new class counts
df4.Crop_status.value_counts(normalize = True)
df4.reset_index(inplace = True)
df4.drop(['index'], axis = 1, inplace = True)

In [13]:
df4.head()

Unnamed: 0,ID,Insects,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season,Crop_status,Crop_Feed,Crop_Food,Soil_clay,Soil_silt
0,58234,851,2,10,24.0,12,2,0,1,0,1,0
1,39048,851,2,20,33.0,4,1,0,0,1,1,0
2,28915,1296,2,80,45.0,3,2,0,0,1,1,0
3,43924,1297,3,80,43.0,0,2,0,1,0,0,1
4,41224,448,3,20,26.0,0,1,0,1,0,1,0


In [14]:
df4.shape

(31515, 12)

In [15]:
df4.Crop_status.value_counts(normalize = True)

2    0.333333
1    0.333333
0    0.333333
Name: Crop_status, dtype: float64

In [16]:
df4.Crop_status.value_counts()

2    10505
1    10505
0    10505
Name: Crop_status, dtype: int64

In [17]:
X= df4.drop(['ID','Crop_status'],axis = 1)

In [18]:
y = df4['Crop_status']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0) 

In [20]:
st= StandardScaler()
X_train = st.fit_transform(X_train)
X_test  = st.transform(X_test)

In [21]:
from sklearn.tree import DecisionTreeClassifier 
dtree_model = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train) 
dtree_predictions = dtree_model.predict(X_test) 

In [22]:
accuracy = dtree_model.score(X_test, y_test)
accuracy

0.5076786394212464

In [23]:
cm = confusion_matrix(y_test, dtree_predictions)
cm

array([[1138,  907,  522],
       [ 383, 1009, 1258],
       [ 242,  567, 1853]], dtype=int64)

In [24]:
# training a KNN classifier 
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train) 
accuracy = knn.score(X_test, y_test) 
accuracy

0.5721538266277446

In [25]:
forest = RandomForestClassifier(max_depth = 2, random_state = 42).fit(X_train, y_train) 
forest_predictions = forest.predict(X_test) 
accuracy = dtree_model.score(X_test, y_test)
accuracy

0.5076786394212464

In [26]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42).fit(X_train,y_train)
dt.score(X_test,y_test)*100

69.53928163472521

In [27]:
from sklearn.ensemble import RandomForestClassifier 
forest = RandomForestClassifier(random_state = 42).fit(X_train, y_train)
forest.score(X_test,y_test)*100

73.85455006980581

In [28]:
#since the decision tree and random forest are giving best scores with its default parameter i decided to go with both the models
#so i used voting classifier

In [29]:
vot = VotingClassifier(estimators=[('dt',dt),('Foresst',forest)],voting='hard').fit(X_train, y_train)
vot.score(X_test,y_test)*100

74.59068409696663

In [30]:
# importing test files

In [31]:
test_df=pd.read_excel('test.xlsx')
test_df.head()

Unnamed: 0,ID,Insects,Crop,Soil,Category_of_Toxicant,Does_count,Number_of_Weeks_Used,Number_Weeks_does_not used,Season
0,1,188,Feed,silt,1,0,,0,2
1,2,410,Feed,silt,1,0,0.0,0,2
2,3,626,Feed,clay,1,0,0.0,0,2
3,4,731,Feed,clay,1,0,0.0,0,2
4,5,789,Food,clay,1,0,0.0,0,1


In [32]:
test_df['Number_of_Weeks_Used'].fillna(test_df.Number_of_Weeks_Used.median(), inplace=True)
test_df2 = pd.get_dummies(test_df, columns=['Crop', 'Soil'])

In [33]:
test_X = test_df2.drop(['ID'],axis = 1)

In [34]:
st= StandardScaler()
X = st.fit_transform(X)
test_X  = st.transform(test_X)

In [35]:
vot = VotingClassifier(estimators=[('dt',dt),('Foresst',forest)],voting='hard').fit(X, y)
vot_pred = vot.predict(test_X)
pred_vot = pd.Series(vot_pred)
pred_vot.value_counts()

0    24720
1     9262
2     1018
dtype: int64

In [None]:
pred_vot.to_csv('Attempt4(voth).csv')