In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the read-only input directory and print the full path of
# every file found, so the available data files are visible in the output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data set details
The dataset consists of approximately 1,584 images of leaf specimens (16 samples each of 99 species) which have been converted to binary black leaves against white backgrounds. Three sets of features are also provided per image: a shape contiguous descriptor, an interior texture histogram, and a fine-scale margin histogram. For each feature, a 64-attribute vector is given per leaf sample.

Note that of the original 100 species, we have eliminated one on account of incomplete associated data in the original dataset.

# Data fields

id - an anonymous id unique to an image

margin_1, margin_2, margin_3, ..., margin_64 - each of the 64 attribute vectors for the margin feature

shape_1, shape_2, shape_3, ..., shape_64 - each of the 64 attribute vectors for the shape feature

texture_1, texture_2, texture_3, ..., texture_64 - each of the 64 attribute vectors for the texture feature

In [None]:
# Read the zipped competition CSVs straight from the input directory.
# The test frame keeps `id` as an ordinary column, whereas the training frame
# uses `id` as its index.
test_data = pd.read_csv('/kaggle/input/leaf-classification/test.csv.zip')
train_data = pd.read_csv(
    '/kaggle/input/leaf-classification/train.csv.zip',
    index_col='id',
)

In [None]:
# Preview the first rows of the training frame (indexed by `id`).
train_data.head()

In [None]:
# Preview the first rows of the test frame as loaded; `id` is still a regular column here.
test_data.head()

In [None]:
# Keep the image ids for the submission file and remove them from the test
# feature frame. The guard makes the cell safe to re-run: once `id` has been
# dropped, a second execution is a no-op instead of raising on the missing
# column, and the previously saved `test_id` is left untouched.
if 'id' in test_data.columns:
    test_id = test_data['id']
    test_data = test_data.drop(columns=['id'])

In [None]:
# Preview the test frame again after the preceding cell removed the `id` column.
test_data.head()

# Visualizing leaf images

In [None]:
import zipfile

# Unpack every member of the images archive into the current working
# directory; the archive handle is closed when the `with` block exits.
with zipfile.ZipFile('/kaggle/input/leaf-classification/images.zip') as leaf_archive:
    leaf_archive.extractall(path=os.getcwd())

In [None]:
import matplotlib.pyplot as plt
import cv2 as cv
from keras.preprocessing.image import load_img

# Show a random sample of leaf images in a 5x5 grid.
# The directory is listed once (not once per tile) and the same path is used
# for listing and loading, so the cell does not depend on the working
# directory being /kaggle/working.
image_dir = 'images'  # created in the working directory by the extraction cell
image_names = sorted(os.listdir(image_dir))  # sorted so the seeded draw is reproducible

# Seeded draw without replacement: the same distinct images on every run.
n_show = min(25, len(image_names))
rng = np.random.default_rng(0)
sampled_names = rng.choice(image_names, size=n_show, replace=False)

fig, axes = plt.subplots(5, 5, figsize=(20, 15))
tiles = axes.ravel()
for ax, image_name in zip(tiles, sampled_names):
    ax.imshow(load_img(os.path.join(image_dir, image_name)))
    ax.set_title(image_name)  # file name identifies which specimen is shown
    ax.axis('off')
# Blank any tiles left over when fewer than 25 images are available.
for ax in tiles[n_show:]:
    ax.axis('off')
plt.show()

In [None]:
# (rows, columns) of the training frame; the columns include the `species` label.
train_data.shape

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Number of training columns that contain at least one missing value.
train_data.isnull().any().sum()

In [None]:
# Number of test columns that contain at least one missing value.
test_data.isnull().any().sum()

In [None]:
# Summary of the training frame: index, column dtypes and memory usage.
train_data.info()

In [None]:
# Summary of the test frame: index, column dtypes and memory usage.
test_data.info()

In [None]:
# Number of distinct species (target classes) present in the training data.
train_data['species'].nunique()

# `species` is the categorical column and our target

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each species name to an integer class id.
encoder = LabelEncoder()
labels = encoder.fit_transform(train_data.species)
# `fit` returns the encoder itself, so `le` is simply another name for `encoder`.
le = encoder
# Class names, ordered so that position i is the name encoded as integer i.
classes = list(encoder.classes_)

In [None]:
# Species names in encoder order (list position == encoded integer label).
classes

In [None]:
# Integer-encoded species label for each training row.
labels

In [None]:
# Feature matrix: every training column except the target, as a NumPy array.
feature_frame = train_data.drop(columns=['species'])
X = feature_frame.values
# Target vector: the integer-encoded species labels.
Y = labels

In [None]:
# Inspect the feature matrix built from the training frame.
X

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled data for validation. The split is shuffled with
# a fixed seed and stratified on the labels.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1,
    shuffle=True,
    stratify=Y,
)
x_train, x_test, y_train, y_test = split_parts

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a single decision tree using the entropy split criterion.
# `random_state` is fixed because the tree builder permutes features at each
# split, so without a seed the fitted tree (and the scores below) can change
# from run to run.
classifier1 = DecisionTreeClassifier(criterion='entropy', random_state=1)
classifier1.fit(x_train, y_train)

In [None]:
# Mean accuracy of the decision tree on the data it was trained on.
classifier1.score(x_train,y_train)

In [None]:
# Mean accuracy of the decision tree on the held-out split.
classifier1.score(x_test,y_test)

We can clearly see that the decision tree is overfitting: it scores much better on the training split than on the held-out split.

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Ensemble of 40 trees trained in parallel on 4 workers.
# `random_state` is fixed so the bootstrap samples and feature sub-sampling,
# and therefore the validation score and the final submission, are
# reproducible across "Restart & Run All".
classifier2 = RandomForestClassifier(n_estimators=40, n_jobs=4, random_state=1)
classifier2.fit(x_train, y_train)

In [None]:
# Mean accuracy of the random forest on the held-out split.
classifier2.score(x_test,y_test)

In [None]:
# Per-class probability estimates from the random forest for the held-out split.
y_pred2=classifier2.predict_proba(x_test)


In [None]:
# Inspect the held-out probability estimates.
y_pred2

In [None]:
# Load the competition's sample submission (indexed by `id`) and preview it;
# presumably this shows the layout the submission file is expected to follow.
sample_data = pd.read_csv('/kaggle/input/leaf-classification/sample_submission.csv.zip',index_col='id')
sample_data.head()

In [None]:
# Final class-probability predictions for the unlabeled test set.
# The forest was fitted on a plain NumPy array, so an array is passed here too:
# handing it a DataFrame triggers scikit-learn's feature-name warning.
# Selecting the columns by the training feature names keeps them in the order
# the model was trained on, and fails loudly if any feature is missing.
feature_columns = train_data.columns.drop('species')
final_pred = classifier2.predict_proba(test_data[feature_columns].values)

In [None]:
# Inspect the predicted class probabilities for the test set.
final_pred

In [None]:
# Assemble the submission: one probability column per species, preceded by `id`.
submission = pd.DataFrame(final_pred, columns=classes)
# Insert the ids positionally (as an array) so the result does not depend on
# the index labels of `test_id` lining up with the fresh RangeIndex above.
submission.insert(0, 'id', np.asarray(test_id))
# Preview what will actually be written. The previous `submission.reset_index()`
# was never assigned, so it only displayed the whole frame with an extra
# `index` column that is not in the saved file.
submission.head()

In [None]:
# Write the submission to `submission.csv` in the current working directory,
# omitting the DataFrame index so `id` is the first column of the file.
submission.to_csv('submission.csv', index = False)