In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found anywhere under /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

## Data set details
The dataset consists of approximately 1,584 images of leaf specimens (16 samples each of 99 species) which have been converted to binary black leaves against white backgrounds. Three sets of features are also provided per image: a shape contiguous descriptor, an interior texture histogram, and a fine-scale margin histogram. For each feature, a 64-attribute vector is given per leaf sample.

Note that of the original 100 species, we have eliminated one on account of incomplete associated data in the original dataset.

# Data fields

id - an anonymous id unique to an image

`margin_1`, `margin_2`, `margin_3`, ..., `margin_64` - the 64 attributes of the margin feature vector

`shape_1`, `shape_2`, `shape_3`, ..., `shape_64` - the 64 attributes of the shape feature vector

`texture_1`, `texture_2`, `texture_3`, ..., `texture_64` - the 64 attributes of the texture feature vector

In [None]:
# Read the zipped competition CSVs.
# The training frame is indexed by `id`; the test frame keeps `id` as an
# ordinary column (it is split off in a later cell).
train_data = pd.read_csv(
    '/kaggle/input/leaf-classification/train.csv.zip',
    index_col='id',
)
test_data = pd.read_csv(
    '/kaggle/input/leaf-classification/test.csv.zip',
)

In [None]:
# Preview the first five training rows.
train_data.head(n=5)

In [None]:
# Preview the first five test rows (the `id` column is still present here).
test_data.head(n=5)

In [None]:
# Split the image ids off the test features: the ids are needed later for the
# submission file but must not be passed to the model as a feature.
# The guard keeps this cell safe to re-run; without it a second execution
# fails because `id` has already been dropped from `test_data`.
if 'id' in test_data.columns:
    test_id = test_data['id']
    test_data = test_data.drop(columns=['id'])

In [None]:
# Preview the test features again now that `id` has been removed.
test_data.head(n=5)

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Number of training columns that contain at least one missing value.
train_data.isnull().any(axis=0).sum()

In [None]:
# Number of test columns that contain at least one missing value.
test_data.isnull().any(axis=0).sum()

In [None]:
# Print pandas' summary of the training frame (index, column dtypes, memory usage).
train_data.info()

In [None]:
# Print pandas' summary of the test frame (index, column dtypes, memory usage).
test_data.info()

In [None]:
# Count the distinct values in the `species` column, i.e. the number of classes.
train_data['species'].nunique()

`species` is the categorical column in the training data and serves as our target column.

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the species names as integer labels.
# `le` is kept so the integer codes can be mapped back to names later, and
# `classes` holds the names in the encoder's own order.
le = LabelEncoder()
le.fit(train_data['species'])
labels = le.transform(train_data['species'])
classes = list(le.classes_)

In [None]:
# Preview the first few class names instead of dumping the whole list;
# the total count is shown alongside so nothing is hidden.
len(classes), classes[:10]

In [None]:
# Preview the first few encoded labels instead of printing the whole vector.
labels[:10]

In [None]:
# Target vector: the integer-encoded species labels.
Y = labels
# Feature matrix: every training column except the target, as a NumPy array.
feature_frame = train_data.drop(columns=['species'])
X = feature_frame.values

In [None]:
# Display the feature matrix built from the training frame.
X

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the labelled rows for validation.
# The split is shuffled, stratified on the encoded species labels, and seeded
# so the same partition is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    stratify=Y,
    shuffle=True,
    random_state=1,
)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 40 trees, trained using 4 parallel jobs.
# random_state is fixed (same seed as the train/validation split) so the
# validation score and the submission are reproducible on Restart & Run All;
# an unseeded forest produces different results on every run.
classifier2 = RandomForestClassifier(n_estimators=40, n_jobs=4, random_state=1)
classifier2.fit(x_train, y_train)

In [None]:
# Mean accuracy of the fitted forest on the held-out validation split.
classifier2.score(x_test,y_test)

In [None]:
# Class-probability estimates for the held-out validation samples
# (one row per sample; only inspected below, not used for the submission).
y_pred2=classifier2.predict_proba(x_test)


In [None]:
# Display the validation probability matrix.
y_pred2

In [None]:
# Load the provided sample submission (indexed by `id`) and preview it
# for reference when building our own submission file.
sample_data = pd.read_csv(
    '/kaggle/input/leaf-classification/sample_submission.csv.zip',
    index_col='id',
)
sample_data.head(n=5)

In [None]:
# Final class-probability predictions for the unlabeled test set.
# The forest was fitted on a plain NumPy array (X was built with `.values`), so
# the test features are passed as an array too; handing over the DataFrame makes
# scikit-learn warn that X has feature names the estimator was fitted without.
# NOTE(review): this relies on the test columns being in the same order as the
# training feature columns - confirm against the CSV headers.
final_pred = classifier2.predict_proba(test_data.values)

In [None]:
# Display the test-set probability matrix.
final_pred

In [None]:
# Build the submission frame: one probability column per species, labelled in
# the label encoder's class order, with the image id as the first column.
submission = pd.DataFrame(final_pred, columns=classes)
submission.insert(0, 'id', test_id)
# Preview only. The previous `submission.reset_index()` call was not in-place,
# so its result was discarded, and its display showed an extra `index` column
# that is not part of the file written to disk.
submission.head()

In [None]:
# Write the submission to the working directory; index=False keeps the
# DataFrame's row index out of the file so `id` is the first column.
submission.to_csv('submission.csv', index = False)