# 0. Import Library

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns

import scipy.stats

![](https://onlinelibrary.wiley.com/cms/asset/ea1d3bd8-afd7-4914-b645-74d424b6690d/advs3654-fig-0002-m.jpg)

# 1. Input Data

In [None]:
# Load the fruit dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/fruit.csv')

# 2. Data Preprocessing

## 2.1 Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Data type of every column, to tell numeric columns from text ones.
df.dtypes

## 2.2 Data Cleaning

### - Missing Value

In [None]:
# Number of missing values per column; all zeros means nothing needs imputing.
df.isnull().sum()

### - Inconsistent Data

In [None]:
# Re-check the dtypes: a column expected to be numeric but reported as object
# would suggest malformed entries in the file.
df.dtypes

In [None]:
# Row count per distinct 'fruit' value; misspelled or differently-cased labels
# would show up here as unexpected extra groups.
df.groupby('fruit')['fruit'].count()

### - Outliers

In [None]:
# Compare min/max with the quartiles to spot suspiciously extreme values.
df.describe()

## 2.3 Data Transformation

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def labelEncode(data, columns):
    """Add an integer-encoded copy of each listed column to ``data``.

    For every name in ``columns`` a fresh ``LabelEncoder`` is fitted on that
    column's values cast to ``str``, and the resulting codes are stored in a
    new column called ``<name>_Encoded``. ``data`` is modified in place.

    Args:
        data: Table supporting ``data[name]`` lookup and assignment, whose
            columns provide ``astype`` (the notebook passes a DataFrame).
        columns: Iterable of column names to encode.

    Returns:
        A tuple ``(encoded_names, encoder)``. ``encoded_names`` lists the
        names of the columns that were added, in input order. ``encoder`` is
        the ``LabelEncoder`` fitted on the LAST column only (encoders of
        earlier columns are not kept), or ``None`` when ``columns`` is empty.
    """
    encoded_names = []
    # Pre-bind the encoder so an empty ``columns`` returns ([], None) instead
    # of failing with UnboundLocalError at the return statement.
    lb = None
    for name in columns:
        encoded_name = name + '_Encoded'
        lb = LabelEncoder()
        # Cast to str so mixed-type or missing values are encoded uniformly.
        data[encoded_name] = lb.fit_transform(data[name].astype(str))
        encoded_names.append(encoded_name)
    return encoded_names, lb

In [None]:
# Encode the categorical 'fruit' column. `lb` is the fitted encoder; it is kept
# so that predictions can later be mapped back to fruit names.
columns = ['fruit']
enc_columns, lb=labelEncode(df, columns)

In [None]:
# Five random rows, to compare the original labels with their encoded values.
df.sample(5)

In [None]:
# For each encoded column, list the original labels and the integer codes
# (both in order of first appearance) to show how they correspond.
encoded_suffix_len = len('_Encoded')
for feat in enc_columns:
    original_name = feat[:-encoded_suffix_len]
    original_values = df[original_name].unique()
    encoded_values = df[feat].unique()
    print(original_name)
    print(' ', original_values)
    print(' ', encoded_values)
    print('  -------')

## [Optional] Export เป็นไฟล์เมื่อทำ Data Cleaning เสร็จแล้ว

In [None]:
#df.to_csv('รหัส.csv',index=False)
#df.to_excel('รหัส.xlsx',index=False)

## 2.4 ตรวจสอบค่าสหสัมพันธ์ของตัวแปร x, y

In [None]:
# Pairwise correlation between the numeric columns.
# numeric_only=True (available since pandas 1.5) skips the text 'fruit' column;
# without it pandas >= 2.0 tries to convert the labels to float and raises
# ValueError.
df.corr(numeric_only=True)

In [None]:
# Heatmap of the correlation matrix with the coefficient written in each cell.
plt.rcParams['figure.figsize'] = (10, 7)
# numeric_only=True leaves out the text 'fruit' column, which would otherwise
# make corr() raise ValueError on pandas >= 2.0.
# The trailing semicolon suppresses the Axes repr in the notebook output.
sns.heatmap(df.corr(numeric_only=True), annot=True);

## 2.5 การกำหนด Feature / Target

In [None]:
# NumPy array holding every column of the table; only its shape is inspected.
# NOTE(review): DataMatrix is not referenced again in the cells visible here.
DataMatrix = df.values
DataMatrix.shape

In [None]:
# Feature matrix X (2-D, one column per feature) and target vector Y
# (the integer-encoded fruit label).
feature_cols = ['size']
target_col = 'fruit_Encoded'
X = df[feature_cols].values
Y = df[target_col].values

## 2.6 Data Preparation (แบ่งข้อมูลสำหรับ Training / Testing)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; random_state=0 makes the split reproducible.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.4, random_state=0)

# 3. Modelling: Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

## 3.1 Training Data

In [None]:
# Decision tree whose split quality is measured with entropy (information gain).
model = DecisionTreeClassifier(criterion='entropy')

In [None]:
# Fit the tree on the training split only.
model.fit(X_Train, Y_Train)

In [None]:
# Mean accuracy over the full dataset.
# NOTE(review): X/Y contain both the training rows and the held-out rows, so
# this is neither a pure training score nor a test score. Confirm whether
# (X_Train, Y_Train) or (X_Test, Y_Test) was intended.
model.score(X, Y)

## 3.2 Predict Data

In [None]:
# Predicted encoded class for every held-out sample.
y_predict = model.predict(X_Test)

In [None]:
# Map the integer predictions back to the original fruit names (one per sample).
y_predict_class = lb.inverse_transform(y_predict)

In [None]:
# Print every prediction as "<encoded class> - <fruit name>".
for res, label in zip(y_predict, y_predict_class):
    print(f'Result: {res} - {label}')

## 3.3 การแสดงกราฟ Decision Tree

In [None]:
import pydotplus
from sklearn.tree import export_graphviz
from IPython.display import Image

In [None]:
# Feature names shown in the tree plot; they must match the columns used to build X.
data_feature_names = ['size']

In [None]:
# Render the fitted tree with Graphviz.
# class_names must hold ONE name per class, in ascending order of the encoded
# label. Derive it from the classes the model was trained on; the per-sample
# predictions in y_predict_class would label the nodes with whatever the first
# few predictions happened to be.
tree_class_names = lb.inverse_transform(model.classes_).tolist()
dot_data = export_graphviz(model, out_file=None,
                           feature_names=data_feature_names,
                           class_names=tree_class_names,
                           filled=True,
                           rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png("dtree_fruit.png")
# Keep the image as the last expression: a notebook cell only displays the
# value of its final expression, so an Image(...) in the middle is discarded.
Image(graph.create_png())

In [None]:
from IPython.display import display
# Import PIL's Image module under an alias. A plain `from PIL import Image`
# rebinds the name `Image` that the tree-rendering cell imported from
# IPython.display, so re-running that cell afterwards would fail.
from PIL import Image as PILImage

# Show the PNG written by the tree-rendering cell.
path = "dtree_fruit.png"
display(PILImage.open(path))

# 4. Model Evaluation

![](https://scontent.fbkk29-6.fna.fbcdn.net/v/t1.6435-9/99275955_3331806336852736_3844604936671199232_n.png?_nc_cat=109&ccb=1-7&_nc_sid=7f8c78&_nc_eui2=AeEPpdFqjY0zIBpWW6O52JXQCwjvKa4EPMcLCO8prgQ8x2YhylNrAmxFW2vPQB1sm2JXvyP_u9jYqj85NHKBdyh5&_nc_ohc=-XjmPNGvhvUAX_I9Z5f&_nc_ht=scontent.fbkk29-6.fna&oh=00_AfBY6aE1We6FnZidwobWCxZjP-uEUowhlA39STrUEJym2g&oe=6502682A)

### 4.1 Split Test

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix on the held-out split: rows are the true classes, columns
# the predicted classes.
cnf_matrix = metrics.confusion_matrix(Y_Test, y_predict)
cnf_matrix

In [None]:
# Plot the confusion matrix with readable class names on both axes.
plt.rcParams['figure.figsize'] = (10, 7)
# Take the labels from the fitted encoder instead of hard-coding them:
# lb.classes_ is sorted in the same order as the encoded integers, so the tick
# labels stay correct if the dataset's fruit names change.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cnf_matrix,
                                            display_labels=lb.classes_)
cm_display.plot()
plt.show()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out split.
# The names come from the fitted encoder (sorted in encoded-label order)
# rather than a hard-coded list, so they cannot drift from the data.
target_names = list(lb.classes_)
print(classification_report(Y_Test, y_predict, target_names=target_names))

### 4.2 Cross Validation Test

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 2-fold cross-validation over the full dataset; returns one score per fold.
cvs = cross_val_score(model, X, Y, cv=2)

In [None]:
# Show the score obtained on each fold.
print(f'Cross Validation Score {cvs}')

In [None]:
# Average score across the folds, rounded to four decimals.
# The previous spec `{:4f}` set a minimum field WIDTH of 4 and left the
# precision at the default six decimals; `.4f` is the four-decimal form.
print('Mean: {:.4f}'.format(cvs.mean()))

# [Optional] Save Machine Learning Models

In [None]:
#import pickle
#filename = 'pickle_รหัสนศ.sav'
#pickle.dump(model, open(filename, 'wb'))