In [71]:
# Import necessary modules

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

## Glass Prediction

This task aims to predict the types of glass according to the amount of various types of elements it is composed of.

## Context

This is a Glass Identification Data Set from UCI. It contains 10 attributes including id. The response is glass type(discrete 7 values)

## Content
Attribute Information:

Id number: 1 to 214 (removed from CSV file)

RI: refractive index

Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)

Mg: Magnesium

Al: Aluminum

Si: Silicon

K: Potassium

Ca: Calcium

Ba: Barium

Fe: Iron

## Type of glass: (class attribute)

-- 1 buildingwindowsfloatprocessed -- 2 buildingwindowsnonfloatprocessed -- 3 vehiclewindowsfloatprocessed

-- 4 vehiclewindowsnonfloatprocessed (none in this database)

-- 5 containers

-- 6 tableware

-- 7 headlamps

#### First we take a look at the data and see what we have to work with. At first glance it is very clean.

In [6]:
data = pd.read_csv('glass.csv')

data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


#### Looking at the type of the data in each column

In [17]:
for columns in data:
    print(columns,':', type(data[columns].values[0]))

RI : <class 'numpy.float64'>
Na : <class 'numpy.float64'>
Mg : <class 'numpy.float64'>
Al : <class 'numpy.float64'>
Si : <class 'numpy.float64'>
K : <class 'numpy.float64'>
Ca : <class 'numpy.float64'>
Ba : <class 'numpy.float64'>
Fe : <class 'numpy.float64'>
Type : <class 'numpy.int64'>


#### Looking good, so lets see whether there're any holes in the data

In [18]:
data.isnull().any()

RI      False
Na      False
Mg      False
Al      False
Si      False
K       False
Ca      False
Ba      False
Fe      False
Type    False
dtype: bool

#### Brief summary of the data

In [19]:
data.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


#### As there are no missing values, as well as no irrelevant features, we can proceed with the task immediately.

In [48]:
x = data.iloc[:,:-1]; y = data.iloc[:,9]

In [49]:
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [79]:
for each in ['gini','entropy']:
    for x in range(1,12):

        option = [('scale',StandardScaler()),
                  ('polynomial',PolynomialFeatures(include_bias=False)),
                  ('model',DecisionTreeClassifier(criterion=each,max_depth=x))]

        pipe = Pipeline(option)
        pipe.fit(x_train, y_train)

        print(each, x, pipe.score(x_test,y_test))

gini 1 0.42592592592592593
gini 2 0.5185185185185185
gini 3 0.6111111111111112
gini 4 0.5740740740740741
gini 5 0.5740740740740741
gini 6 0.5555555555555556
gini 7 0.5925925925925926
gini 8 0.5370370370370371
gini 9 0.5925925925925926
gini 10 0.5925925925925926
gini 11 0.5925925925925926
entropy 1 0.37037037037037035
entropy 2 0.48148148148148145
entropy 3 0.5
entropy 4 0.5370370370370371
entropy 5 0.48148148148148145
entropy 6 0.5555555555555556
entropy 7 0.5
entropy 8 0.5
entropy 9 0.5370370370370371
entropy 10 0.5
entropy 11 0.5370370370370371


In [82]:
option = [('scale',StandardScaler()),('polynomial',PolynomialFeatures(include_bias=False)),('model',DecisionTreeClassifier(criterion='gini',max_depth=3))]

pipe = Pipeline(option)
pipe.fit(x_train, y_train)

pipe.score(x_train,y_train)

0.71875