In [None]:
"""
Iris data set consists of 150 samples
where there are 3 classes: "Iris-Setosa",
"Iris-Versicolor", and "Iris-Virginica"

4 features/attributes contibute to classifying:
sepal-length, sepal-width, petal-length, petal-width
"""

In [49]:
#load dataset from sklearn library
#load_iris has data and class label for each sample
from sklearn.datasets import load_iris
#import in-built splitting function that is in sklearn
from sklearn.model_selection import train_test_split
#import numpy
import numpy as np
#import pandas for data analysis
%pylab
import pandas as pd
#pyplot for data visualization
import matplotlib.pyplot as plt

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [3]:
#extract data and store in variable
#'data' is a numpy array of shapes (150,4) and has 150 samples. Each class is 50 sample
data = load_iris().data

In [4]:
data.shape

(150, 4)

In [5]:
#extract class labels
labels = load_iris().target

In [6]:
#see labels
labels.shape

(150,)

In [8]:
#need to combine data and class labels
#data is a 2-d array, we need to reshape labels to be a 2-d array
labels = np.reshape(labels, (150,1))

In [9]:
#concatenate using axis = -1 which will combine based on second dimension
data = np.concatenate([data,labels],axis=-1)

In [10]:
data.shape

(150, 5)

In [13]:
#features
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'species']
#create dataframe (excel-like table)
dataset = pd.DataFrame(data,columns=names)

In [14]:
dataset.head(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [19]:
#need to change the species feature from numerical to flower names
dataset['species'].replace(0, 'Iris-setosa', inplace=True)
dataset['species'].replace(1, 'Iris-versicolor', inplace=True)
dataset['species'].replace(2, 'Iris-virginica', inplace=True)

In [20]:
dataset.head(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [29]:
#visualizing sepal data
#set up visualizing our data (scatterplot)
plt.figure(4,figsize(10,8))

plt.scatter(data[:50,0], data[:50,1], c='r', label = 'Iris-setosa')
plt.scatter(data[50:100,0], data[50:100,1], c='g', label = 'Iris-versicolor')
plt.scatter(data[100:,0], data[100:,1], c='b', label = 'Iris-virginica')

plt.xlabel('Sepal length',fontsize=20)
plt.ylabel('Sepal width',fontsize=20)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.title('Sepal length vs Sepal width',fontsize=20)

plt.legend(prop={'size':18})

#show plot
plt.show()

In [31]:
#visualizing petal data
#plot the graph for petal length and width
plt.figure(4,figsize=(8,8))

plt.scatter(data[:50,2], data[:50,3], c='r', label = 'Iris-setosa')
plt.scatter(data[50:100,2], data[50:100,3], c='g', label = 'Iris-versicolor')
plt.scatter(data[100:,2], data[100:,3], c='b', label = 'Iris-virginica')

plt.xlabel('Petal length',fontsize=15)
plt.ylabel('Petal width',fontsize=15)

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.title('Petal length vs Petal width',fontsize=15)

plt.legend(prop={'size':20})

plt.show()

In [33]:
#show the correlation between petal length and width from previous plot showcased
dataset.iloc[:,2:].corr()

#analyze correlation between all 3 species individually
dataset.iloc[:50,:].corr() #setosa

#visualize the feature distributing through plotting
fig = plt.figure(figsize=(8,8))
ax = fig.gca()
dataset.hist(ax=ax)
plt.show()

In [34]:
#analyze if 4 attributes are on same scale
dataset.describe()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [35]:
"""
Focus on preprocessing data. Do it doing two ways:
normalizing data and splitting the data into training and
training sets

"""

'\nFocus on preprocessing data. Do it doing two ways:\nnormalizing data and splitting the data into training and\ntraining sets\n'

In [50]:
#Splitting the data
#load the data
train_data,test_data,test_label = train_test_split(dataset.iloc[:,:3], dataset.iloc[:,4], test_size=0.2, randomstate=42)

TypeError: Invalid parameters passed: {'randomstate': 42}