In [1]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt

In [2]:
# Load the iris measurements from the UCI repository. `names=` supplies the
# column labels, so the first line of the file is read as data.
iris_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv(iris_url, names=iris_columns)

In [3]:
# Get info of the data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()
print('')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
class           150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB
None



In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
print(summary_stats)
print('')


       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000



In [5]:
# Preview the first ten rows of the dataset
first_rows = df.head(10)
print(first_rows)


   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
5           5.4          3.9           1.7          0.4  Iris-setosa
6           4.6          3.4           1.4          0.3  Iris-setosa
7           5.0          3.4           1.5          0.2  Iris-setosa
8           4.4          2.9           1.4          0.2  Iris-setosa
9           4.9          3.1           1.5          0.1  Iris-setosa


In [6]:
# Keep only the rows whose sepal_length is greater than 5.0.
# .loc selects a group of rows (and optionally columns); here it is given a
# boolean mask, so every column of the matching rows is returned.
long_sepal_mask = df['sepal_length'] > 5.0
df2 = df.loc[long_sepal_mask]
print(df2.head(10))


    sepal_length  sepal_width  petal_length  petal_width        class
0            5.1          3.5           1.4          0.2  Iris-setosa
5            5.4          3.9           1.7          0.4  Iris-setosa
10           5.4          3.7           1.5          0.2  Iris-setosa
14           5.8          4.0           1.2          0.2  Iris-setosa
15           5.7          4.4           1.5          0.4  Iris-setosa
16           5.4          3.9           1.3          0.4  Iris-setosa
17           5.1          3.5           1.4          0.3  Iris-setosa
18           5.7          3.8           1.7          0.3  Iris-setosa
19           5.1          3.8           1.5          0.3  Iris-setosa
20           5.4          3.4           1.7          0.2  Iris-setosa


In [7]:
# Matplotlib marker codes, looked up by position when plotting one series per class.
marker_shapes = [
    '.',  # point
    '^',  # triangle up
    '*',  # star
]

In [8]:
# Scatter sepal width against sepal length, with one marker style and legend
# entry per class. A single Axes is created up front and shared by every
# series, so the first class no longer needs a special-case branch.
fig, ax = plt.subplots(figsize=(10, 7))
for i, species in enumerate(df['class'].unique()):
  species_rows = df[df['class'] == species]
  species_rows.plot.scatter(x='sepal_length', y='sepal_width', marker=marker_shapes[i],
                            s=100, label=species, ax=ax)
# The title belongs to the shared Axes, so it is set once rather than per series.
ax.set_title("Sepal Width vs Length by Species")
# plt.show()


In [9]:
# Histogram of petal length. The returned Axes is bound to a name so the cell
# does not echo the "<matplotlib.axes... at 0x...>" repr as its output.
petal_hist_ax = df['petal_length'].plot.hist(title = 'Histogram of Petal length')
# plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x1b5ac746f28>

In [10]:
# df.plot.box(title = 'Boxplot length and width of sepal and length and width of petal')

## Encoding of categorical variables

### One common way to convert categorical variables into numerical variables is one-hot encoding, which pandas implements in the `get_dummies()` function.

In [15]:
# One-hot encoding demo: a single categorical 'Day' column is expanded into
# one indicator column per distinct value (Day_Monday, Day_Tuesday, ...).
day_names = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
df2 = pd.DataFrame({'Day': day_names})

day_dummies = pd.get_dummies(df2)
print(day_dummies)
print('')
                           

   Day_Friday  Day_Monday  Day_Saturday  Day_Sunday  Day_Thursday  \
0           0           1             0           0             0   
1           0           0             0           0             0   
2           0           0             0           0             0   
3           0           0             0           0             1   
4           1           0             0           0             0   
5           0           0             1           0             0   
6           0           0             0           1             0   

   Day_Tuesday  Day_Wednesday  
0            0              0  
1            1              0  
2            0              1  
3            0              0  
4            0              0  
5            0              0  
6            0              0  



In [16]:
# Imputing missing values
# Read the iris data again to get a fresh DataFrame for the missing-value
# cells that follow.
iris_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv(iris_url, names=iris_columns)

In [17]:
# Blank out sepal_length in 10 distinct, randomly chosen rows to simulate
# missing data. A dedicated, seeded RandomState picks the same rows on every
# run and leaves NumPy's global random state untouched.
rng = np.random.RandomState(42)
random_index = rng.choice(df.index, replace = False, size = 10)
# np.nan is the explicit missing-value marker for a float column.
df.loc[random_index, 'sepal_length'] = np.nan

In [18]:
# Report, for each column, whether it holds at least one missing value
has_missing = df.isnull().any()
print(has_missing)

sepal_length     True
sepal_width     False
petal_length    False
petal_width     False
class           False
dtype: bool


In [19]:
# Remove every row that contains a missing value, reporting the row count
# before and after the drop.
rows_before = len(df)
print("Number of rows before deleting: %d" % rows_before)
df2 = df.dropna()
rows_after = len(df2)
print("Number of rows after deleting: %d" % rows_after)
print('')


Number of rows before deleting: 150
Number of rows after deleting: 140

