In [2]:
# Common Imports
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
%matplotlib inline
# JUST TO MAKE SURE SOME WARNINGS ARE IGNORED 
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder          # encoding categorical variables
from sklearn.preprocessing import StandardScaler        # scaling numeric variables

### Create Dataframe:

In [50]:
# Student records, stored column-wise (one list per column).
student_info = {
    'id': [1, 2, 3, 4, 5],
    'name': ['John', 'Mary', 'Swapnil', 'Cody', 'Saeed'],
    'grade': [9.3, 4.2, 3.2, 10, 10],
    'status': ['enrolled', 'unenrolled', 'enrolled', 'enrolled', 'enrolled'],
}
# Build the frame and promote 'id' to the row index in a single chain.
df_student = pd.DataFrame(student_info).set_index('id')
df_student

Unnamed: 0_level_0,name,grade,status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,John,9.3,enrolled
2,Mary,4.2,unenrolled
3,Swapnil,3.2,enrolled
4,Cody,10.0,enrolled
5,Saeed,10.0,enrolled


### Import Data:

In [8]:
# Load the comma-separated iris file into a DataFrame.
exampleData = pd.read_csv("./data/IRIS.csv", sep=",")

# add columns
# ordinal encoding method 1: LabelEncoder, fitted and applied in a single call
le = LabelEncoder()
exampleData["speciesID1"] = le.fit_transform(exampleData["species"])

# ordinal encoding method 2: factorize returns (codes, uniques); keep only the codes
exampleData["speciesID2"] = exampleData["species"].factorize()[0]

# ordinal encoding method 3 (general method to reformat column)
ordering = ["Iris-virginica", "Iris-versicolor", "Iris-setosa"] # define order
# each species is replaced by its position in `ordering`, then cast to int
exampleData["speciesID3"] = exampleData["species"].map(ordering.index).astype(int)

# ordinal encoding method 4 (replaces column)
# mapping = {'Iris-virginica': 1, 'Iris-versicolor': 2, 'Iris-setosa': 3} # define order
# exampleData["species"] = exampleData["species"].map(mapping)

# categorical (dummy) encoding - k categories give k-1 indicator columns;
# the first category is dropped because it can be inferred from the others
encoding = pd.get_dummies(exampleData["species"], drop_first=True)
# copy the two indicator columns onto the main frame
for dummy_column in ("Iris-versicolor", "Iris-virginica"):
    exampleData[dummy_column] = encoding[dummy_column]

# delete columns: remove every helper column added above in a single call
# (no "speciesID4" column exists, so there is nothing to remove for it)
exampleData = exampleData.drop(
    columns=["speciesID1", "speciesID2", "speciesID3",
             "Iris-versicolor", "Iris-virginica"])
# alternatives:
#del exampleData["speciesID1"] # one column at a time
#exampleData = exampleData.drop(['Iris-versicolor','Iris-virginica'], axis = 1)
#exampleData = exampleData.drop(exampleData.index[0:5]) # by index

# remove outliers that are more than 3 sd away from the mean
# nan_policy="omit" stops a single missing petal length from turning every
# z-score into NaN (which would discard the whole data set)
petal_length_z_score = np.abs(np.asarray(
    stats.zscore(exampleData["petal_length"], nan_policy="omit"), dtype=float))
# keep rows within 3 sd; rows with no z-score (missing petal length) are kept
# too, so the null-handling steps that follow can deal with them
f = pd.Series(np.isnan(petal_length_z_score) | (petal_length_z_score < 3),
              index=exampleData.index)
# boolean indexing removes the rows outright, whereas .where(f) would only
# blank them to NaN; .copy() gives an independent frame for later assignments
exampleData = exampleData.loc[f].copy()

# replace null values with mean (rounded to 1 decimal place)
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is chained assignment and has no effect on the DataFrame under
# pandas copy-on-write.
petal_length_mean = round(exampleData["petal_length"].mean(), 1)
exampleData["petal_length"] = exampleData["petal_length"].fillna(petal_length_mean)

# drop nulls (any row that still has a missing value in any column)
exampleData = exampleData.dropna()
exampleData = exampleData.reset_index(drop=True) # reset index after nulls removed

# get data: the same column selected by position (first column) and by name
for selected_column in (exampleData.iloc[:, 0], exampleData["sepal_length"]):
    print(selected_column)

# show first 5 rows
exampleData.head(n=5)

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 150, dtype: float64
0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 150, dtype: float64


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Statistical Analysis:

In [16]:
# basic statistics of a single column
print(exampleData["sepal_length"].count())
print(exampleData["sepal_length"].mean())
print(exampleData["sepal_length"].std())
print(exampleData["sepal_length"].min())
print(exampleData["sepal_length"].max())

# quantiles (25%, 50%, 75%)
print(exampleData["sepal_length"].quantile([0.25, 0.5, 0.75]).values)

# variance
print(exampleData["sepal_length"].var())

# correlation
# Restrict to numeric columns first: the frame also holds the string column
# "species", and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
print(exampleData.select_dtypes(include="number").corr())
print(exampleData["sepal_length"].corr(exampleData["petal_length"])) # between two columns

# unique categorical values
species = pd.unique(exampleData["species"])
print(f"Unique categorical values: {species}")
print(exampleData["species"].value_counts()) # get counts for each category

# overview of a dataset
exampleData.describe()

150
5.843333333333335
0.8280661279778629
4.3
7.9
[5.1 5.8 6.4]
0.6856935123042505
              sepal_length  sepal_width  petal_length  petal_width
sepal_length      1.000000    -0.109369      0.871754     0.817954
sepal_width      -0.109369     1.000000     -0.420516    -0.356544
petal_length      0.871754    -0.420516      1.000000     0.962757
petal_width       0.817954    -0.356544      0.962757     1.000000
0.8717541573048718
Unique categorical values: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
Iris-versicolor    50
Iris-virginica     50
Iris-setosa        50
Name: species, dtype: int64


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Sort and Filter:

In [32]:
# print top 10 highest: sort descending by sepal_length, keep the first 10 rows
top10 = exampleData.sort_values(by=['sepal_length'], ascending=False).head(10)
print(top10)

# filter by category
f = exampleData["species"] == "Iris-versicolor"
# .loc[f] keeps only the matching rows; .where(f) would keep every row and
# merely blank the non-matching ones to NaN
versicolorOnly = exampleData.loc[f]
print(versicolorOnly.count()) # non-null values per column

     sepal_length  sepal_width  petal_length  petal_width         species
131           7.9          3.8           6.4          2.0  Iris-virginica
135           7.7          3.0           6.1          2.3  Iris-virginica
122           7.7          2.8           6.7          2.0  Iris-virginica
117           7.7          3.8           6.7          2.2  Iris-virginica
118           7.7          2.6           6.9          2.3  Iris-virginica
105           7.6          3.0           6.6          2.1  Iris-virginica
130           7.4          2.8           6.1          1.9  Iris-virginica
107           7.3          2.9           6.3          1.8  Iris-virginica
125           7.2          3.2           6.0          1.8  Iris-virginica
109           7.2          3.6           6.1          2.5  Iris-virginica
sepal_length    50
sepal_width     50
petal_length    50
petal_width     50
species         50
dtype: int64
