# Section Three - Exploratory data analysis in Python


This section covers the Python concepts of data types, lambdas, and string functions, and then applies them to the task of cleaning our dataset.


# Data structures and scientific libraries for data manipulation

In [1]:
import numpy as np
import pandas as pd

## Series
`s = pd.Series(data, index=idx)`

In [2]:
# Five standard-normal draws, each labelled through an explicit string index.
pd.Series(
    data=np.random.randn(5),
    index=['a', 'b', 'c', 'd', 'e'],
)

a    0.258874
b   -1.032803
c    0.237299
d   -0.140834
e   -0.471891
dtype: float64

In [3]:
# Keep the labelled Series in `s`, then display the Index object holding its labels.
s = pd.Series(
    data=np.random.randn(5),
    index=['a', 'b', 'c', 'd', 'e'],
)
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [4]:
# With no `index` argument the Series gets the default integer labels 0..4.
pd.Series(data=np.random.standard_normal(size=5))

0    0.151800
1    0.724344
2    0.593627
3   -0.411102
4   -1.117451
dtype: float64

In [5]:
# Built from a dict: the keys become the index labels, in insertion order.
pd.Series(data={'b': 331, 'a': 124, 'c': 278})

b    331
a    124
c    278
dtype: int64

In [6]:
# An explicit index selects and reorders the dict entries; the label 'd' has no
# matching key, so it shows up as NaN and the values are displayed as float64.
pd.Series(data={'b': 331, 'a': 124, 'c': 278}, index=['b', 'c', 'd', 'a'])

b    331.0
c    278.0
d      NaN
a    124.0
dtype: float64

In [7]:
# A scalar value is repeated for every label in the index.
pd.Series(
    data=42,
    index=['a', 'b', 'c', 'd', 'e'],
)

a    42
b    42
c    42
d    42
e    42
dtype: int64

In [9]:
# A plain dict holding two Series of different lengths ('two' has an extra 'd' label).
{
    'one': pd.Series(data=range(3), index=['a', 'b', 'c']),
    'two': pd.Series(data=range(4), index=['a', 'b', 'c', 'd']),
}

{'one': a    0
 b    1
 c    2
 dtype: int64, 'two': a    0
 b    1
 c    2
 d    3
 dtype: int64}

In [10]:
# Each dict entry becomes a column. The rows are aligned on the index labels,
# so column 'one' is NaN for 'd', a label only the second Series has.
pd.DataFrame(
    data={
        'one': pd.Series(data=range(3), index=['a', 'b', 'c']),
        'two': pd.Series(data=range(4), index=['a', 'b', 'c', 'd']),
    }
)

Unnamed: 0,one,two
a,0.0,0
b,1.0,1
c,2.0,2
d,,3


In [11]:
# Passing `index` keeps only the listed row labels, in the order given.
# Column 'one' is NaN for 'd' because its Series has no 'd' entry.
pd.DataFrame(
    {
        'one': pd.Series(range(3), index=['a', 'b', 'c']),
        'two': pd.Series(range(4), index=['a', 'b', 'c', 'd']),
    },
    index=['d', 'b', 'a'],
)

Unnamed: 0,one,two
d,,3
b,1.0,1
a,0.0,0


In [13]:
# `columns` picks which columns to keep: 'one' is dropped, 'two' is kept, and
# 'three' has no source data, so it is created empty. `index` restricts and
# orders the rows as before.
pd.DataFrame(
    data={
        'one': pd.Series(data=range(3), index=['a', 'b', 'c']),
        'two': pd.Series(data=range(4), index=['a', 'b', 'c', 'd']),
    },
    columns=['two', 'three'],
    index=['d', 'b', 'a'],
)

Unnamed: 0,two,three
d,3,
b,1,
a,0,


## Text cleaning with string functions

Most courses teach you how to use plain Python string functions. Here we instead go through some interesting examples of string and text manipulation using the vectorized string methods that Pandas provides.

In [23]:
import pandas as pd
import numpy as np

In [24]:
# Join every element of the Series into one string, separated by ', '.
(
    pd.Series(data=['a', 'b', 'c', 'd'])
    .str.cat(sep=', ')
)

'a, b, c, d'

In [25]:
# Missing values are written as the `na_rep` placeholder in the joined string.
(
    pd.Series(data=["123.0", np.nan, "33.4"])
    .str.cat(sep=",", na_rep="-")
)

'123.0,-,33.4'

In [26]:
# Sample strings of mixed length and content, reused by the indexing examples below.
s = pd.Series(
    data=['1', '01011', 'C', 'Aaba', 'Baca'],
)
s

0        1
1    01011
2        C
3     Aaba
4     Baca
dtype: object

In [27]:
# First character of every string in `s`.
s.str.get(0)

0    1
1    0
2    C
3    A
4    B
dtype: object

In [28]:
# Up to the first three characters of every string in `s`; shorter strings are kept whole.
s.str.slice(stop=3)

0      1
1    010
2      C
3    Aab
4    Bac
dtype: object

In [29]:
# The pattern has two capture groups, so the result has two columns, one per
# group. Both groups are optional, so a group that does not match yields NaN.
s.str.extract(
    pat=r'([AB])?(\d)?',
    expand=False,
)

Unnamed: 0,0,1
0,,1.0
1,,0.0
2,,
3,A,
4,B,


In [30]:
# Build indicator (dummy) variables: each comma-separated token becomes its own
# 0/1 column, and the missing entry produces a row of zeros.
s = pd.Series(
    data=[
        'feature_a,feature_b',
        'feature_b',
        np.nan,
        'feature_b,feature_c',
    ]
)
s.str.get_dummies(sep=',')

Unnamed: 0,feature_a,feature_b,feature_c
0,1,1,0
1,0,1,0
2,0,0,0
3,0,1,1


In [31]:
# A common use case for the string methods is tidying column names, so start
# from a frame whose headers have stray whitespace and mixed case.
df = pd.DataFrame(
    data=np.random.randn(5, 2),
    index=range(5),
    columns=[' First Name ', 'Last Name'],
)

# The column labels are stored in a pandas Index object.
type(df.columns)

pandas.core.indexes.base.Index

In [32]:
# Preview the cleaned labels: trim outer whitespace, lower-case, then swap
# the remaining spaces for underscores. `df` itself is not modified here.
(
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

Index(['first_name', 'last_name'], dtype='object')

In [33]:
# Apply the same clean-up for real by assigning the new labels back to the frame.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)
df

Unnamed: 0,first_name,last_name
0,-0.861825,-0.10111
1,-0.485833,-0.275869
2,1.272726,1.884049
3,1.029716,0.476001
4,-0.267733,0.284021


In [34]:
# Splitting on '_' turns every string into a list, giving a Series of lists.
(
    pd.Series(data=['a_b_c', 'c_d_e', 'f_g_h'])
    .str.split(pat='_')
)

0    [a, b, c]
1    [c, d, e]
2    [f, g, h]
dtype: object

In [35]:
# After splitting, pull the element at position 1 out of each list.
(
    pd.Series(data=['a_b_c', 'c_d_e', 'f_g_h'])
    .str.split(pat='_')
    .str[1]
)

0    b
1    d
2    g
dtype: object