## Data Cleaning and Preparation

In [1]:
# Transforming data using a mapping or a lambda function

# Import the libraries used throughout the notebook explicitly so the cells
# also run on a fresh kernel (Restart Kernel -> Run All).
import numpy as np
import pandas as pd

# Small sample frame: the 'food' names deliberately mix upper and lower case,
# which matters for the dictionary lookups performed in the following cells.
data = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon',
             'Pastrami', 'corned beef', 'Bacon',
             'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data

<IPython.core.display.Javascript object>

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [2]:
# Lookup table from a (lowercase) meat name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [3]:
# Look up each food in meat_to_animal and store the result as a new 'animal' column.
# The lookup is case-sensitive, so names that differ in case from the dict keys
# (e.g. 'Pastrami', 'Bacon') find no match and are left missing.
data['animal'] = data['food'].map(meat_to_animal)

In [4]:
# Show the result: 'Pastrami' and 'Bacon' have no animal because the dict keys are lowercase.
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,
4,corned beef,7.5,cow
5,Bacon,8.0,
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [5]:
# Normalise the food names to lowercase so they match the lowercase keys of meat_to_animal.
lowercased = data['food'].str.lower()

In [6]:
# Inspect the lower-cased names before using them as lookup keys.
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [7]:
# Redo the lookup from the lower-cased names; this overwrites the 'animal' column
# created earlier, while the 'food' column keeps its original capitalisation.
data['animal'] = lowercased.map(meat_to_animal)

In [8]:
# Every row now has an animal, including the mixed-case 'Pastrami' and 'Bacon'.
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [9]:
# Same lookup in a single step: the lambda lower-cases each food name before indexing the dict.
# Note: indexing with [] raises KeyError for a food that is not a key of meat_to_animal,
# whereas passing the dict itself to map leaves unmatched entries missing.
data['food'].map(lambda food_name: meat_to_animal[food_name.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [10]:
# Replacing sentinel values (here -999 and -1000) with missing values
data = pd.Series(
    [1.0, -999.0, 2.0, -999.0, -1000.0, 3.0]
)
data


<IPython.core.display.Javascript object>

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [11]:
# Replace the single sentinel -999 with NaN; replace returns a new Series and leaves `data` as is.
# Use the canonical lowercase `np.nan` (as the other replace cells do): alias spellings
# such as `np.NaN` were removed in NumPy 2.0, so the lowercase name is the portable one.
data.replace(-999, np.nan)

<IPython.core.display.Javascript object>

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [12]:
# Passing a list of values replaces every listed sentinel with the same value (NaN).
data.replace([-999, -1000], np.nan)

<IPython.core.display.Javascript object>

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [13]:
# Two parallel lists pair each sentinel with its own replacement: -999 -> NaN, -1000 -> 0.
data.replace([-999, -1000], [np.nan, 0])

<IPython.core.display.Javascript object>

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [14]:
# Same substitution as with two parallel lists, written as an {old: new} dict.
data.replace({-999: np.nan, -1000: 0})

<IPython.core.display.Javascript object>

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [15]:
# Renaming axis indexes

# 4x3 frame holding 0..11 with string row labels; no column labels are given here.
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['aa', 'bb', 'cc', 'dd'],
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# Without an explicit `columns` argument the frame gets the integer column labels 0, 1, 2.
df

Unnamed: 0,0,1,2
aa,0,1,2
bb,3,4,5
cc,6,7,8
dd,9,10,11


In [17]:
# Rebuild the same 4x3 frame, this time with named columns as well as row labels.
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['aa', 'bb', 'cc', 'dd'],
    columns=['One', 'Two', 'Three'],
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
# Same values as before, now with the column labels 'One', 'Two', 'Three'.
df

Unnamed: 0,One,Two,Three
aa,0,1,2
bb,3,4,5
cc,6,7,8
dd,9,10,11


In [19]:
def transform(label):
    """Return ``label`` with its first character kept as is and the rest upper-cased.

    For example ``'aa'`` becomes ``'aA'``. Slicing (``label[:1]``) is used instead of
    indexing (``label[0]``) so that an empty label yields ``''`` rather than raising
    ``IndexError``.
    """
    return label[:1] + label[1:].upper()

# Preview the transformed row labels: Index.map returns a new Index and does not modify df.
df.index.map(transform)

Index(['aA', 'bB', 'cC', 'dD'], dtype='object')

In [20]:
# df still carries the original row labels; the mapped Index above was not assigned back.
df

Unnamed: 0,One,Two,Three
aa,0,1,2
bb,3,4,5
cc,6,7,8
dd,9,10,11


In [21]:
# Assign the mapped labels back to df.index to rename the rows of df itself (modifies df in place).
df.index = df.index.map(transform)

In [22]:
# The row labels are now 'aA', 'bB', 'cC', 'dD'.
df

Unnamed: 0,One,Two,Three
aA,0,1,2
bB,3,4,5
cC,6,7,8
dD,9,10,11


In [23]:
# rename accepts a function per axis: title-case every row label, upper-case every column label.
# It returns a renamed copy; df itself keeps its current labels.
df.rename(index = str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE
Aa,0,1,2
Bb,3,4,5
Cc,6,7,8
Dd,9,10,11


In [24]:
# With dicts, only the listed labels are renamed ('aA' -> 'Great', 'One' -> 'Ones');
# all other row and column labels are left as they are.
df.rename(index = {'aA': 'Great'},  columns = {'One': 'Ones'})

Unnamed: 0,Ones,Two,Three
Great,0,1,2
bB,3,4,5
cC,6,7,8
dD,9,10,11


In [25]:
# Binning (bucketizing) a continuous variable

# Sample ages to be grouped.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]
# Bin edges: each consecutive pair of edges delimits one age group.
bins = [18, 25, 35, 60, 100]

In [26]:
# Assign every age to the interval between two consecutive bin edges.
# As the displayed result shows, intervals are open on the left and closed on the right, e.g. (18, 25].
cats = pd.cut(ages, bins)

<IPython.core.display.Javascript object>

In [27]:
# One interval per age, plus the ordered list of the four categories.
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [28]:
# Integer code of the bin each age fell into (0 is the first interval, (18, 25]).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)