# Chapter 3: Data Wrangling

In [1]:
import pandas as pd

## 3.0 Introduction

In [4]:
# Titanic passenger data (short link to the same file: https://tinyurl.com/titanic-csv)
url = (
    'https://raw.githubusercontent.com/'
    'chrisalbon/sim_data/master/titanic.csv'
)

In [5]:
df = pd.read_csv(url)

In [6]:
df.head()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


## 3.1 Creating a Data Frame

Create an empty DF and define each column separately:

In [7]:
df = pd.DataFrame()

In [8]:
# Add the columns one at a time, in the order they should appear in the frame.
for column, values in {
    'Name': ['Jacky Jackson', 'Steven Stevenson'],
    'Age': [38, 25],
    'Driver': [True, False],
}.items():
    df[column] = values

In [9]:
df

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


Create and append a row:

In [10]:
# Build the new row as a Series keyed by column name.
newPerson = pd.Series(['Molly Mooney', 40, True], index=['Name', 'Age', 'Driver'])
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so turn the
# Series into a one-row frame and concatenate instead. infer_objects() gives the
# row proper per-column dtypes (the transposed frame is all-object otherwise),
# so the numeric and boolean columns are not upcast to object in the result.
pd.concat([df, newPerson.to_frame().T.infer_objects()], ignore_index=True)

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False
2,Molly Mooney,40,True


Note: the result was not assigned back to `df`, so `df` itself is unchanged — the new row exists only in the returned DataFrame.

## 3.2 Describing the Data

In [12]:
# url = 'https://tinyurl.com/titanic-csv'
url = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [13]:
df = pd.read_csv(url)

In [14]:
df.head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [15]:
df.shape

(1313, 6)

In [16]:
df.describe()

Unnamed: 0,Age,Survived,SexCode
count,756.0,1313.0,1313.0
mean,30.397989,0.342727,0.351866
std,14.259049,0.474802,0.477734
min,0.17,0.0,0.0
25%,21.0,0.0,0.0
50%,28.0,0.0,0.0
75%,39.0,1.0,1.0
max,71.0,1.0,1.0


## 3.3 Navigating DataFrames

In [17]:
# url = 'https://tinyurl.com/titanic-csv'
url = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [18]:
df = pd.read_csv(url)

In [19]:
df.iloc[0]

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                   29
Sex                               female
Survived                               1
SexCode                                1
Name: 0, dtype: object

In [20]:
df.iloc[:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [21]:
# Index the rows by passenger name; drop=False keeps 'Name' as a regular column too.
dfNI = df.set_index('Name', drop=False)

In [23]:
dfNI.loc['Allen, Miss Elisabeth Walton']

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                   29
Sex                               female
Survived                               1
SexCode                                1
Name: Allen, Miss Elisabeth Walton, dtype: object

## 3.4 Selecting Rows Based on Conditionals

In [24]:
# url = 'https://tinyurl.com/titanic-csv'
url = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [25]:
df = pd.read_csv(url)

Show top 2 rows where 'Sex' is 'female':

In [26]:
# Boolean mask on 'Sex', then keep only the first two matching rows.
df.loc[df['Sex'].eq('female')].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


Female passengers 65 & older:

In [28]:
# Combine two boolean masks with &; both conditions must hold for a row to be kept.
df.loc[df['Sex'].eq('female') & df['Age'].ge(65)]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
73,"Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1


## 3.5 Replacing Values

In [17]:
url = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [18]:
df = pd.read_csv(url)

In [32]:
# Mapping form of replace: {old value: new value}. Returns a new Series; df is not modified.
df['Sex'].replace({"female": "Woman"}).head()

0    Woman
1    Woman
2     male
3    Woman
4     male
Name: Sex, dtype: object

Replace multiple values at the same time:

In [30]:
# One mapping covers both substitutions, keeping each old/new pair side by side.
df['Sex'].replace({"female": "Woman", "male": "Man"}).head()

0    Woman
1    Woman
2      Man
3    Woman
4      Man
Name: Sex, dtype: object

Replace a value throughout the entire DataFrame:

In [31]:
# Applies to every column: any cell equal to 1 is swapped for the string "One".
df.replace(to_replace=1, value="One").head()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,One,One
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,One
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,One
4,"Allison, Master Hudson Trevor",1st,0.92,male,One,0


`.replace()` also accepts regex:

In [33]:
# Pass the pattern through the regex keyword; matches in string cells become "First".
df.replace(regex=r"1st", value="First").head()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",First,29.0,female,1,1
1,"Allison, Miss Helen Loraine",First,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",First,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",First,25.0,female,0,1
4,"Allison, Master Hudson Trevor",First,0.92,male,1,0


## 3.6 Renaming Columns

Rename columns with `.rename()`:

In [34]:
url = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [35]:
df = pd.read_csv(url)

In [37]:
# Map old column names to new ones; rename returns a new frame and leaves df untouched.
df.rename({'PClass': 'Passenger Class', 'Sex': 'Gender'}, axis='columns').head(2)

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


#### Create a dict with old column names as keys:

In [39]:
import collections

In [40]:
# defaultdict(str): looking up a missing key inserts it with the value '' (the result of str()).
colNames = collections.defaultdict(str)

In [41]:
colNames

defaultdict(str, {})

In [42]:
# Register every column name as a key; each maps to '' until a new name is filled in.
for column in df.columns:
    colNames.setdefault(column, '')

In [43]:
colNames

defaultdict(str,
            {'Name': '',
             'PClass': '',
             'Age': '',
             'Sex': '',
             'Survived': '',
             'SexCode': ''})

## 3.7 Finding the Minimum, Maximum, Sum, Average, and Count

In [44]:
url = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [45]:
df = pd.read_csv(url)

In [46]:
# Summary statistics for the Age column, printed one per line as "<label> <value>".
age = df['Age']
summary = (
    ('Maximum:', age.max()),
    ('Minimum:', age.min()),
    ('Mean:', age.mean()),
    ('Sum:', age.sum()),
    ('Count:', age.count()),
)
for label, value in summary:
    print(label, value)

Maximum: 71.0
Minimum: 0.17
Mean: 30.397989417989415
Sum: 22980.88
Count: 756


These methods can also be applied to the entire DataFrame:

In [47]:
df.count()

Name        1313
PClass      1313
Age          756
Sex         1313
Survived    1313
SexCode     1313
dtype: int64

## 3.8 Finding Unique Values