# Data Cleaning and Preparation

In [1]:
import numpy as np
import pandas as pd


## Data Transformation (continue)
So far in this lesson we’ve been concerned with rearranging data. Filtering, cleaning,
and other transformations are another class of important operations.

### Detecting and Filtering Outliers
Filtering or transforming outliers is largely a matter of applying array operations.
Consider a DataFrame with some normally distributed data

In [2]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.00638,-0.076284,0.01418,-0.032409
std,1.007102,1.000653,0.990268,0.972114
min,-3.247819,-3.387276,-3.322339,-3.280373
25%,-0.694036,-0.778923,-0.669937,-0.63556
50%,-0.027542,-0.086337,0.030056,-0.025112
75%,0.730032,0.625217,0.701318,0.606538
max,3.49111,2.798,2.948473,3.138961


In [3]:
# find values in one of the columns exceeding 3 in absolute value
col = data[1]
mask = np.abs(col) > 3
col[mask]

16    -3.387276
323   -3.153954
Name: 1, dtype: float64

In [4]:
# select all rows having a value exceeding 3 or –3
mask = (np.abs(data) > 3).any(axis=1)
data[mask]

Unnamed: 0,0,1,2,3
16,-0.897835,-3.387276,-0.638953,-0.484768
25,3.167861,0.125706,1.504303,-1.071938
260,3.49111,-0.251479,-0.737085,-0.60411
316,-1.32985,0.038888,-3.007976,-0.806446
323,-1.43952,-3.153954,-0.660191,1.173254
330,0.904838,1.178954,0.04867,3.138961
357,-3.172342,-1.002869,-1.080609,1.10424
558,-3.247819,1.018337,1.437345,-1.635376
609,-1.28745,0.029837,-3.322339,-0.316965
927,0.109111,-0.149493,0.090292,-3.280373


In [5]:
# set outliers to 3 or -3 depending on its sign
mask = np.abs(data) > 3
data[mask] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.006141,-0.075743,0.014732,-0.032267
std,1.0037,0.998981,0.988505,0.970772
min,-3.0,-3.0,-3.0,-3.0
25%,-0.694036,-0.778923,-0.669937,-0.63556
50%,-0.027542,-0.086337,0.030056,-0.025112
75%,0.730032,0.625217,0.701318,0.606538
max,3.0,2.798,2.948473,3.0


In [6]:
data[158:]

Unnamed: 0,0,1,2,3
158,0.118838,0.417501,0.869398,-0.280400
159,-0.599574,1.696093,1.690335,-0.538654
160,-0.457955,0.305282,-0.359543,-0.922075
161,-1.684535,-0.348736,-0.800830,-0.543906
162,0.593893,-0.569173,-0.244390,-0.199597
...,...,...,...,...
995,0.416445,1.126261,-0.255826,0.511936
996,0.394113,0.557455,-0.274399,0.327544
997,-0.664400,1.376779,-1.054643,-0.944743
998,-0.928455,-0.335061,-0.423978,0.304863


In [7]:
def replace_outlier(row):
    mask = row > 3
    row[mask] = 1
    mask = row < -3
    row[mask] = -1
    
    return row
    
temp = data.apply(replace_outlier, axis=1)


In [8]:
temp[146:]

Unnamed: 0,0,1,2,3
146,0.061876,2.254999,0.467590,0.041857
147,-0.037230,0.238502,0.478707,0.929587
148,0.727141,0.919915,-2.182048,-0.598981
149,0.506064,-0.358761,-0.312563,0.247119
150,0.510513,-0.714611,0.210912,1.155840
...,...,...,...,...
995,0.416445,1.126261,-0.255826,0.511936
996,0.394113,0.557455,-0.274399,0.327544
997,-0.664400,1.376779,-1.054643,-0.944743
998,-0.928455,-0.335061,-0.423978,0.304863


### Computing Indicator/Dummy Variables
Another type of transformation for statistical modeling or machine learning applica‐
tions is converting a categorical variable into a “dummy” or “indicator” matrix.

column in a DataFrame has k distinct values, you would derive a matrix or Data‐
Frame with k columns containing all 1s and 0s.

pandas has a `get_dummies` function for doing this

In [9]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [10]:
# create dummy variables for column 'key'
pd.get_dummies(df["key"], prefix="key")

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [11]:
# create dummy variables for column 'key'  then add the columns to a variable
dummies = pd.get_dummies(df["key"], prefix="key")

In [12]:
# join the dummies to the Data-Frame
df.join(dummies).drop(columns="key")

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [13]:
# read the dataset 'movies.dat' and display the first 10 rows of it

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_csv('movies.dat', sep='::', engine="python",
                       header=None, names=mnames)
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Adding dummy variables for each genre requires a little bit of wrangling.

In [14]:
# make a list 'genres' contains all distict genres
all_genres = set()
for g in movies["genres"]:
    all_genres = all_genres.union(g.split("|"))
    
all_genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [15]:
zero_matrix = np.zeros((movies.shape[0], len(all_genres)), dtype=int)
dummies = pd.DataFrame(zero_matrix, columns=all_genres)
dummies

Unnamed: 0,Crime,Adventure,Mystery,Musical,Western,Film-Noir,Action,War,Documentary,Thriller,Drama,Romance,Comedy,Fantasy,Horror,Sci-Fi,Animation,Children's
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# create a DataFrame 'dummies' of size (#movies, #genres) and fill it with zeros
movies.shape[0]

3883

In [17]:
# get the genres from the first row of 'movies' DataFrame, then get their indices in the 'dummies' DataFrame
# hint: use 'get_indexer' method

g = movies.loc[0, "genres"]
g.split("|")
dummies.columns.get_indexer(g.split("|"))

array([16, 17, 12], dtype=int64)

In [18]:
# for each value of 'genre' column in 'movies' DataFrame, set the corresponding columns in dummies to 1 

for i, g in enumerate(movies["genres"]):
    columns = dummies.columns.get_indexer(g.split("|"))
    dummies.iloc[i, columns] = 1

dummies

Unnamed: 0,Crime,Adventure,Mystery,Musical,Western,Film-Noir,Action,War,Documentary,Thriller,Drama,Romance,Comedy,Fantasy,Horror,Sci-Fi,Animation,Children's
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1
1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [19]:
# join the DataFrames 'movies' and 'dummies', then display the first row 
movies.join(dummies.add_prefix("genre_")).drop(columns="genres")

Unnamed: 0,movie_id,title,genre_Crime,genre_Adventure,genre_Mystery,genre_Musical,genre_Western,genre_Film-Noir,genre_Action,genre_War,genre_Documentary,genre_Thriller,genre_Drama,genre_Romance,genre_Comedy,genre_Fantasy,genre_Horror,genre_Sci-Fi,genre_Animation,genre_Children's
0,1,Toy Story (1995),0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1
1,2,Jumanji (1995),0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,3,Grumpier Old Men (1995),0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


## String Manipulation
Python has long been a popular raw data manipulation language in part due to its
ease of use for string and text processing. Most text operations are made simple with
the string object’s **built-in methods**. 

For more complex pattern matching and text manipulations, **regular expressions** may be needed. 

**pandas** adds to the mix by enabling you to apply string and regular expressions concisely on whole arrays of data,
additionally handling the annoyance of missing data.

### String Object Methods

In [20]:
# convert the string to a list using the ',' as separator
val = 'a,b,  guido'
val.split(",")

['a', 'b', '  guido']

In [21]:
# convert the string to a list using the ',' as separator and remove the extra spaces
pieces = val.split(",")
pieces = [v.strip() for v in pieces]
pieces

['a', 'b', 'guido']

In [22]:
# concatenate the parts back with separator '::'
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [23]:
# concatenate the parts back with separator '::'
"::".join(pieces)

'a::b::guido'

**check**: try the `in` operator and str methods `index`, `find`, `count` and `replace`

![](assets/built-in-str-methods.png)

### Regular Expressions
Regular expressions provide a flexible way to search or match (often more complex)
string patterns in text. A single expression, commonly called a regex, is a string
formed according to the regular expression language.

The `re` module functions fall into three categories: **pattern matching**, **substitution**,
and **splitting**.

In [24]:
import re
text = "foo    bar\t baz  \tqux"
# https://pythex.org



suppose we wanted to split a string with a variable number of whitespace characters
(tabs, spaces, and newlines). The regex describing one or more whitespace characters
is \s+:

In [25]:
# split the string depending on the whitespaces
# text.split(" ")
re.split("\s+", text)


['foo', 'bar', 'baz', 'qux']

In [26]:
# find all the string depending on the whitespaces
re.findall("\w+", text)


['foo', 'bar', 'baz', 'qux']

**Note:** Creating a regex object with `re.compile` is highly recommended if you intend to
apply the same expression to many strings; doing so will save CPU cycles

In [62]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

In [28]:
# get a list of all emails in the text
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

Relatedly, `sub` will return a new string with occurrences of the pattern replaced by the
a new string:


In [29]:
print(regex.sub("SECRET", text))

Dave SECRET
Steve SECRET
Rob SECRET
Ryan SECRET



Suppose you wanted to find email addresses and simultaneously segment each
address into its three components: *username*, *domain name*, and *domain suffix*.

In [64]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [31]:
# find all email in the text
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [32]:
# prefix each segment of the email with a suitable label
print(regex.sub(r"user: \1 @company: \2 .ext: \3", text))

Dave user: dave @company: google .ext: com
Steve user: steve @company: gmail .ext: com
Rob user: rob @company: gmail .ext: com
Ryan user: ryan @company: yahoo .ext: com



![](assets/re-methods.png)

### Vectorized String Functions in pandas

In [33]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data


Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

Series has array-oriented methods for string operations that skip NA values. These are accessed through Series’s **str attribute**

In [57]:
# check if the email is gmail
mask = data.str.contains("gmail").fillna(False)
data[mask]

Steve    steve@gmail.com
Rob        rob@gmail.com
dtype: object

Regular expressions can be used, too, along with any re options like IGNORECASE

In [65]:
# using the pattern declared earlier, find all parts of each email
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [71]:
# use the match method to check if the field matches an email or not
data.str.match(pattern, flags=re.IGNORECASE)

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

![](assets/series-str-methods.png)

# Data Aggregation and Group Operations

## GroupBy Mechanics

the term split-apply-combine is used for describing group operations. 

- In the first stage of the process, data contained in a pandas object, whether a Series, Data‐Frame, or otherwise, is **split** into groups based on one or more keys that you provide.
- Once this is done, a function is **applied** to each group, producing a new value. 
- Finally, the results of all those function applications are **combined** into a result object. 

The form of the resulting object will usually depend on what’s being done to the data.

![](assets/group-aggregation.png)

In [78]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randint(0, 10, 7),
                   'data2' : np.random.randint(0, 10, 7)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,5,1
1,a,two,5,7
2,b,one,7,9
3,b,two,3,2
4,a,one,2,7
5,b,two,5,1
6,a,one,6,4


In [79]:
# group data of column 'data1' by 'key1' then print the groups
temp = df["data1"].groupby(df["key1"])
temp.groups

{'a': [0, 1, 4, 6], 'b': [2, 3, 5]}

In [80]:
# calculate the mean in each group
temp.mean()

key1
a    4.5
b    5.0
Name: data1, dtype: float64

In [81]:
# group data of column 'data1' by 'key1' and 'key2' then print the groups
temp = df["data1"].groupby([df["key1"], df["key2"]])
temp.groups

{('a', 'one'): [0, 4, 6], ('a', 'two'): [1], ('b', 'one'): [2], ('b', 'two'): [3, 5]}

In [83]:
# calculate the mean in each group
result = temp.mean()
result

key1  key2
a     one     4.333333
      two     5.000000
b     one     7.000000
      two     4.000000
Name: data1, dtype: float64

In [84]:
result[('a', 'two')]

5.0

In [86]:
# unstack the result Series
result = result.reset_index()
result

Unnamed: 0,key1,key2,data1
0,a,one,4.333333
1,a,two,5.0
2,b,one,7.0
3,b,two,4.0


In [87]:
# try some selection on the result DataFrame
result.loc[1, "data1"]

5.0

### Iterating Over Groups
The GroupBy object supports iteration, generating a sequence of **2-tuples** containing
the **group name** along with the **chunk of data**.

In [91]:
# group data of DataFrame 'data' by 'key1' then print each group name and data
temp = df.groupby("key1")
for name, group in temp:
    print(name)
    print(group)
    print("-" * 25)


a
  key1 key2  data1  data2
0    a  one      5      1
1    a  two      5      7
4    a  one      2      7
6    a  one      6      4
-------------------------
b
  key1 key2  data1  data2
2    b  one      7      9
3    b  two      3      2
5    b  two      5      1
-------------------------


In [92]:
# group data of DataFrame 'data' by 'key1' and 'key2' then print each group name and data
temp = df.groupby(["key1", "key2"])
for name, group in temp:
    print(name)
    print(group)
    print("-" * 25)

('a', 'one')
  key1 key2  data1  data2
0    a  one      5      1
4    a  one      2      7
6    a  one      6      4
-------------------------
('a', 'two')
  key1 key2  data1  data2
1    a  two      5      7
-------------------------
('b', 'one')
  key1 key2  data1  data2
2    b  one      7      9
-------------------------
('b', 'two')
  key1 key2  data1  data2
3    b  two      3      2
5    b  two      5      1
-------------------------


In [98]:
# group data of DataFrame 'data' by 'key1' then convert it to a dictionary of DataFrames
temp = dict(list(df.groupby("key1")))
temp["a"]



Unnamed: 0,key1,key2,data1,data2
0,a,one,5,1
1,a,two,5,7
4,a,one,2,7
6,a,one,6,4


In [107]:
# group the columns by its datatypes, then print the groups
temp = df.groupby(df.dtypes, axis=1)
temp.groups

{int32: ['data1', 'data2'], object: ['key1', 'key2']}

key1     object
key2     object
data1     int32
data2     int32
dtype: object

In [108]:
# loop through the groups and print it
for name, group in temp:
    print(name)
    print(group)

int32
   data1  data2
0      5      1
1      5      7
2      7      9
3      3      2
4      2      7
5      5      1
6      6      4
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
5    b  two
6    a  one


### Selecting a Column or Subset of Columns
Indexing a GroupBy object created from a DataFrame with a column name or array
of column names has the effect of column subsetting for aggregation. This means
that:
```python
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
```
are syntactic sugar for:
```python
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
```


In [109]:
# group data of DataFrame 'data' by 'key1' then calculate the mean of column 'data2'
df.groupby("key1")["data2"].mean()


key1
a    4.75
b    4.00
Name: data2, dtype: float64

In [110]:
# How to get result as DataFrame GroupBy or Series GroupBy?
df.groupby("key1")[["data2"]].mean()


Unnamed: 0_level_0,data2
key1,Unnamed: 1_level_1
a,4.75
b,4.0


### Grouping with Dicts and Series

In [111]:
people = pd.DataFrame(np.random.randint(0, 10, (5,5)),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,6,9.0,7.0,7,6
Steve,4,1.0,4.0,4,8
Wes,4,,,8,6
Jim,8,5.0,6.0,0,2
Travis,0,0.0,7.0,1,0


In [112]:
mapping = {'Joe': 'red', 'Steve': 'red', 'Wes': 'blue',
           'Jim': 'blue', 'Travis': 'red', 'Elon' : 'orange'}

In [113]:
# group and sum the scores of teams red and blue
people.groupby(mapping).sum()

Unnamed: 0,a,b,c,d,e
blue,12,5.0,6.0,8,8
red,10,10.0,18.0,12,14


In [117]:
# convert the dict to a Series, then group and count the scores of teams red and blue
s = pd.Series(mapping)
people.groupby(s).sum()

Unnamed: 0,a,b,c,d,e
blue,12,5.0,6.0,8,8
red,10,10.0,18.0,12,14


### Grouping with Functions

In [118]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,18,14.0,13.0,15,14
5,4,1.0,4.0,4,8
6,0,0.0,7.0,1,0


## independent Practice:
- read the dataset `tips.csv`
- create a new column 'tip_pct', which is tip / total_bill
- replace the short-day name with the full-day name and convert it to upper-case
- calculate the average tip percent for smokers and non-smokers
- calculate the max and average tip percent for each time
- calculate the average tip percent for each day and time
- create dummy variables for the day and time columns


