# Data Cleaning and Preparation

In [41]:
import numpy as np
import pandas as pd


## Data Transformation (continue)
So far in this lesson we’ve been concerned with rearranging data. Filtering, cleaning,
and other transformations are another class of important operations.

### Detecting and Filtering Outliers
Filtering or transforming outliers is largely a matter of applying array operations.
Consider a DataFrame with some normally distributed data

In [64]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.003001,-0.018148,-0.011135,0.05711
std,0.980871,0.945657,0.972788,1.021284
min,-3.075926,-3.306235,-3.448126,-2.500405
25%,-0.653496,-0.61483,-0.641959,-0.642597
50%,0.025307,-0.012635,-0.012053,0.038357
75%,0.653528,0.589103,0.626383,0.798413
max,3.344496,3.043223,3.274921,3.115603


In [56]:
# find values in one of the columns exceeding 3 in absolute value
(data.abs() > 3).any(axis=1).sum()


0

In [48]:
# select all rows having a value exceeding 3 or –3
data[(data.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
10,-3.033364,-0.611171,0.479827,0.783402
12,-1.863468,-1.418174,3.347265,1.22306
148,1.056265,1.014032,-0.649791,-3.273554
403,-1.444995,-1.345817,-3.409303,0.511956
527,-3.021125,1.457269,1.807526,0.31973
542,0.049349,-0.106161,-3.062846,0.757072
664,-0.226122,3.267315,0.063392,-0.385918
705,-3.034553,-1.047301,-0.782805,1.801911


In [55]:
# set outliers to 3 or -3 depending on its sign
rows, columns = data.shape

for i in range(rows):
    for j in range(columns):
        v = data.iloc[i, j]
        if v > 3:
            data.iloc[i, j] = 3
        elif v < -3:
            data.iloc[i, j] = -3

data[(data.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3


### Computing Indicator/Dummy Variables
Another type of transformation for statistical modeling or machine learning applica‐
tions is converting a categorical variable into a “dummy” or “indicator” matrix.

column in a DataFrame has k distinct values, you would derive a matrix or Data‐
Frame with k columns containing all 1s and 0s.

pandas has a `get_dummies` function for doing this

In [68]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [72]:
# create dummy variables for column 'key'
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [75]:
# add a prefix to the columns in the indicator Data‐Frame
dummies = pd.get_dummies(df["key"], prefix="key")
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [78]:
# join the dummies to the Data-Frame
df.join(dummies).drop(columns="key")

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [83]:
# read the dataset 'movies.dat' and display the first 10 rows of it
col_names = ["id", "title", "genre"]

movies = pd.read_csv("movies.dat", sep="::", engine="python", names=col_names)
movies

Unnamed: 0,id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


Adding dummy variables for each genre requires a little bit of wrangling.

In [93]:
# make a list 'genres' contains all distict genres
distinct_genres = set()
for genre in movies["genre"]:
    distinct_genres = distinct_genres.union(genre.split("|"))

distinct_genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [97]:
# create a DataFrame 'dummies' of size (#movies, #genres) and fill it with zeros

dummies = pd.DataFrame(np.zeros((movies.shape[0], len(distinct_genres)), dtype=int), columns=distinct_genres)
dummies

Unnamed: 0,War,Action,Documentary,Fantasy,Musical,Children's,Western,Drama,Thriller,Crime,Film-Noir,Sci-Fi,Adventure,Horror,Mystery,Comedy,Animation,Romance
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# get the genres from the first row of 'movies' DataFrame, then get their indices in the 'dummies' DataFrame
# hint: use 'get_indexer' method


In [98]:
# for each value of 'genre' column in 'movies' DataFrame, set the corresponding columns in dummies to 1 

for i, genre in enumerate(movies["genre"]):
    for one_genre in genre.split("|"):
        dummies.loc[i, one_genre] = 1

dummies

Unnamed: 0,War,Action,Documentary,Fantasy,Musical,Children's,Western,Drama,Thriller,Crime,Film-Noir,Sci-Fi,Adventure,Horror,Mystery,Comedy,Animation,Romance
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [99]:
# join the DataFrames 'movies' and 'dummies', then display the first row 
movies.join(dummies).drop(columns="genre")

Unnamed: 0,id,title,War,Action,Documentary,Fantasy,Musical,Children's,Western,Drama,Thriller,Crime,Film-Noir,Sci-Fi,Adventure,Horror,Mystery,Comedy,Animation,Romance
0,1,Toy Story (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
1,2,Jumanji (1995),0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
3,4,Waiting to Exhale (1995),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3879,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


## String Manipulation
Python has long been a popular raw data manipulation language in part due to its
ease of use for string and text processing. Most text operations are made simple with
the string object’s **built-in methods**. 

For more complex pattern matching and text manipulations, **regular expressions** may be needed. 

**pandas** adds to the mix by enabling you to apply string and regular expressions concisely on whole arrays of data,
additionally handling the annoyance of missing data.

### String Object Methods

In [101]:
# convert the string to a list using the ',' as separator
val = "a,b,  guido"
val = val.split(",")
val

['a', 'b', '  guido']

In [104]:
# convert the string to a list using the ',' as separator and remove the extra spaces
val = [v.strip() for v in val]
val

['a', 'b', 'guido']

In [105]:
# concatenate the parts back with separator '::'
val[0] + '::' + val[1] + '::' + val[2]

'a::b::guido'

In [110]:
# concatenate the parts back with separator '::'
'::'.join(val)

'a::b::guido'

**check**: try the `in` operator and str methods `index`, `find`, `count` and `replace`

In [114]:
email = "omarincs@gmail.com"
"@" in email

True

In [116]:
email.index("@")
# email.index("&")

ValueError: substring not found

In [118]:
email.find("@")

8

In [120]:
email.count("m")

3

In [121]:
email.replace(".com", ".net")

'omarincs@gmail.net'

![](assets/built-in-str-methods.png)

### Regular Expressions
Regular expressions provide a flexible way to search or match (often more complex)
string patterns in text. A single expression, commonly called a regex, is a string
formed according to the regular expression language.

The `re` module functions fall into three categories: **pattern matching**, **substitution**,
and **splitting**.

In [122]:
import re
text = "foo    bar\t baz  \tqux"


suppose we wanted to split a string with a variable number of whitespace characters
(tabs, spaces, and newlines). The regex describing one or more whitespace characters
is \s+:

In [124]:
# split the string depending on the whitespaces

re.split("\s+", text)

['foo', 'bar', 'baz', 'qux']

In [125]:
# find all the string depending on the whitespaces
re.findall("\s+", text)

['    ', '\t ', '  \t']

**Note:** Creating a regex object with `re.compile` is highly recommended if you intend to
apply the same expression to many strings; doing so will save CPU cycles

In [131]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

In [132]:
# get a list of all emails in the text
regex.findall(text)


['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

Relatedly, `sub` will return a new string with occurrences of the pattern replaced by the
a new string:


In [134]:
print(regex.sub("<email>", text))

Dave <email>
Steve <email>
Rob <email>
Ryan <email>



Suppose you wanted to find email addresses and simultaneously segment each
address into its three components: *username*, *domain name*, and *domain suffix*.

In [135]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [136]:
# find all email in the text
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [138]:
# prefix each segment of the email with a suitable label

print(regex.sub(r"Username: \1, Domain: \2, Ext.: \3", text))

Dave Username: dave, Domain: google, Ext.: com
Steve Username: steve, Domain: gmail, Ext.: com
Rob Username: rob, Domain: gmail, Ext.: com
Ryan Username: ryan, Domain: yahoo, Ext.: com



![](assets/re-methods.png)

### Vectorized String Functions in pandas

In [57]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data


Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

Series has array-oriented methods for string operations that skip NA values. These are accessed through Series’s **str attribute**

In [22]:
# check if the email is gmail


Regular expressions can be used, too, along with any re options like IGNORECASE

In [23]:
# using the pattern declared earlier, find all parts of each email


In [24]:
# use the match method to check if the field matches an email or not


![](assets/series-str-methods.png)

# Data Aggregation and Group Operations

## GroupBy Mechanics

the term split-apply-combine is used for describing group operations. 

- In the first stage of the process, data contained in a pandas object, whether a Series, Data‐Frame, or otherwise, is **split** into groups based on one or more keys that you provide.
- Once this is done, a function is **applied** to each group, producing a new value. 
- Finally, the results of all those function applications are **combined** into a result object. 

The form of the resulting object will usually depend on what’s being done to the data.

![](assets/group-aggregation.png)

In [62]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randint(0, 10, 5),
                   'data2' : np.random.randint(0, 10, 5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,5,7
1,a,two,1,5
2,b,one,4,8
3,b,two,4,4
4,a,one,5,5


In [25]:
# group data of column 'data1' by 'key1' then print the groups


In [26]:
# calculate the mean in each group


In [27]:
# group data of column 'data1' by 'key1' and 'key2' then print the groups


In [28]:
# calculate the mean in each group


In [29]:
# unstack the result Series


In [30]:
# try some selection on the result DataFrame


### Iterating Over Groups
The GroupBy object supports iteration, generating a sequence of **2-tuples** containing
the **group name** along with the **chunk of data**.

In [31]:
# group data of DataFrame 'data' by 'key1' then print each group name and data



In [32]:
# group data of DataFrame 'data' by 'key1' and 'key2' then print each group name and data



In [33]:
# group data of DataFrame 'data' by 'key1' then convert it to a dictionary of DataFrames


In [34]:
# group the columns by its datatypes, then print the groups


In [35]:
# loop through the groups and print it


### Selecting a Column or Subset of Columns
Indexing a GroupBy object created from a DataFrame with a column name or array
of column names has the effect of column subsetting for aggregation. This means
that:
```python
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
```
are syntactic sugar for:
```python
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
```


In [36]:
# group data of DataFrame 'data' by 'key1' then calculate the mean of column 'data2'



In [37]:
# How to get result as DataFrame GroupBy or Series GroupBy?


### Grouping with Dicts and Series

In [103]:
people = pd.DataFrame(np.random.randint(0, 10, (5,5)),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people

Unnamed: 0,a,b,c,d,e
Joe,0,7.0,2.0,9,5
Steve,8,8.0,9.0,3,6
Wes,7,,,8,1
Jim,0,0.0,5.0,3,5
Travis,7,0.0,0.0,3,8


In [104]:
mapping = {'Joe': 'red', 'Steve': 'red', 'Wes': 'blue',
           'Jim': 'blue', 'Travis': 'red', 'Elon' : 'orange'}

In [38]:
# group and sum the scores of teams red and blue


In [39]:
# convert the dict to a Series, then group and count the scores of teams red and blue



### Grouping with Functions

In [40]:
# group with the len function


## independent Practice:
- read the dataset `tips.csv`
- create a new column 'tip_pct', which is tip / total_bill
- replace the short-day name with the full-day name and convert it to upper-case
- calculate the average tip percent for smokers and non-smokers
- calculate the max and average tip percent for each time
- calculate the average tip percent for each day and time
- create dummy variables for the day and time columns


