# Data Cleaning and Preparation

In [20]:
import numpy as np
import pandas as pd


## Data Transformation (continue)
So far in this lesson we’ve been concerned with rearranging data. Filtering, cleaning,
and other transformations are another class of important operations.

### Detecting and Filtering Outliers
Filtering or transforming outliers is largely a matter of applying array operations.
Consider a DataFrame with some normally distributed data

In [62]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.049823,-0.026423,-0.046308,-0.006503
std,0.983483,0.97863,0.988615,0.978914
min,-3.211306,-3.102846,-3.490998,-3.311623
25%,-0.697596,-0.640211,-0.688196,-0.701771
50%,-0.084963,-0.049962,-0.019894,-0.082467
75%,0.590374,0.626066,0.628955,0.652012
max,3.325254,2.94603,3.496885,3.0999


In [63]:
# find values in one of the columns exceeding 3 in absolute value
col = data[1]
mask = np.abs(col) > 3
col[mask]

465   -3.102846
Name: 1, dtype: float64

In [67]:
# select all rows having a value exceeding 3 or –3
mask = (np.abs(data) > 3).any(axis=1)
data[mask]

Unnamed: 0,0,1,2,3
35,-0.654248,-0.976455,3.110484,-0.049688
339,1.12759,-0.339088,-3.131155,-1.036263
465,-0.639655,-3.102846,2.379902,-1.420041
511,-1.727943,-0.690014,-3.490998,-0.626469
637,0.265868,0.65848,3.001715,-0.788486
710,0.925459,-2.304843,-0.757487,3.0999
715,-3.211306,1.00226,-0.484381,0.63409
778,3.325254,-0.492994,-0.856974,0.651879
837,0.241545,0.154841,3.496885,0.956301
987,-0.151818,0.2798,-0.458721,-3.311623


In [69]:
# set outliers to 3 or -3 depending on its sign
mask = np.abs(data) > 3
data[mask] = np.sign(data) * 3

In [70]:
data[35:]

Unnamed: 0,0,1,2,3
35,-0.654248,-0.976455,3.000000,-0.049688
36,0.616501,0.272866,0.277089,1.902250
37,0.166546,0.189966,-0.639385,-0.411277
38,1.405848,-1.091851,1.860222,0.176928
39,-0.403228,-1.569283,0.194979,1.509561
...,...,...,...,...
995,-1.362326,0.304114,-0.047984,0.088367
996,0.860622,1.423257,-1.623748,0.452949
997,-0.644347,2.211247,-0.743832,1.436688
998,-0.998013,0.552793,-0.812495,-1.742612


### Computing Indicator/Dummy Variables
Another type of transformation for statistical modeling or machine learning applica‐
tions is converting a categorical variable into a “dummy” or “indicator” matrix.

column in a DataFrame has k distinct values, you would derive a matrix or Data‐
Frame with k columns containing all 1s and 0s.

pandas has a `get_dummies` function for doing this

In [71]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [94]:
# create dummy variables for column 'key'
pd.get_dummies(df["key"], prefix="grade")

Unnamed: 0,grade_a,grade_b,grade_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [75]:
# create dummy variables for column 'key'  then add the columns to a variable
dummies = pd.get_dummies(df["key"], prefix="key")
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [77]:
# join the dummies to the Data-Frame
df.join(dummies).drop(columns="key")

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [78]:
# read the dataset 'movies.dat' and display the first 10 rows of it

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_csv('movies.dat', sep='::', engine="python",
                       header=None, names=mnames)
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Adding dummy variables for each genre requires a little bit of wrangling.

In [81]:
# make a list 'genres' contains all distict genres
all_genres = []
for g in movies["genres"]:
    all_genres.extend(g.split("|"))
    
all_genres = set(all_genres)
all_genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [87]:
movies.shape

(3883, 3)

In [91]:
# create a DataFrame 'dummies' of size (#movies, #genres) and fill it with zeros
zero_matrix = np.zeros((len(movies), len(all_genres)), dtype=int)
dummies = pd.DataFrame(zero_matrix, columns=all_genres)
dummies

Unnamed: 0,Romance,Film-Noir,Western,Fantasy,Thriller,Comedy,Horror,Documentary,Mystery,War,Sci-Fi,Drama,Musical,Action,Crime,Animation,Children's,Adventure
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [99]:
# get the genres from the first row of 'movies' DataFrame, then get their indices in the 'dummies' DataFrame
# hint: use 'get_indexer' method
g = movies.loc[0, "genres"]
print(g.split("|"))
dummies.columns.get_indexer(g.split("|"))

['Animation', "Children's", 'Comedy']


array([15, 16,  5], dtype=int64)

In [100]:
# for each value of 'genre' column in 'movies' DataFrame, set the corresponding columns in dummies to 1 

for i, g in enumerate(movies["genres"]):
#     idxs = dummies.columns.get_indexer(g.split("|"))
#     dummies.iloc[i, idxs] = 1
    dummies.loc[i, g.split("|")] = 1
    
dummies

Unnamed: 0,Romance,Film-Noir,Western,Fantasy,Thriller,Comedy,Horror,Documentary,Mystery,War,Sci-Fi,Drama,Musical,Action,Crime,Animation,Children's,Adventure
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [103]:
# join the DataFrames 'movies' and 'dummies', then display the first row 
movies.join(dummies).drop(columns="genres")

Unnamed: 0,movie_id,title,Romance,Film-Noir,Western,Fantasy,Thriller,Comedy,Horror,Documentary,Mystery,War,Sci-Fi,Drama,Musical,Action,Crime,Animation,Children's,Adventure
0,1,Toy Story (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
1,2,Jumanji (1995),0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,3,Grumpier Old Men (1995),1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3880,3950,Tigerland (2000),0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3881,3951,Two Family House (2000),0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
