# Data Cleaning and Preparation

In [14]:
import numpy as np
import pandas as pd


## Data Transformation (continue)
So far in this lesson we’ve been concerned with rearranging data. Filtering, cleaning,
and other transformations are another class of important operations.

### Detecting and Filtering Outliers
Filtering or transforming outliers is largely a matter of applying array operations.
Consider a DataFrame with some normally distributed data

In [28]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.011566,-0.051987,-0.030358,0.012831
std,1.010799,1.038626,0.994568,0.997572
min,-3.432242,-3.428033,-3.226173,-2.894898
25%,-0.705157,-0.724741,-0.669653,-0.697291
50%,0.059123,-0.01375,-0.034152,0.019323
75%,0.693937,0.663196,0.615788,0.703141
max,3.406387,3.475082,3.057017,3.290054


In [29]:
# find values in one of the columns exceeding 3 in absolute value
col = data[0]
mask = np.abs(col) > 3
col[mask]

13    -3.432242
102   -3.079539
388    3.406387
511   -3.265425
Name: 0, dtype: float64

In [30]:
# select all rows having a value exceeding 3 or –3
mask = (np.abs(data) > 3).any(axis=1)
data[mask]
# mask

Unnamed: 0,0,1,2,3
13,-3.432242,0.882086,1.734271,-0.248742
69,0.800872,-0.276525,-0.104884,3.290054
102,-3.079539,-0.472218,-0.497735,-1.65147
125,1.864485,-3.368123,-0.3768,-1.015632
228,1.346163,3.475082,1.355138,0.402717
388,3.406387,-2.285412,-0.1826,-1.174666
490,-0.362143,-1.381242,-3.226173,0.202126
511,-3.265425,0.097716,-1.544682,0.307411
641,-0.997897,0.317621,-0.425694,3.0013
648,-0.326658,-0.859325,3.057017,-0.304346


In [27]:
# set outliers to 0
mask = np.abs(data) > 3
data[mask] = 0
data[131:]

Unnamed: 0,0,1,2,3
131,0.567064,0.195255,0.299537,0.000000
132,1.459778,-0.586301,-0.138232,0.125145
133,-1.219894,-0.793502,0.343124,-0.106515
134,-1.160640,-0.071491,0.638425,-0.342781
135,-0.635406,-0.688472,0.614642,-0.899394
...,...,...,...,...
995,0.387767,0.648770,-0.327041,0.462798
996,0.280081,-1.230045,-1.301408,0.497259
997,-0.406944,0.656039,1.129938,0.260132
998,-0.994624,-1.022478,1.897194,-0.919584


In [31]:
# set outliers to 3 or -3 depending on its sign
mask = np.abs(data) > 3
data[mask] = np.sign(data) * 3
data[13:]

Unnamed: 0,0,1,2,3
13,-3.000000,0.882086,1.734271,-0.248742
14,-1.221174,0.118319,1.877130,-0.218274
15,-1.459491,2.817924,0.347845,0.974651
16,-0.187360,-0.627718,-0.555475,0.056207
17,1.039158,-0.268319,-0.255859,1.058700
...,...,...,...,...
995,-0.920921,-0.175951,0.621926,0.664901
996,-0.171125,-0.947342,-0.470228,2.063274
997,0.447199,1.460497,1.079294,1.641144
998,-0.763114,0.953122,1.788797,0.419696


### Computing Indicator/Dummy Variables
Another type of transformation for statistical modeling or machine learning applica‐
tions is converting a categorical variable into a “dummy” or “indicator” matrix.

column in a DataFrame has k distinct values, you would derive a matrix or Data‐
Frame with k columns containing all 1s and 0s.

pandas has a `get_dummies` function for doing this

In [32]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [35]:
# create dummy variables for column 'key'
pd.get_dummies(df["key"], prefix="grade")

Unnamed: 0,grade_a,grade_b,grade_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [36]:
# create dummy variables for column 'key'  then add the columns to a variable
dummies = pd.get_dummies(df["key"], prefix="grade")
dummies

Unnamed: 0,grade_a,grade_b,grade_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [38]:
# join the dummies to the Data-Frame
df.join(dummies).drop(columns="key")

Unnamed: 0,data1,grade_a,grade_b,grade_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [41]:
# read the dataset 'movies.dat' and display the first 10 rows of it
cnames = ["id", "title", "genre"]
movies = pd.read_csv("movies.dat", sep="::", names=cnames, engine="python", encoding="ISO-8859-1")
movies

Unnamed: 0,id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


Adding dummy variables for each genre requires a little bit of wrangling.

In [42]:
# make a list 'genres' contains all distinct genres
genres = []
for g in movies["genre"]:
    genres.extend(g.split("|"))

genres = set(genres)
genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [43]:
# print the dataset dimentions
movies.shape

(3883, 3)

In [46]:
# create a DataFrame 'dummies' of size (#movies, #genres) and fill it with zeros
dummies = np.zeros((len(movies), len(genres)), dtype=int)
dummies = pd.DataFrame(dummies, columns=genres)
dummies

Unnamed: 0,War,Thriller,Documentary,Film-Noir,Children's,Action,Adventure,Western,Comedy,Romance,Musical,Fantasy,Horror,Crime,Animation,Mystery,Sci-Fi,Drama
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# get the genres from the first row of 'movies' DataFrame, then get their indices in the 'dummies' DataFrame
# hint: use 'get_indexer' method


In [47]:
# for each value of 'genre' column in 'movies' DataFrame, set the corresponding columns in dummies to 1 

for i, g in enumerate(movies["genre"]):
    cols = g.split("|")
    dummies.loc[i, cols] = 1

dummies

Unnamed: 0,War,Thriller,Documentary,Film-Noir,Children's,Action,Adventure,Western,Comedy,Romance,Musical,Fantasy,Horror,Crime,Animation,Mystery,Sci-Fi,Drama
0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0
1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3881,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [50]:
# join the DataFrames 'movies' and 'dummies', then display the first row 

movies.join(dummies.add_prefix("genre_")).drop(columns="genre")

Unnamed: 0,id,title,genre_War,genre_Thriller,genre_Documentary,genre_Film-Noir,genre_Children's,genre_Action,genre_Adventure,genre_Western,genre_Comedy,genre_Romance,genre_Musical,genre_Fantasy,genre_Horror,genre_Crime,genre_Animation,genre_Mystery,genre_Sci-Fi,genre_Drama
0,1,Toy Story (1995),0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0
1,2,Jumanji (1995),0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
4,5,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3880,3950,Tigerland (2000),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3881,3951,Two Family House (2000),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
