# Data Cleaning and Preparation

In [106]:
import numpy as np
import pandas as pd


## Data Transformation (continue)
So far in this lesson we’ve been concerned with rearranging data. Filtering, cleaning,
and other transformations are another class of important operations.

### Detecting and Filtering Outliers
Filtering or transforming outliers is largely a matter of applying array operations.
Consider a DataFrame with some normally distributed data

In [107]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.005787,0.007485,0.029569,-0.03576
std,0.994635,1.007886,0.994092,0.969305
min,-3.329087,-2.977935,-3.201587,-2.958919
25%,-0.63685,-0.691312,-0.603495,-0.750547
50%,0.024673,-0.019937,0.048526,-0.040203
75%,0.647885,0.681272,0.67679,0.629381
max,3.812454,3.381737,3.467533,3.323671


In [108]:
# find values in one of the columns exceeding 3 in absolute value
col = data[1]
mask = np.abs(col) > 3
col[mask]

516    3.381737
705    3.339754
948    3.018030
Name: 1, dtype: float64

In [109]:
# select all rows having a value exceeding 3 or –3
mask = (np.abs(data) > 3).any(axis=1)
data[mask]

Unnamed: 0,0,1,2,3
106,3.812454,0.254525,-1.915925,0.164994
423,-3.329087,2.021475,0.621323,1.166465
516,1.421685,3.381737,1.389674,1.06947
531,-0.076855,-0.45181,-3.175672,1.489702
705,0.094036,3.339754,0.145814,-2.549573
816,-0.1813,-0.02565,1.100975,3.323671
907,-1.59049,1.599232,-3.201587,-1.286732
926,0.992098,0.639833,3.467533,-1.055083
948,0.372585,3.01803,2.208563,-1.577385


In [110]:
# set outliers to 3 or -3 depending on its sign
mask = np.abs(data) > 3
data[mask] = np.sign(data) * 3

In [111]:
data[35:]

Unnamed: 0,0,1,2,3
35,-0.476068,0.443990,-1.270652,0.946079
36,-0.447172,0.554122,0.611562,-0.945448
37,1.311491,1.226877,-0.196739,-1.376741
38,1.419193,-0.571134,-0.717735,-0.171443
39,1.102910,0.347405,0.890847,-0.761172
...,...,...,...,...
995,0.452047,0.157987,-1.390616,-0.861191
996,-1.810967,-1.485777,0.407328,0.293082
997,-0.627459,-2.415937,-0.412913,0.401551
998,-0.989420,0.601556,0.712667,-0.747706


### Computing Indicator/Dummy Variables
Another type of transformation for statistical modeling or machine learning applica‐
tions is converting a categorical variable into a “dummy” or “indicator” matrix.

column in a DataFrame has k distinct values, you would derive a matrix or Data‐
Frame with k columns containing all 1s and 0s.

pandas has a `get_dummies` function for doing this

In [112]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [113]:
# create dummy variables for column 'key'
pd.get_dummies(df["key"], prefix="key")

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [114]:
# create dummy variables for column 'key'  then add the columns to a variable
dummies = pd.get_dummies(df["key"], prefix="key")
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [120]:
# join the dummies to the Data-Frame
df.join(dummies).drop(columns="key")

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [78]:
# read the dataset 'movies.dat' and display the first 10 rows of it

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_csv('movies.dat', sep='::', engine="python",
                       header=None, names=mnames)
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Adding dummy variables for each genre requires a little bit of wrangling.

In [81]:
# make a list 'genres' contains all distict genres
all_genres = []
for g in movies["genres"]:
    all_genres.extend(g.split("|"))
    
all_genres = set(all_genres)
all_genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [87]:
movies.shape

(3883, 3)

In [91]:
# create a DataFrame 'dummies' of size (#movies, #genres) and fill it with zeros
zero_matrix = np.zeros((len(movies), len(all_genres)), dtype=int)
dummies = pd.DataFrame(zero_matrix, columns=all_genres)
dummies

Unnamed: 0,Romance,Film-Noir,Western,Fantasy,Thriller,Comedy,Horror,Documentary,Mystery,War,Sci-Fi,Drama,Musical,Action,Crime,Animation,Children's,Adventure
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [99]:
# get the genres from the first row of 'movies' DataFrame, then get their indices in the 'dummies' DataFrame
# hint: use 'get_indexer' method
g = movies.loc[0, "genres"]
print(g.split("|"))
dummies.columns.get_indexer(g.split("|"))

['Animation', "Children's", 'Comedy']


array([15, 16,  5], dtype=int64)

In [100]:
# for each value of 'genre' column in 'movies' DataFrame, set the corresponding columns in dummies to 1 

for i, g in enumerate(movies["genres"]):
#     idxs = dummies.columns.get_indexer(g.split("|"))
#     dummies.iloc[i, idxs] = 1
    dummies.loc[i, g.split("|")] = 1
    
dummies

Unnamed: 0,Romance,Film-Noir,Western,Fantasy,Thriller,Comedy,Horror,Documentary,Mystery,War,Sci-Fi,Drama,Musical,Action,Crime,Animation,Children's,Adventure
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [103]:
# join the DataFrames 'movies' and 'dummies', then display the first row 
movies.join(dummies).drop(columns="genres")

Unnamed: 0,movie_id,title,Romance,Film-Noir,Western,Fantasy,Thriller,Comedy,Horror,Documentary,Mystery,War,Sci-Fi,Drama,Musical,Action,Crime,Animation,Children's,Adventure
0,1,Toy Story (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
1,2,Jumanji (1995),0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,3,Grumpier Old Men (1995),1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3880,3950,Tigerland (2000),0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3881,3951,Two Family House (2000),0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
