# Data Cleaning and Preparation

In [14]:
import numpy as np
import pandas as pd


## Data Transformation (continue)
So far in this lesson we’ve been concerned with rearranging data. Filtering, cleaning,
and other transformations are another class of important operations.

### Detecting and Filtering Outliers
Filtering or transforming outliers is largely a matter of applying array operations.
Consider a DataFrame with some normally distributed data

In [28]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.011566,-0.051987,-0.030358,0.012831
std,1.010799,1.038626,0.994568,0.997572
min,-3.432242,-3.428033,-3.226173,-2.894898
25%,-0.705157,-0.724741,-0.669653,-0.697291
50%,0.059123,-0.01375,-0.034152,0.019323
75%,0.693937,0.663196,0.615788,0.703141
max,3.406387,3.475082,3.057017,3.290054


In [1]:
# find values in one of the columns exceeding 3 in absolute value


In [2]:
# select all rows having a value exceeding 3 or –3


In [3]:
# set outliers to 0


In [4]:
# set outliers to 3 or -3 depending on its sign


### Computing Indicator/Dummy Variables
Another type of transformation for statistical modeling or machine learning applica‐
tions is converting a categorical variable into a “dummy” or “indicator” matrix.

column in a DataFrame has k distinct values, you would derive a matrix or Data‐
Frame with k columns containing all 1s and 0s.

pandas has a `get_dummies` function for doing this

In [32]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [5]:
# create dummy variables for column 'key'


In [6]:
# create dummy variables for column 'key'  then add the columns to a variable


In [7]:
# join the dummies to the Data-Frame


In [19]:
# read the dataset 'movies.dat' and display the first 10 rows of it
labels = ["id", "title", "genre"]
movies = pd.read_csv("movies.dat", sep="::", names=labels, engine="python", encoding="ISO-8859-1")
movies

Unnamed: 0,id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


Adding dummy variables for each genre requires a little bit of wrangling.

In [21]:
# make a list 'genres' contains all distinct genres
genres = []
for g in movies["genre"]:
    genres.extend(g.split("|"))

genres = set(genres)
genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [22]:
# print the dataset dimentions
movies.shape

(3883, 3)

In [27]:
# create a DataFrame 'dummies' of size (#movies, #genres) and fill it with zeros
dummies = np.zeros((len(movies), len(genres)), dtype=int)
dummies = pd.DataFrame(dummies, columns=genres)
dummies

Unnamed: 0,Documentary,Film-Noir,Western,Animation,Drama,Horror,Adventure,Thriller,Mystery,Fantasy,Romance,Sci-Fi,Action,Comedy,Crime,Children's,War,Musical
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# for each value of 'genre' column in 'movies' DataFrame, set the corresponding columns in dummies to 1 

for i, g in enumerate(movies["genre"]):
    cols = g.split("|")
    dummies.loc[i, cols] = 1

dummies

Unnamed: 0,Documentary,Film-Noir,Western,Animation,Drama,Horror,Adventure,Thriller,Mystery,Fantasy,Romance,Sci-Fi,Action,Comedy,Crime,Children's,War,Musical
0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3879,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
# join the DataFrames 'movies' and 'dummies', then display the first row 

movies.join(dummies).drop(columns="genre")

Unnamed: 0,id,title,Documentary,Film-Noir,Western,Animation,Drama,Horror,Adventure,Thriller,Mystery,Fantasy,Romance,Sci-Fi,Action,Comedy,Crime,Children's,War,Musical
0,1,Toy Story (1995),0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,2,Jumanji (1995),0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3879,3949,Requiem for a Dream (2000),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
