In [1]:
import numpy as np
import pandas as pd

# Filtering outliers, Sampling and Dummy Variables

## What is an outlier?

An outlier is an observation that lies an abnormal distance from the other observations in the data.

In [3]:
# Read the example table that will be checked for outliers.
data = pd.read_csv('data/ex2.csv')
# Preview the first ten rows.
data.head(n=10)

Unnamed: 0,Income,Rating,Cards,Age
0,19.225,122,3,38
1,43.54,232,4,69
2,152.298,828,4,41
3,55.367,448,1,33
4,11.741,182,44,59
5,15.56,352,4,57
6,59.53,543,3,52
7,20.191,431,4,42
8,48.498,456,3,47
9,30.733,249,4,51


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max);
# a max far above the 75% quartile is a quick hint that a column holds outliers.
data.describe()

Unnamed: 0,Income,Rating,Cards,Age
count,20.0,20.0,20.0,20.0
mean,39.6811,352.7,7.6,51.05
std,33.482958,179.936568,14.485565,12.94716
min,11.741,120.0,1.0,26.0
25%,17.60175,238.75,2.0,41.75
50%,28.4,309.0,3.0,49.0
75%,50.21525,450.0,4.0,59.0
max,152.298,828.0,55.0,74.0


From the summary above we can clearly see that some values are abnormal (for example, the maximum of `Cards` is 55 while its 75th percentile is only 4), possibly due to typing errors.


In [7]:
# Is there at least one row reporting more than 5 cards?
data['Cards'].gt(5).any()

True

In [8]:
# List the rows whose card count exceeds 5.
data.loc[data['Cards'].gt(5)]

Unnamed: 0,Income,Rating,Cards,Age
4,11.741,182,44,59
12,14.084,120,55,46


In [10]:
# Overwrite the two suspicious card counts in place (row labels 4 and 12).
# NOTE(review): replacing 44 with 4 and 55 with 5 presumes doubled-digit typos; confirm against the data source.
data.loc[4, 'Cards'] = 4
data.loc[12, 'Cards'] = 5

In [12]:
# Re-check the first ten rows after the correction.
data.head(n=10)

Unnamed: 0,Income,Rating,Cards,Age
0,19.225,122,3,38
1,43.54,232,4,69
2,152.298,828,4,41
3,55.367,448,1,33
4,11.741,182,4,59
5,15.56,352,4,57
6,59.53,543,3,52
7,20.191,431,4,42
8,48.498,456,3,47
9,30.733,249,4,51


# Sampling 

In [17]:
# Read the table used for the sampling example.
data2 = pd.read_csv('data/ex3.csv')
# (number of rows, number of columns)
data2.shape

(3000, 7)

In [20]:
# Draw a random 50% of the rows and renumber them from 0.
# frac is the fraction of rows to return (frac=1 returns every row, shuffled);
# pass n=100 instead of frac to extract exactly 100 rows.
# random_state pins the draw so re-running the notebook yields the same sample.
data2.sample(frac=0.5, random_state=42).reset_index(drop=True)

Unnamed: 0,year,age,sex,maritl,race,education,wage
0,2005,33,1. Male,1. Never Married,2. Black,2. HS Grad,73.775743
1,2004,46,1. Male,2. Married,1. White,4. College Grad,185.283317
2,2003,36,1. Male,2. Married,1. White,2. HS Grad,115.375039
3,2003,38,1. Male,2. Married,1. White,2. HS Grad,118.712094
4,2009,43,1. Male,2. Married,3. Asian,4. College Grad,156.503511
...,...,...,...,...,...,...,...
1495,2006,43,1. Male,2. Married,1. White,1. < HS Grad,95.230713
1496,2005,30,1. Male,2. Married,2. Black,3. Some College,104.921507
1497,2004,36,1. Male,2. Married,1. White,2. HS Grad,76.893603
1498,2009,28,1. Male,2. Married,3. Asian,3. Some College,68.748088


# Dummy variables 

Categorical variables need to be converted into dummy variables before they can be used in statistical modeling or machine learning models.

In [21]:
# Read the example table whose categorical columns are turned into dummy variables below.
data3 = pd.read_csv('data/ex4.csv')
# Display the loaded frame.
data3

Unnamed: 0,year,age,sex,marital,race,education,wage
0,2006,18,Male,Never Married,White,< HS Grad,75.043154
1,2004,24,Male,Never Married,White,College Grad,70.47602
2,2003,45,Male,Married,Black,Some College,130.982177
3,2003,43,Female,Married,Asian,College Grad,154.685293
4,2005,50,Male,Divorced,White,HS Grad,75.043154
5,2008,54,Male,Married,White,College Grad,127.115744
6,2009,44,Female,Married,White,Some College,169.528538
7,2008,30,Male,Never Married,Asian,Some College,111.720849
8,2006,41,Female,Never Married,Black,Some College,118.884359
9,2004,52,Male,Married,White,HS Grad,128.680488


In [23]:
# Build one indicator column per distinct value of 'marital'.
# dtype=int keeps the indicators as 0/1 integers; since pandas 2.0 the default
# dtype is bool, which would produce True/False instead.
pd.get_dummies(data3['marital'], dtype=int)

Unnamed: 0,Divorced,Married,Never Married
0,0,0,1
1,0,0,1
2,0,1,0
3,0,1,0
4,1,0,0
5,0,1,0
6,0,1,0
7,0,0,1
8,0,0,1
9,0,1,0


In [24]:
# Build one indicator column per distinct value of 'sex'.
# dtype=int keeps the indicators as 0/1 integers; since pandas 2.0 the default
# dtype is bool, which would produce True/False instead.
gender_dummies = pd.get_dummies(data3['sex'], dtype=int)
gender_dummies

Unnamed: 0,Male,Female
0,1,0
1,1,0
2,1,0
3,0,1
4,1,0
5,1,0
6,0,1
7,1,0
8,0,1
9,1,0


In [25]:
# Attach the gender indicator columns to the original frame, matching rows by index label.
data3_with_dummies = data3.join(other=gender_dummies, how='left')

In [26]:
# Display the frame together with the joined indicator columns.
data3_with_dummies

Unnamed: 0,year,age,sex,marital,race,education,wage,Male,Female
0,2006,18,Male,Never Married,White,< HS Grad,75.043154,1,0
1,2004,24,Male,Never Married,White,College Grad,70.47602,1,0
2,2003,45,Male,Married,Black,Some College,130.982177,1,0
3,2003,43,Female,Married,Asian,College Grad,154.685293,0,1
4,2005,50,Male,Divorced,White,HS Grad,75.043154,1,0
5,2008,54,Male,Married,White,College Grad,127.115744,1,0
6,2009,44,Female,Married,White,Some College,169.528538,0,1
7,2008,30,Male,Never Married,Asian,Some College,111.720849,1,0
8,2006,41,Female,Never Married,Black,Some College,118.884359,0,1
9,2004,52,Male,Married,White,HS Grad,128.680488,1,0
