# Detecting and filtering outliers

In [1]:
import pandas as pd
import numpy as np

data = pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.006385,0.003612,0.024197,0.043428
std,0.971689,0.987155,0.997445,1.001741
min,-3.340681,-3.380841,-2.930793,-3.299162
25%,-0.62809,-0.654366,-0.674583,-0.615567
50%,0.006415,0.018969,0.011412,-0.032481
75%,0.67566,0.70572,0.72552,0.735531
max,3.247771,3.355811,3.172105,3.07136


In [3]:
col = data[2]
col[np.abs(col) > 3]

24    -3.164320
285   -3.540731
Name: 2, dtype: float64

In [4]:
data[(np.abs(data) > 3).any(1)] # select all rows having a value exceeding 3 or -3

Unnamed: 0,0,1,2,3
10,-0.578646,-1.341482,0.160332,3.113198
24,-0.318139,0.657396,-3.16432,-0.623102
39,-1.270571,2.399821,-0.669597,-3.250863
99,-0.093234,-0.4247,-0.793356,3.388988
185,-2.361329,2.140363,0.315351,-3.10327
285,-1.441134,0.525334,-3.540731,0.190594
329,-1.323743,-0.197753,-0.05578,3.182104
423,0.094933,3.354647,0.977513,-0.403914
454,0.264835,-1.126876,0.891317,3.455876
506,2.063574,1.480998,0.202908,-3.11952


In [6]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.039944,0.043565,-0.027802,-0.008371
std,0.972227,0.968147,0.991294,1.047709
min,-3.0,-2.521595,-3.0,-3.0
25%,-0.70211,-0.636073,-0.70892,-0.682134
50%,-0.032719,0.041197,0.003392,0.013288
75%,0.64122,0.705895,0.630551,0.690692
max,3.0,3.0,2.633441,3.0


# Permutation and random sampling

In [7]:
df = pd.DataFrame(np.arange(5*4).reshape((5,4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [8]:
sampler = np.random.permutation(5)  # used in iloc-based indexing or "take" function
sampler

array([1, 4, 2, 0, 3])

In [9]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15


In [10]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
0,0,1,2,3


In [11]:
choices = pd.Series([5,7,-1,6,4])
draws = choices.sample(n=10, replace=True) # allow repeat samples
draws

0    5
2   -1
4    4
4    4
3    6
2   -1
3    6
1    7
3    6
0    5
dtype: int64

# Indicator or Dummy variables

In [3]:
# convert a categorical variable into a dummy or indicator matrix
df = pd.DataFrame({'key':['b','b','a','c','a','b'],
                   'data1': range(6)})
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [4]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [5]:
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [7]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [8]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


# String manipulation

In [9]:
val = 'a,b,  guido'
val.split(',')

['a', 'b', '  guido']

In [11]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [12]:
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [13]:
'::'.join(pieces)

'a::b::guido'

In [14]:
'guido' in val 

True

In [15]:
# locating substring
val.index(',')

1

In [16]:
val.find(',')

1

In [17]:
val.count(',')

2

In [18]:
val.replace(',','::')

'a::b::  guido'

In [19]:
val.replace(',','') # by passing an empty string

'ab  guido'

In [20]:
# regular expression

In [21]:
import re
text = "foo    bar\t baz   \tqux"
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [22]:
regex = re.compile('\s+') # use re.compile to create a regex object, here '\s+' is a regex object
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [23]:
regex.findall(text)

['    ', '\t ', '   \t']

In [24]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}]'

regex = re.compile(pattern, flags=re.IGNORECASE)

In [26]:
regex.findall(text)

[]

In [27]:
m = regex.search(text)

In [28]:
print(regex.sub('Redacted', text))

Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com



In [2]:
# Vectorized string functions
data = {'Dave': 'dave@google.com','Steve':'steve@gmail.com', 'Rob':'rob@gmail.com','Wes':np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

In [3]:
data.str.contains('gmail')

Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object