In [1]:
import numpy as np

In [2]:
import pandas as pd

# Detecting and Filtering Outliers

In [3]:
# Build a 1000 x 4 frame of standard-normal draws to use as outlier-hunting material.
# No seed is set in the visible cells, so the exact values differ from run to run.
data = pd.DataFrame(data=np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,-0.875470,-1.280625,-0.491819,0.630661
1,-0.222170,2.474733,-1.510974,0.478630
2,-0.125663,-0.083776,-1.095799,1.714650
3,0.604469,-0.301644,0.918557,1.970179
4,-0.844798,-1.584552,0.063084,0.301470
...,...,...,...,...
995,0.098586,0.808387,0.147675,0.183739
996,0.889623,0.961768,-0.612271,-0.943964
997,-0.043645,0.999553,1.417123,0.982042
998,-0.029378,-1.655781,1.070369,0.681964


In [4]:
# Per-column summary (count, mean, std, min, quartiles, max) of the raw draw,
# taken before any values are capped.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.014012,-0.045106,0.063563,0.006243
std,1.027043,1.004026,1.008323,0.955563
min,-3.463681,-3.567734,-2.966605,-3.253091
25%,-0.694376,-0.739658,-0.61185,-0.661106
50%,-0.009138,-0.043151,0.011758,0.023742
75%,0.712791,0.620546,0.731759,0.639425
max,3.637631,3.417628,3.236005,2.83173


In [5]:
# Select every row containing at least one value whose magnitude exceeds 3.
# `axis` is passed by keyword: the positional form `.any(1)` is rejected by
# pandas 2.x, where the reduction's arguments are keyword-only.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
342,3.010303,-0.834238,0.728393,-0.369303
344,-0.492931,1.035737,3.098652,-0.352678
370,-0.398143,-1.087128,3.236005,-1.819872
401,-3.463681,-1.834035,0.583098,1.328439
499,-3.339823,-0.168363,-0.266958,-1.204883
520,-3.034195,-1.231849,0.456325,0.414652
572,-0.167902,-3.567734,-0.170758,-0.308749
771,-1.153798,3.417628,0.028289,0.636794
789,3.637631,1.441997,0.12783,-0.271207
821,-0.312409,0.438622,-0.321543,-3.253091


In [6]:
# Same-shaped frame holding 3 times the sign of each entry of data
# (so +3.0 or -3.0 for non-zero values) — the replacement values used when capping.
np.sign(data) * 3

Unnamed: 0,0,1,2,3
0,-3.0,-3.0,-3.0,3.0
1,-3.0,3.0,-3.0,3.0
2,-3.0,-3.0,-3.0,3.0
3,3.0,-3.0,3.0,3.0
4,-3.0,-3.0,3.0,3.0
...,...,...,...,...
995,3.0,3.0,3.0,3.0
996,3.0,3.0,-3.0,-3.0
997,-3.0,3.0,3.0,3.0
998,-3.0,-3.0,3.0,3.0


In [7]:
# Cap outliers in place: any entry whose magnitude exceeds 3 is overwritten with
# 3 carrying that entry's own sign, so every value ends up within [-3, 3].
data[data.abs() > 3] = 3 * np.sign(data)

In [8]:
# Re-run the summary after capping: min and max should now lie within [-3, 3].
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.013542,-0.044956,0.063228,0.006496
std,1.021472,1.000833,1.007315,0.954732
min,-3.0,-3.0,-2.966605,-3.0
25%,-0.694376,-0.739658,-0.61185,-0.661106
50%,-0.009138,-0.043151,0.011758,0.023742
75%,0.712791,0.620546,0.731759,0.639425
max,3.0,3.0,3.0,2.83173


# Permutation and Random Sampling

In [9]:
# Small 5 x 4 frame filled row-wise with 0..19; used below to show row reordering.
df = pd.DataFrame(np.arange(20).reshape(5, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [39]:
# Random ordering of the row positions 0 .. len(df)-1. The length comes from the
# frame itself (instead of a literal 5) so the permutation stays valid if the
# number of rows in df changes.
sampler = np.random.permutation(len(df))
sampler

array([4, 1, 2, 0, 3])

In [40]:
# Reorder the rows of df by position, following the order stored in sampler.
df.take(sampler, axis=0)

Unnamed: 0,0,1,2,3
4,16,17,18,19
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15


In [41]:
# Positional take with a hand-written order: rows 1-4 first, row 0 moved to the end.
df.take([1, 2, 3, 4, 0], axis=0)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3


In [44]:
# Shuffle all rows in one step. The permutation is sized from len(df) rather than
# a literal 5, so it always covers exactly the rows that exist.
df.take(np.random.permutation(len(df)))

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11


In [46]:
# Draw three distinct rows at random (sampling without replacement).
df.sample(n=3, replace=False)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3


# Computing Indicator/Dummy Variables

In [49]:
# Toy frame: a categorical "key" column plus a running integer column.
df = pd.DataFrame(
    {
        "key": ["a", "b", "c", "c", "a", "b"],
        "Data1": range(6),
    }
)

In [50]:
# Show the key/Data1 frame before encoding the key column.
df

Unnamed: 0,key,Data1
0,a,0
1,b,1
2,c,2
3,c,3
4,a,4
5,b,5


In [51]:
# One indicator column per distinct value of df['key'].
# dtype=int pins the indicators to 0/1 integers; without it pandas 2.x
# returns boolean (True/False) columns instead.
pd.get_dummies(df['key'], dtype=int)

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,0,0,1
4,1,0,0
5,0,1,0


In [52]:
# Indicator columns for df['key'], named key_<value> via the prefix.
# dtype=int keeps them as 0/1 integers on every pandas version (pandas 2.x
# would otherwise produce booleans), so the joined frame stays numeric.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)

In [53]:
# Show the prefixed indicator columns (key_a, key_b, key_c).
dummies

Unnamed: 0,key_a,key_b,key_c
0,1,0,0
1,0,1,0
2,0,0,1
3,0,0,1
4,1,0,0
5,0,1,0


In [55]:
# Keep only the Data1 column (as a one-column frame) and attach the indicator
# columns to it, aligning on the row index.
df_with_dummy = df[["Data1"]].join(dummies, how="left")

In [56]:
# Show the combined frame: Data1 followed by the key_* indicator columns.
df_with_dummy

Unnamed: 0,Data1,key_a,key_b,key_c
0,0,1,0,0
1,1,0,1,0
2,2,0,0,1
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


# String Manipulation

In [57]:
# Comma-separated sample string; note the space before the last field.
val = "a,b, guido"

In [58]:
# Echo the sample string.
val

'a,b, guido'

In [59]:
# Break the string at each comma; surrounding whitespace is kept in the fields.
val.split(sep=',')

['a', 'b', ' guido']

In [60]:
# Split on commas, then trim leading/trailing whitespace from every field.
pieces = list(map(str.strip, val.split(',')))

In [61]:
# Show the trimmed fields.
pieces

['a', 'b', 'guido']

In [62]:
# Unpack the fields into separate names; this requires pieces to hold exactly three items.
first, second, third = pieces

In [63]:
# Manual concatenation with a '::' delimiter between the three fields.
first + '::' + second + '::' + third

'a::b::guido'

In [64]:
# Same '::'-delimited result via str.join, without naming each field individually.
'::'.join(pieces)

'a::b::guido'

In [65]:
# Number of commas in the string.
val.count(',')

2

In [69]:
# Delete every comma by replacing it with the empty string.
val.replace(',', '')

'ab guido'

In [70]:
# Replacing the empty string inserts the substitute at every position:
# before the first character, between each pair of characters, and after the last.
val.replace('', ',')

',a,,,b,,, ,g,u,i,d,o,'

In [71]:
# TODO: add a val.find(...) example here (this cell currently holds only commented-out code)


# Regular Expressions

In [72]:
import re

In [73]:
text = "foo  bar\t baz\tqux"

In [74]:
# Echo the string; the repr shows the tab characters as \t.
text

'foo  bar\t baz\tqux'

In [75]:
# Split on runs of whitespace. The pattern is a raw string so that \s reaches the
# regex engine verbatim instead of being parsed as an (invalid) string escape,
# which newer Python versions warn about.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [76]:
# Split on every single whitespace character; adjacent whitespace characters
# therefore produce empty strings in the result. The raw string keeps \s intact
# for the regex engine rather than relying on an invalid string escape.
re.split(r'\s', text)

['foo', '', 'bar', '', 'baz', 'qux']

In [77]:
# Pre-compile the whitespace-run pattern for reuse. Written as a raw string:
# \s inside a plain literal is an invalid escape that newer Python versions warn about.
regex = re.compile(r'\s+')

In [78]:
# Echo the compiled pattern object.
regex

re.compile(r'\s+', re.UNICODE)

In [79]:
# Four "name address" lines, each terminated by a newline; input for the
# address-matching examples.
text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)

In [80]:
# Echo the multi-line string; the repr shows the line breaks as \n.
text

'Dave dave@google.com\nSteve steve@gmail.com\nRob rob@gmail.com\nRyan ryan@yahoo.com\n'

In [81]:
# Address-like pattern: local part, '@', domain, a literal dot, then 2-4 letters.
# The character classes list only A-Z, so it relies on a case-insensitive flag at compile time.
# NOTE(review): with the {2,4} bound, a final label longer than four letters is only
# matched up to its fourth letter — confirm that is acceptable for the intended inputs.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [82]:
# Compile the address pattern once. IGNORECASE lets the upper-case-only
# character classes in the pattern match lower-case text as well.
regex = re.compile(pattern, re.IGNORECASE)

In [83]:
# Echo the compiled pattern; the repr lists the active flags.
regex

re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.IGNORECASE|re.UNICODE)

In [84]:
# All non-overlapping matches of the pattern in text, returned as a list of strings.
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [85]:
# NOTE(review): identical to the call in the preceding cell — likely a leftover
# that can be dropped.
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [86]:
# search scans text and yields a match object for the first occurrence only
# (None when nothing matches).
m = regex.search(text)

In [87]:
# Inspect the match object: its repr shows the matched span and text.
m

<re.Match object; span=(5, 20), match='dave@google.com'>