In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.Series([1, -999, 2, -999, -1000, 3])

data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [4]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [5]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [6]:
data.replace({-999:np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [7]:
data = pd.DataFrame(np.arange(12).reshape(3,4), 
                    index = ['Ohio', 'Colorado', 'New York'],
                    columns = ['one', 'two', 'three', 'four'])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [8]:
transform = lambda x: x[:4].upper()

data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [9]:
data.rename(index = str.title, columns = str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [13]:
data.rename(index = {'Ohio':'INDIANA'}, 
            columns = {'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [16]:
data.rename(index = data.index.upper())

AttributeError: 'function' object has no attribute 'upper'

In [17]:
data.rename(index = {'Ohio':'INDIANA'}, inplace = True)

In [25]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]

cats = pd.cut(ages, bins)

print(cats)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]


In [21]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [22]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [23]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [40]:
cats = pd.cut(ages, [18, 26, 36, 61, 100], right = False)

cats

len(cats)

12

In [41]:
pd.value_counts(cats)

[18, 26)     5
[36, 61)     3
[26, 36)     3
[61, 100)    1
dtype: int64

In [43]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

pd.cut(ages, bins, labels = group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [48]:
data = np.random.rand(20)

print(pd.cut(data, 4, precision = 2, right = False))

pd.value_counts(pd.cut(data, 4, precision = 2, right = False))

[[0.75, 0.98), [0.068, 0.3), [0.3, 0.52), [0.52, 0.75), [0.068, 0.3), ..., [0.52, 0.75), [0.068, 0.3), [0.75, 0.98), [0.068, 0.3), [0.75, 0.98)]
Length: 20
Categories (4, interval[float64]): [[0.068, 0.3) < [0.3, 0.52) < [0.52, 0.75) < [0.75, 0.98)]


[0.068, 0.3)    8
[0.75, 0.98)    6
[0.52, 0.75)    3
[0.3, 0.52)     3
dtype: int64

In [50]:
print(pd.qcut(data, 4, precision = 2))

pd.value_counts(pd.qcut(data, 4, precision = 2))

[(0.8, 0.98], (0.058, 0.19], (0.19, 0.42], (0.42, 0.8], (0.19, 0.42], ..., (0.42, 0.8], (0.19, 0.42], (0.8, 0.98], (0.058, 0.19], (0.8, 0.98]]
Length: 20
Categories (4, interval[float64]): [(0.058, 0.19] < (0.19, 0.42] < (0.42, 0.8] < (0.8, 0.98]]


(0.8, 0.98]      5
(0.42, 0.8]      5
(0.19, 0.42]     5
(0.058, 0.19]    5
dtype: int64

In [51]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

[(0.962, 0.981], (0.06709999999999999, 0.127], (0.127, 0.424], (0.424, 0.962], (0.127, 0.424], ..., (0.424, 0.962], (0.127, 0.424], (0.962, 0.981], (0.06709999999999999, 0.127], (0.424, 0.962]]
Length: 20
Categories (4, interval[float64]): [(0.06709999999999999, 0.127] < (0.127, 0.424] < (0.424, 0.962] < (0.962, 0.981]]

In [52]:
pd.value_counts(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1]))

(0.424, 0.962]                  8
(0.127, 0.424]                  8
(0.962, 0.981]                  2
(0.06709999999999999, 0.127]    2
dtype: int64

In [54]:
data = pd.DataFrame(np.random.randn(1000,4))

pd.options.display.max_rows = 10

data

Unnamed: 0,0,1,2,3
0,-0.256662,0.337390,0.442275,0.270187
1,1.057606,-1.196689,-0.174729,-0.946476
2,0.189252,2.194760,-0.500692,-0.449465
3,-0.726542,-1.510722,-1.062532,0.393359
4,0.207947,-0.937495,-0.072535,-0.846478
...,...,...,...,...
995,-0.519990,0.465311,1.623181,-0.486936
996,1.348218,-0.148070,0.448561,-0.109219
997,-1.027034,-1.624593,-0.837714,0.179623
998,-1.150136,0.045138,0.432366,0.493314


In [55]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.023337,-0.00303,0.032815,0.049732
std,1.014206,1.024381,0.981454,1.007293
min,-2.872888,-3.189139,-2.822807,-2.562009
25%,-0.693715,-0.733459,-0.613053,-0.619697
50%,-0.036581,0.000252,-0.019933,0.062063
75%,0.701739,0.728485,0.680098,0.764533
max,3.526841,3.075371,2.755668,2.999162


In [58]:
col = data[2]

col[np.abs(col) > 2]

16     2.405296
21    -2.048047
34     2.373251
61     2.061599
70     2.555744
         ...   
929    2.184742
938    2.064771
948   -2.652709
956    2.419746
957   -2.259039
Name: 2, Length: 45, dtype: float64

In [63]:
data[(np.abs(data) > 2).any(1)]

Unnamed: 0,0,1,2,3
2,0.189252,2.194760,-0.500692,-0.449465
9,-0.194435,0.484686,-0.678546,-2.562009
12,-1.783872,-0.703909,0.573184,2.277367
16,-2.021463,-1.574613,2.405296,-0.865287
17,-0.271315,-2.108812,-0.076377,-1.597330
...,...,...,...,...
985,0.692186,2.154079,0.180447,-0.631302
986,-0.110552,-1.119129,1.535497,2.012819
987,2.436544,1.746191,-1.868375,0.832306
989,0.854059,-2.083086,1.477231,0.308626


In [64]:
data[(np.abs(data) > 2)]

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,2.19476,,
3,,,,
4,,,,
...,...,...,...,...
995,,,,
996,,,,
997,,,,
998,,,,


In [68]:
data[np.abs(data) > 2] = np.sign(data)*3

print(np.sign(data))

data.head()

       0    1    2    3
0   -1.0  1.0  1.0  1.0
1    1.0 -1.0 -1.0 -1.0
2    1.0  1.0 -1.0 -1.0
3   -1.0 -1.0 -1.0  1.0
4    1.0 -1.0 -1.0 -1.0
..   ...  ...  ...  ...
995 -1.0  1.0  1.0 -1.0
996  1.0 -1.0  1.0 -1.0
997 -1.0 -1.0 -1.0  1.0
998 -1.0  1.0  1.0  1.0
999  1.0 -1.0  1.0  1.0

[1000 rows x 4 columns]


Unnamed: 0,0,1,2,3
0,-0.256662,0.33739,0.442275,0.270187
1,1.057606,-1.196689,-0.174729,-0.946476
2,0.189252,3.0,-0.500692,-0.449465
3,-0.726542,-1.510722,-1.062532,0.393359
4,0.207947,-0.937495,-0.072535,-0.846478


In [69]:
df = pd.DataFrame(np.arange(5*4).reshape(5,4))

df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [70]:
sampler = np.random.permutation(5)

sampler

array([1, 2, 3, 0, 4])

In [71]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
0,0,1,2,3
4,16,17,18,19


In [84]:
df[[],[sampler]]

TypeError: unhashable type: 'list'

In [86]:
df.sample(n = 4)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11
3,12,13,14,15


In [92]:
choices = pd.Series([5, 7, -1, 6, 4])

draws = choices.sample(n = 10, replace = True)

draws

2   -1
1    7
1    7
4    4
1    7
3    6
2   -1
1    7
4    4
2   -1
dtype: int64

In [96]:
df = pd.DataFrame({'key': list('bbacabdf'), 'data1': range(8)})

df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5
6,d,6
7,f,7


In [97]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c,d,f
0,0,1,0,0,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,0,1,0,0
4,1,0,0,0,0
5,0,1,0,0,0
6,0,0,0,1,0
7,0,0,0,0,1


In [101]:
dummies = pd.get_dummies(df['key'], prefix = 'key')

df_with_dummy = df[['data1']].join(dummies)

df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c,key_d,key_f
0,0,0,1,0,0,0
1,1,0,1,0,0,0
2,2,1,0,0,0,0
3,3,0,0,1,0,0
4,4,1,0,0,0,0
5,5,0,1,0,0,0
6,6,0,0,0,1,0
7,7,0,0,0,0,1


In [103]:
mnames = ['movie_id', 'title', 'genres']

movies = pd.read_table('datasets/movielens/movies.dat', sep = '::',
                      header = None, names = mnames)

movies[:10]

  after removing the cwd from sys.path.


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [104]:
all_genres = []

for x in movies.genres:
    all_genres.extend(x.split('|'))
    
genres = pd.unique(all_genres)

genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [106]:
zero_matrix = np.zeros((len(movies), len(genres)))

dummies = pd.DataFrame(zero_matrix, columns = genres)

gen = movies.genres[0]

gen.split('|')

dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [107]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1
    
movies_windic = movies.join(dummies.add_prefix('Genre_'))

movies_windic.iloc[0]

movie_id                                      1
title                          Toy Story (1995)
genres              Animation|Children's|Comedy
Genre_Animation                               1
Genre_Children's                              1
                               ...             
Genre_War                                     0
Genre_Musical                                 0
Genre_Mystery                                 0
Genre_Film-Noir                               0
Genre_Western                                 0
Name: 0, Length: 21, dtype: object

In [111]:
np.random.seed(12345)

values = np.random.rand(10)

values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [112]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [120]:
val = 'a , b, guido'

val.split(',')

['a ', ' b', ' guido']

In [121]:
pieces = [x.strip() for x in val.split(',')]

pieces

['a', 'b', 'guido']

In [123]:
first, second, third = pieces

first + '::' + second + '::' + third

'a::b::guido'

In [124]:
'::'.join(pieces)

'a::b::guido'

In [125]:
'guido' in val

True

In [136]:
val.index('o')

11

In [138]:
val.find(',')

2

In [134]:
val.count(',')

2

In [139]:
val.replace(',', '::')

'a :: b:: guido'

In [140]:
val.replace(',', '')

'a  b guido'

In [141]:
import re

text = 'foo  bar\t baz  \tqux'

re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [142]:
regex = re.compile('\s+')

regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [143]:
regex.findall(text)

['  ', '\t ', '  \t']

In [145]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

regex = re.compile(pattern, flags=re.IGNORECASE)

regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [146]:
m = regex.search(text)

m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [147]:
text[m.start():m.end()]

'dave@google.com'

In [149]:
print(regex.match(text))

None


In [150]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [152]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

regex = re.compile(pattern, flags = re.IGNORECASE)

m = regex.match('wesm@bright.net')

print(m)

m.groups()

<re.Match object; span=(0, 15), match='wesm@bright.net'>


('wesm', 'bright', 'net')

In [153]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [155]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [156]:
regex.finditer(text)

<callable_iterator at 0x2503e375cf8>

In [159]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 
        'Rob': 'rob@gmail.com', 'Wes': np.nan}

data = pd.Series(data)

data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [160]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [161]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [162]:
data.str.findall(pattern, flags = re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [164]:
matches = data.str.match(pattern, flags = re.IGNORECASE)

matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [165]:
matches.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [166]:
matches.str[0]

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [168]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object