# Data prepare

## Missing data

In [57]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

In [58]:
data=Series(['hello', 'world', np.nan, 'one'])
data

0    hello
1    world
2      NaN
3      one
dtype: object

In [59]:
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [60]:
data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [61]:
data.dropna()

0    hello
1    world
3      one
dtype: object

In [62]:
data.fillna(1)

0    hello
1    world
2        1
3      one
dtype: object

In [63]:
data[data.notnull()]

0    hello
1    world
3      one
dtype: object

In [64]:
NA=np.nan
data=DataFrame([[1,2,3],[NA,4,5],[6,NA,7],[NA,NA,NA]])
data.dropna()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [65]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,4.0,5.0
2,6.0,,7.0


In [66]:
data.dropna(axis=1)

0
1
2
3


In [67]:
data.dropna(axis=1,thresh=3)

Unnamed: 0,2
0,3.0
1,5.0
2,7.0
3,


In [68]:
data.fillna(1)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,1.0,4.0,5.0
2,6.0,1.0,7.0
3,1.0,1.0,1.0


In [69]:
data.fillna({0:0,1:100,2:200})

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,0.0,4.0,5.0
2,6.0,100.0,7.0
3,0.0,100.0,200.0


In [70]:
data.fillna(method='ffill')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,1.0,4.0,5.0
2,6.0,4.0,7.0
3,6.0,4.0,7.0


In [71]:
data.fillna(data.mean())

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,3.5,4.0,5.0
2,6.0,3.0,7.0
3,3.5,3.0,5.0


## Data tranform

In [72]:
data=DataFrame([(1,1),(2,1),(1,1),(2,1),(3,4)])
data

Unnamed: 0,0,1
0,1,1
1,2,1
2,1,1
3,2,1
4,3,4


In [73]:
data.duplicated()

0    False
1    False
2     True
3     True
4    False
dtype: bool

In [74]:
data.drop_duplicates()

Unnamed: 0,0,1
0,1,1
1,2,1
4,3,4


In [75]:
data.drop_duplicates(0)

Unnamed: 0,0,1
0,1,1
1,2,1
4,3,4


In [76]:
data.drop_duplicates(1,keep='last')

Unnamed: 0,0,1
3,2,1
4,3,4


In [77]:
data=DataFrame([('WA',1), ('NY',2), ('CA',3)],columns=['state', 'pop'])
data

Unnamed: 0,state,pop
0,WA,1
1,NY,2
2,CA,3


In [78]:
location_map={'wa':'west','ca':'west','ny':'east'}
data['location']=data['state'].str.lower().map(location_map)
data

Unnamed: 0,state,pop,location
0,WA,1,west
1,NY,2,east
2,CA,3,west


In [79]:
data['state'].map(lambda x: location_map[x.lower()])

0    west
1    east
2    west
Name: state, dtype: object

In [80]:
data['state'].replace('WA', 'OR')

0    OR
1    NY
2    CA
Name: state, dtype: object

In [81]:
data['location'].replace(['west','east'], ['PDT', 'EST'])

0    PDT
1    EST
2    PDT
Name: location, dtype: object

In [82]:
data['location'].replace({'west':'PDT','east':'EST'})

0    PDT
1    EST
2    PDT
Name: location, dtype: object

In [83]:
data.index=pd.Index(data['state'])
data

Unnamed: 0_level_0,state,pop,location
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
WA,WA,1,west
NY,NY,2,east
CA,CA,3,west


In [84]:
data.index=data.index.map({'WA':'Washington', 'NY':'New York', 'CA':'California'})

In [85]:
data

Unnamed: 0_level_0,state,pop,location
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Washington,WA,1,west
New York,NY,2,east
California,CA,3,west


In [86]:
renamed=data.rename(index=str.title, columns=str.upper)
renamed

Unnamed: 0_level_0,STATE,POP,LOCATION
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Washington,WA,1,west
New York,NY,2,east
California,CA,3,west


In [87]:
data

Unnamed: 0_level_0,state,pop,location
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Washington,WA,1,west
New York,NY,2,east
California,CA,3,west


In [88]:
data.rename(index={'Washington':'Oregon'},inplace=True)
data

Unnamed: 0_level_0,state,pop,location
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Oregon,WA,1,west
New York,NY,2,east
California,CA,3,west


### Discretization

In [104]:
ages=[20,22,25,27,21,23,37,31,61,45,41,32]
bins=[18,25,35,60,100]
cats=pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [105]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [106]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [107]:
cats=pd.cut(ages, bins,right=False)
cats

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [108]:
pd.value_counts(cats)

[25, 35)     4
[18, 25)     4
[35, 60)     3
[60, 100)    1
dtype: int64

In [114]:
group_names=['Youth','YoungAdult','MiddleAged','Senior']
cats=pd.cut(ages,bins, labels=group_names)
cats

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [117]:
cats=pd.cut(ages,4,precision=0)
cats

[(20.0, 30.0], (20.0, 30.0], (20.0, 30.0], (20.0, 30.0], (20.0, 30.0], ..., (30.0, 40.0], (51.0, 61.0], (40.0, 51.0], (40.0, 51.0], (30.0, 40.0]]
Length: 12
Categories (4, interval[float64]): [(20.0, 30.0] < (30.0, 40.0] < (40.0, 51.0] < (51.0, 61.0]]

In [119]:
cats=pd.qcut(ages,4)
cats

[(19.999, 22.75], (19.999, 22.75], (22.75, 29.0], (22.75, 29.0], (19.999, 22.75], ..., (29.0, 38.0], (38.0, 61.0], (38.0, 61.0], (38.0, 61.0], (29.0, 38.0]]
Length: 12
Categories (4, interval[float64]): [(19.999, 22.75] < (22.75, 29.0] < (29.0, 38.0] < (38.0, 61.0]]

In [120]:
pd.value_counts(cats)

(38.0, 61.0]       3
(29.0, 38.0]       3
(22.75, 29.0]      3
(19.999, 22.75]    3
dtype: int64

### Abnormal

In [122]:
data=DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.070575,-0.019612,-0.011265,-0.006834
std,1.014041,1.002976,1.003821,0.945079
min,-2.884718,-4.436232,-3.57541,-3.199685
25%,-0.743857,-0.684355,-0.682644,-0.619981
50%,-0.073444,0.015026,-0.056601,-0.021707
75%,0.618868,0.63505,0.661869,0.596276
max,3.264015,3.231113,3.082247,3.079183


In [124]:
col=data[2]
col[np.abs(col)>=3]

61    -3.575410
801    3.082247
Name: 2, dtype: float64

In [126]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
61,-0.38739,-1.621329,-3.57541,-1.594834
159,-0.917518,0.798389,1.296392,3.079183
185,0.478653,-4.436232,-0.382177,-0.701577
208,3.264015,-1.652423,1.087089,-0.157943
801,-1.512864,-0.827172,3.082247,-1.269723
811,-0.087424,3.231113,0.62168,-0.20071
865,-0.380312,1.03095,0.632555,3.044828
925,-0.948077,-3.347111,-0.926291,-0.499344
960,0.7656,0.343667,-0.251132,-3.199685


In [129]:
data[np.abs(data)>3]=np.sign(data)*3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.070839,-0.01806,-0.010772,-0.006759
std,1.013206,0.995832,1.001689,0.944025
min,-2.884718,-3.0,-3.0,-3.0
25%,-0.743857,-0.684355,-0.682644,-0.619981
50%,-0.073444,0.015026,-0.056601,-0.021707
75%,0.618868,0.63505,0.661869,0.596276
max,3.0,3.0,3.0,3.0


In [131]:
data[np.abs(data)>=3].shape

(1000, 4)

In [137]:
ser=Series(np.arange(5))
ser

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [139]:
ser[ser>3].shape

(1,)

In [141]:
data=DataFrame(np.random.randint(20,size=(10,4)))
data

Unnamed: 0,0,1,2,3
0,15,17,5,2
1,0,1,9,16
2,12,4,7,11
3,10,1,3,2
4,4,3,3,13
5,14,9,2,5
6,0,5,19,6
7,0,11,14,15
8,6,16,0,2
9,13,7,16,9


In [144]:
data[data>2]=2*data[data>2]
data

Unnamed: 0,0,1,2,3
0,60,68,20,2
1,0,1,36,64
2,48,16,28,44
3,40,1,12,2
4,16,12,12,52
5,56,36,2,20
6,0,20,76,24
7,0,44,56,60
8,24,64,0,2
9,52,28,64,36


### Permutation and random sample

In [145]:
data=DataFrame(np.random.randint(20,size=(5,4)))
data

Unnamed: 0,0,1,2,3
0,15,10,3,12
1,17,2,3,14
2,15,11,10,14
3,2,4,4,7
4,13,13,19,9


In [146]:
perm=np.random.permutation(5)
perm

array([2, 3, 4, 1, 0])

In [147]:
data.take(perm)

Unnamed: 0,0,1,2,3
2,15,11,10,14
3,2,4,4,7
4,13,13,19,9
1,17,2,3,14
0,15,10,3,12


In [148]:
data.sample(3)

Unnamed: 0,0,1,2,3
3,2,4,4,7
1,17,2,3,14
4,13,13,19,9


In [150]:
data.sample(10,replace=True)

Unnamed: 0,0,1,2,3
2,15,11,10,14
4,13,13,19,9
3,2,4,4,7
2,15,11,10,14
2,15,11,10,14
3,2,4,4,7
1,17,2,3,14
2,15,11,10,14
4,13,13,19,9
2,15,11,10,14


In [155]:
pd.get_dummies(data[0],prefix='key')

Unnamed: 0,key_2,key_13,key_15,key_17
0,0,0,1,0
1,0,0,0,1
2,0,0,1,0
3,1,0,0,0
4,0,1,0,0


In [157]:
!head ../datasets/movielens/movies.dat

1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller
7::Sabrina (1995)::Comedy|Romance
8::Tom and Huck (1995)::Adventure|Children's
9::Sudden Death (1995)::Action
10::GoldenEye (1995)::Action|Adventure|Thriller


In [156]:
mvnames=['movie_id', 'title','genres']
movies=pd.read_csv('../datasets/movielens/movies.dat', sep='::',header=None,names=mvnames)
movies

  


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [164]:
import itertools
genres_list=list(movies.genres.str.split('|'))
genres=pd.unique(list(itertools.chain(*genres_list)))
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [171]:
zero_matrix=np.zeros((len(movies), len(genres)), dtype=int)
genres_dict={x:i for i,x in enumerate(genres)}
for i, gs in enumerate(genres_list):
    for g in gs:
        zero_matrix[i][genres_dict[g]]=1
dummy=DataFrame(zero_matrix, columns=["Genres_"+x for x in genres])
dummy

Unnamed: 0,Genres_Animation,Genres_Children's,Genres_Comedy,Genres_Adventure,Genres_Fantasy,Genres_Romance,Genres_Drama,Genres_Action,Genres_Crime,Genres_Thriller,Genres_Horror,Genres_Sci-Fi,Genres_Documentary,Genres_War,Genres_Musical,Genres_Mystery,Genres_Film-Noir,Genres_Western
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
