In [1]:
import pandas as pd

from numpy import nan as NA

data = pd.DataFrame([[1., 6.5, NA], [1., NA, NA], 
                     [NA, NA, NA], [NA, 6.5, NA]], 
                    columns = list('abc'))
display(data)

Unnamed: 0,a,b,c
0,1.0,6.5,
1,1.0,,
2,,,
3,,6.5,


In [3]:
cleaned = data.dropna()
print(cleaned)


Empty DataFrame
Columns: [a, b, c]
Index: []


In [5]:
data.dropna(axis=1)

0
1
2
3


In [6]:
data.dropna(axis=1, how='all')

Unnamed: 0,a,b
0,1.0,6.5
1,1.0,
2,,
3,,6.5


# Filling Data

In [18]:
import numpy as np
from numpy import nan as NA
df = pd.DataFrame(np.random.randn(7, 3))
print(df)

          0         1         2
0 -0.646187  1.177474 -0.343325
1  0.821714  0.386069  0.275360
2 -1.663727  0.138698  0.909204
3  0.599185 -0.917986  1.271980
4 -1.946061 -2.069333 -0.166796
5 -0.676695 -0.992281 -0.067949
6  0.500518  0.937242 -1.762021


In [19]:
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
display(df)

Unnamed: 0,0,1,2
0,-0.646187,,
1,0.821714,,
2,-1.663727,,0.909204
3,0.599185,,1.27198
4,-1.946061,-2.069333,-0.166796
5,-0.676695,-0.992281,-0.067949
6,0.500518,0.937242,-1.762021


In [22]:
df = df.fillna(method='ffill', limit=1, axis=1)
display(df)

Unnamed: 0,0,1,2
0,-0.646187,-0.646187,
1,0.821714,0.821714,
2,-1.663727,-1.663727,0.909204
3,0.599185,0.599185,1.27198
4,-1.946061,-2.069333,-0.166796
5,-0.676695,-0.992281,-0.067949
6,0.500518,0.937242,-1.762021


# Removing Duplicates

In [29]:
import pandas as pd

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]} )
display(data)
#data.duplicated()
#data = data.drop_duplicates()
#display(data)
data['v1'] = ['one', 'two', 'one', 'four', 'one', 'six', 'two']
#display(data)
display(data[ ['k1','v1'] ])
data = data.drop_duplicates(['k1', 'v1'])

display(data)

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


Unnamed: 0,k1,v1
0,one,one
1,two,two
2,one,one
3,two,four
4,one,one
5,two,six
6,two,two


Unnamed: 0,k1,k2,v1
0,one,1,one
1,two,1,two
3,two,3,four
5,two,4,six


# Replacing Values

In [36]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(7, 3))

df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
display(df)
df = df.replace(np.nan , -999)
display(df)
#df = df.replace(-999 , -9)
#display(df)
df =df.replace([-999, -9], [101, 0])
display(df)

Unnamed: 0,0,1,2
0,-1.943298,,
1,-0.614719,,
2,-0.877414,,-0.216403
3,0.812793,,0.136835
4,0.162372,1.085142,-1.278771
5,0.069276,-0.13074,0.382198
6,0.338554,-1.649288,-1.547176


Unnamed: 0,0,1,2
0,-1.943298,-999.0,-999.0
1,-0.614719,-999.0,-999.0
2,-0.877414,-999.0,-0.216403
3,0.812793,-999.0,0.136835
4,0.162372,1.085142,-1.278771
5,0.069276,-0.13074,0.382198
6,0.338554,-1.649288,-1.547176


Unnamed: 0,0,1,2
0,-1.943298,101.0,101.0
1,-0.614719,101.0,101.0
2,-0.877414,101.0,-0.216403
3,0.812793,101.0,0.136835
4,0.162372,1.085142,-1.278771
5,0.069276,-0.13074,0.382198
6,0.338554,-1.649288,-1.547176


# Renaming Axis Indexes using function mapping

In [43]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
        index=['Ohio', 'Colorado', 'New York'],
        columns=['one', 'two', 'three', 'four'])

display(data)
transform = lambda x: x[:3].upper()
data.index = data.index.map(transform)
display(data)

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Unnamed: 0,one,two,three,four
OHI,0,1,2,3
COL,4,5,6,7
NEW,8,9,10,11


# Detecting and Filtering Outliers

In [53]:
data = pd.DataFrame(np.random.randn(1000, 4))
display(data)
data.describe()
col = data[2]
print(col)
#col[np.abs(col) > 3]
#To select all rows having a value exceeding 3 or –3, 
#you can use the 'any' method on a boolean DataFrame:
#any method will check all cells 
outliers = data[(np.abs(data) > 3).any(1)]
print(len(outliers), len(data))
outliers
# check the difference
#print(data)
#data[(np.abs(data) > 3)]

Unnamed: 0,0,1,2,3
0,0.540726,-1.341072,-0.720748,1.731744
1,0.904662,-0.086353,-0.294544,0.701663
2,-0.045003,0.563637,-0.008360,-0.739138
3,0.265005,0.617927,1.476384,-0.271732
4,-0.028255,1.418587,0.317469,-0.522935
...,...,...,...,...
995,0.311424,1.937235,-1.056037,0.632671
996,-1.809305,-0.361002,0.044754,-0.102342
997,0.120669,0.850097,-0.975795,-0.927276
998,-0.115175,0.883381,-0.498329,-0.212158


0     -0.720748
1     -0.294544
2     -0.008360
3      1.476384
4      0.317469
         ...   
995   -1.056037
996    0.044754
997   -0.975795
998   -0.498329
999   -1.230924
Name: 2, Length: 1000, dtype: float64
16 1000


Unnamed: 0,0,1,2,3
31,-0.298674,-0.134115,-0.037332,3.402531
90,3.070199,0.499609,-0.129521,0.240245
151,-2.592894,1.072021,-3.926056,0.859316
174,-0.177514,0.053868,-2.556677,3.037351
251,-0.51897,2.370565,3.990804,-1.802572
378,-0.597187,-0.415367,0.032426,3.263834
400,-0.125711,-0.233912,-3.078153,0.149787
415,0.499638,-0.531077,0.488613,-3.071032
457,0.590821,0.19981,-2.146797,3.942426
565,-0.093483,-0.241118,-3.592735,-0.446043


# Permutation and Random Sampling

In [57]:

# Permuting (randomly reordering)
# a Series or the rows in a DataFrame
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(4)
print(sampler)
#df = df[sampler]
display(df)
df.take(sampler)

[1 3 0 2]


Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
0,0,1,2,3
2,8,9,10,11


# Regular Expressions

In [63]:
import re
text = "foo    bar\t baz \tqux"
#normal split = text.split()
#re.split('\s+', text) # spliting based on whitespaces
#text.split(" ")
#['foo', 'bar', 'baz', 'qux']
# compile once to use again and again and save time
rgx = re.compile('\s+')
rgx.split(text)

['foo', 'bar', 'baz', 'qux']