In [1]:
import numpy as np
import pandas as pd

### In pandas, missing or invalid values are represented as NaN

In [2]:
# A five-element float Series whose second and last entries are missing.
obj = pd.Series(data=[1.0, np.nan, 3.0, 7.0, np.nan])
obj

0    1.0
1    NaN
2    3.0
3    7.0
4    NaN
dtype: float64

### using dropna to discard NaN values

In [3]:
# Return the Series without its NaN entries; the surviving values keep their
# original index labels (0, 2 and 3).
obj.dropna()

0    1.0
2    3.0
3    7.0
dtype: float64

In [4]:
# Equivalent to dropna(): keep only the positions where the value is present,
# using a boolean mask (notna is the same check as notnull).
obj.loc[obj.notna()]

0    1.0
2    3.0
3    7.0
dtype: float64

In [5]:
# 3x3 frame with gaps: column 1 is missing in the first two rows and
# column 2 is missing in the last row.
data = pd.DataFrame(data=[[1, np.nan, 3], [4, np.nan, 6], [7, 8, np.nan]])
data

Unnamed: 0,0,1,2
0,1,,3.0
1,4,,6.0
2,7,8.0,


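### dropna() with its default arguments drops every row that contains at least one NaN (columns are kept); how='all' drops only rows that are entirely NaN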

In [6]:
# Defaults spelled out: work row-wise and drop a row as soon as it holds any
# NaN. Every row of data has one, so the result is empty.
cleaned = data.dropna(axis=0, how='any')
cleaned

Unnamed: 0,0,1,2


In [7]:
# Drop only the rows in which every value is NaN; no row of data qualifies,
# so all three rows are returned.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1,,3.0
1,4,,6.0
2,7,8.0,


In [8]:
# Drop the rows that contain at least one NaN (same as the default call).
data.dropna(axis=0, how='any')

Unnamed: 0,0,1,2


In [9]:
# Redisplay the frame: the dropna calls above returned their results
# separately, and data still holds all three rows.
data

Unnamed: 0,0,1,2
0,1,,3.0
1,4,,6.0
2,7,8.0,


### filling the missing data values with a given value

In [10]:
# Return a copy of the frame in which every NaN is replaced by 0.
data.fillna(value=0)

Unnamed: 0,0,1,2
0,1,0.0,3.0
1,4,0.0,6.0
2,7,8.0,0.0


### dropping duplicates

In [11]:
# Two integer key columns; rows 1 and 3 repeat the row directly above them.
data = pd.DataFrame(data=dict(
    k1=[1, 1, 2, 2, 3, 3, 4, 5, 6],
    k2=[1, 1, 2, 2, 2, 3, 4, 5, 9],
))
data

Unnamed: 0,k1,k2
0,1,1
1,1,1
2,2,2
3,2,2
4,3,2
5,3,3
6,4,4
7,5,5
8,6,9


In [12]:
# Remove fully repeated rows, keeping the first occurrence of each
# (the default policy, spelled out).
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,1,1
2,2,2
4,3,2
5,3,3
6,4,4
7,5,5
8,6,9


In [13]:
# Boolean mask: True for a row that repeats an earlier row, False for the
# first occurrence (the default policy, spelled out).
data.duplicated(keep='first')

0    False
1     True
2    False
3     True
4    False
5    False
6    False
7    False
8    False
dtype: bool

### adding extra column using map functions

In [14]:
# Food names (with inconsistent capitalisation) and their weight in ounces.
data = pd.DataFrame(data=dict(
    food=['bacon', 'pork', 'bacon', 'postrami', 'beef', 'Bacon', 'Pork', 'ham', 'lox'],
    ounces=[4, 3, 12, 6, 8, 9, 12, 10, 3],
))

In [15]:
# Show the food/ounces frame built in the previous cell.
data

Unnamed: 0,food,ounces
0,bacon,4
1,pork,3
2,bacon,12
3,postrami,6
4,beef,8
5,Bacon,9
6,Pork,12
7,ham,10
8,lox,3


In [16]:
# Lookup table from a (lowercase) food name to the animal it comes from.
meat_to_animal = dict(
    bacon='pig',
    pork='pig',
    postrami='cow',
    beef='cow',
    ham='pig',
    lox='salmon',
)

### some food names are capitalized, so let's lowercase them before mapping

In [17]:
# Lowercase every food name with the vectorised string accessor so the names
# match the keys of meat_to_animal.
lowercased = data.loc[:, 'food'].str.lower()
lowercased

0       bacon
1        pork
2       bacon
3    postrami
4        beef
5       bacon
6        pork
7         ham
8         lox
Name: food, dtype: object

In [18]:
# Look each lowercased food name up in meat_to_animal and store the result as
# a new 'animal' column; this modifies data in place.
data['animal'] = lowercased.map(meat_to_animal)

In [19]:
# Show the frame, which now carries the 'animal' column.
data

Unnamed: 0,food,ounces,animal
0,bacon,4,pig
1,pork,3,pig
2,bacon,12,pig
3,postrami,6,cow
4,beef,8,cow
5,Bacon,9,pig
6,Pork,12,pig
7,ham,10,pig
8,lox,3,salmon


In [20]:
# Same mapping done in a single step: a function lowercases each food name and
# looks it up in meat_to_animal. Indexing the dict raises KeyError for a food
# that is not listed.
# The original computed this Series and threw it away; storing it in the
# 'animal' column makes the frame displayed below the result of this step.
data['animal'] = data['food'].map(lambda food: meat_to_animal[food.lower()])
data

Unnamed: 0,food,ounces,animal
0,bacon,4,pig
1,pork,3,pig
2,bacon,12,pig
3,postrami,6,cow
4,beef,8,cow
5,Bacon,9,pig
6,Pork,12,pig
7,ham,10,pig
8,lox,3,salmon


In [21]:
# Rebuild the small frame with missing values for the replace() examples.
data = pd.DataFrame(data=[[1, np.nan, 3], [4, np.nan, 6], [7, 8, np.nan]])
data

Unnamed: 0,0,1,2
0,1,,3.0
1,4,,6.0
2,7,8.0,


In [22]:
# Substitute the sentinel value -999 for every NaN; replace() returns a new
# frame, which is kept as data1.
data1 = data.replace(to_replace=np.nan, value=-999)
data1

Unnamed: 0,0,1,2
0,1,-999.0,3.0
1,4,-999.0,6.0
2,7,8.0,-999.0


In [23]:
# Two integer columns (0-4 and 5-9) plus a column of single letters.
df = pd.DataFrame({
    'A': list(range(5)),
    'B': list(range(5, 10)),
    'C': ['a', 'b', 'c', 'd', 'e'],
})
df

Unnamed: 0,A,B,C
0,0,5,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,9,e


In [24]:
# Replace every occurrence of the value 5 with NaN; only column B contains a
# 5, and it becomes a float column as a result.
df.replace(to_replace=5, value=np.nan)

Unnamed: 0,A,B,C
0,0,,a
1,1,6.0,b
2,2,7.0,c
3,3,8.0,d
4,4,9.0,e


### Discretization and Binning

In [25]:
# Sample ages to be bucketed, listed in ascending order.
ages = [
    19, 21, 23, 27, 32, 33, 37, 41, 45,
    49, 53, 55, 56, 60, 61, 67, 72,
]

### dividing the ages into groups
#### 18-25, 25-40, 40-55, above 55 (each bin excludes its lower edge and includes its upper edge; the last bin is capped at 100)

In [29]:
# Bin edges: each pair of consecutive edges delimits one of the four age groups.
bins = [18,25,40,55,100]

### using the pd.cut function to assign each age to its bin (category)

In [30]:
# Assign every age to the interval it falls in; right=True (the default,
# spelled out) makes each interval include its upper edge.
cats = pd.cut(x=ages, bins=bins, right=True)
cats

[(18, 25], (18, 25], (18, 25], (25, 40], (25, 40], ..., (55, 100], (55, 100], (55, 100], (55, 100], (55, 100]]
Length: 17
Categories (4, interval[int64]): [(18, 25] < (25, 40] < (40, 55] < (55, 100]]

### a value that falls outside all of the bins would be assigned NaN; here every age lies inside a bin, so each code below is a valid bin index

In [31]:
# Integer code of the bin each age landed in (0 is the first bin, 3 the last).
cats.codes

array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3], dtype=int8)

In [32]:
# The intervals that the integer codes refer to, in bin order.
cats.categories

IntervalIndex([(18, 25], (25, 40], (40, 55], (55, 100]],
              closed='right',
              dtype='interval[int64]')

In [33]:
# Count how many ages fall in each bin, most frequent first.
# The top-level pd.value_counts helper is deprecated in recent pandas, so the
# categorical is wrapped in a Series and its value_counts method is used.
pd.Series(cats).value_counts()

(55, 100]    5
(40, 55]     5
(25, 40]     4
(18, 25]     3
dtype: int64

### labelling the bins

In [34]:
# One label per bin, in the same order as the bins (four bins, four names).
group_names = ['youth','adults','middle', 'old']

In [35]:
# Bin the ages again, this time naming each bin with the matching entry of
# group_names instead of the interval itself.
cats = pd.cut(x=ages, bins=bins, labels=group_names)

In [36]:
# Display the relabelled categorical: group names appear in place of intervals.
cats

[youth, youth, youth, adults, adults, ..., old, old, old, old, old]
Length: 17
Categories (4, object): [youth < adults < middle < old]

In [37]:
# The categories are now the supplied group names, in bin order.
cats.categories

Index(['youth', 'adults', 'middle', 'old'], dtype='object')

In [38]:
# Count how many ages fall in each named group, most frequent first.
# The top-level pd.value_counts helper is deprecated in recent pandas, so the
# categorical is wrapped in a Series and its value_counts method is used.
pd.Series(cats).value_counts()

old       5
middle    5
adults    4
youth     3
dtype: int64

### detecting and Filtering outliers

In [39]:
# Fix the global NumPy seed so the random samples, and therefore every
# statistic and outlier shown in this section, are the same on each run.
np.random.seed(42)
# 1000 rows x 4 columns of draws from the standard normal distribution.
data = pd.DataFrame(np.random.randn(1000, 4))
data.head(5)

Unnamed: 0,0,1,2,3
0,-0.384245,-0.948083,0.516016,0.089577
1,-0.623542,1.423081,-0.360603,1.338572
2,-0.548144,-0.088589,-0.571189,-2.658643
3,-0.090173,0.310851,-0.849239,-0.980353
4,-0.343878,0.791299,0.703435,0.716323


In [44]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.008931,-0.026766,0.016375,-0.003116
std,0.999851,0.929339,0.964589,0.981787
min,-3.206952,-3.175383,-2.966544,-3.149767
25%,-0.651386,-0.621738,-0.639937,-0.671634
50%,-0.021013,-0.043229,-0.024004,0.04304
75%,0.660847,0.599421,0.66811,0.659689
max,3.726139,3.177917,3.089117,3.251997


In [45]:
# Pull out the column labelled 2 as a Series and peek at its first three values.
col = data.loc[:, 2]
col.iloc[:3]

0    0.516016
1   -0.360603
2   -0.571189
Name: 2, dtype: float64

In [46]:
# Entries of the column whose magnitude exceeds 3.
col[col.abs() > 3]

144    3.024047
582    3.089117
Name: 2, dtype: float64

In [48]:
# Select every row that has at least one value whose magnitude exceeds 3.
# axis is passed by keyword: pandas 2.x accepts the arguments of
# DataFrame.any only as keywords, so the positional form `.any(1)` fails there.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
43,-0.455915,-0.350171,-0.827039,3.251997
144,1.057441,0.196142,3.024047,-0.435525
152,0.847853,3.177917,-0.689783,-0.33724
376,-1.056297,3.099287,-0.158388,0.339075
450,3.28542,-0.339228,0.979537,0.441354
582,-0.931077,0.016665,3.089117,1.208326
765,-3.206952,0.739048,-0.922865,0.183672
840,-0.747201,-3.175383,-1.528354,0.992084
846,0.068416,-3.166538,0.202542,-0.298178
860,3.055043,0.808731,-0.251677,1.636067


### np.sign returns -1, 0 or 1 for each value, depending on whether it is negative, zero or positive

In [49]:
# Sign of every value in the frame; show only the first five rows.
np.sign(data).iloc[:5]

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,1.0,1.0
1,-1.0,1.0,-1.0,1.0
2,-1.0,-1.0,-1.0,-1.0
3,-1.0,1.0,-1.0,-1.0
4,-1.0,1.0,1.0,1.0


### permutation and random sampling

In [50]:
# The integers 0..19 laid out row by row in a 5x4 frame.
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


### np.random.permutation(n) returns the integers 0..n-1 in random order; given an array, it returns a copy shuffled along its first axis

In [51]:
# Fix the global NumPy seed so the shuffled order is the same on every run.
np.random.seed(42)
# A random ordering of the row positions 0..4, used further down to reorder df.
sampler = np.random.permutation(5)
sampler

array([1, 0, 2, 4, 3])

In [52]:
# Shuffle the rows of df's underlying 2-D array; the result is a plain NumPy
# array, so the frame's row labels are not carried along.
sampler1 = np.random.permutation(df.values)
sampler1

array([[16, 17, 18, 19],
       [12, 13, 14, 15],
       [ 8,  9, 10, 11],
       [ 4,  5,  6,  7],
       [ 0,  1,  2,  3]])

In [53]:
# Reorder the rows of df by position, following the order given in sampler.
df.take(sampler, axis=0)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
3,12,13,14,15


### Indicator / Dummy variables

In [58]:
# A single categorical-like column taking the values 0, 1 and 2.
df = pd.DataFrame(data=dict(key=[0, 1, 2, 0, 2, 1, 0]))
df

Unnamed: 0,key
0,0
1,1
2,2
3,0
4,2
5,1
6,0


### converting categorical values into dummy (indicator) variables

In [59]:
# One indicator column per distinct value of 'key'; each row is marked in the
# column that matches its key.
pd.get_dummies(data=df['key'])

Unnamed: 0,0,1,2
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,0,1
5,0,1,0
6,1,0,0


In [60]:
# The key column again, now paired with a running counter in data1.
df = pd.DataFrame(data=dict(
    key=[0, 1, 2, 1, 2, 1, 0],
    data1=range(7),
))
df

Unnamed: 0,key,data1
0,0,0
1,1,1
2,2,2
3,1,3
4,2,4
5,1,5
6,0,6


In [64]:
# Indicator columns for 'key', named key_0, key_1 and key_2 via the prefix.
dummies = pd.get_dummies(data=df['key'], prefix='key')
dummies

Unnamed: 0,key_0,key_1,key_2
0,1,0,0
1,0,1,0
2,0,0,1
3,0,1,0
4,0,0,1
5,0,1,0
6,1,0,0


In [67]:
# Keep data1 as a one-column frame and attach the indicator columns to it,
# aligning on the shared row index (left join, the default, spelled out).
data1_with_dummies = df.loc[:, ['data1']].join(other=dummies, how='left')
data1_with_dummies

Unnamed: 0,data1,key_0,key_1,key_2
0,0,1,0,0
1,1,0,1,0
2,2,0,0,1
3,3,0,1,0
4,4,0,0,1
5,5,0,1,0
6,6,1,0,0
