In [1]:
import numpy as np
import pandas as pd

### Handling Missing Values

**dropna()**

In [2]:
dict = [
    [1,2,3,4,5],
    [1,2,3,4,7],
    [1,2,3,4,np.nan],
    [1,2,np.nan,4,np.nan],
    [1,2,3,None,np.nan],
    [1,None,3,4,np.nan],
    [np.nan for i in range(5)]
]

df = pd.DataFrame(dict, columns=['First', 'Second', 'Third', 'Forth', 'Fifth'])
df

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1.0,2.0,3.0,4.0,5.0
1,1.0,2.0,3.0,4.0,7.0
2,1.0,2.0,3.0,4.0,
3,1.0,2.0,,4.0,
4,1.0,2.0,3.0,,
5,1.0,,3.0,4.0,
6,,,,,


In [3]:
# Series
df.loc[::, 'Forth'].dropna()

0    4.0
1    4.0
2    4.0
3    4.0
5    4.0
Name: Forth, dtype: float64

In [6]:
# Dataframe
df.dropna()

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1.0,2.0,3.0,4.0,5.0
1,1.0,2.0,3.0,4.0,7.0


In [50]:
df.dropna(how='all') # Only those were all are null

Unnamed: 0,First,Second,Third,Forth,Fifth,6
0,1,2.0,3.0,4.0,5.0,
1,1,2.0,3.0,4.0,7.0,
2,1,2.0,3.0,4.0,,
3,1,2.0,,4.0,,
4,1,2.0,3.0,,,
5,1,,3.0,4.0,,


In [59]:
df.dropna(thresh=4) # Atleast 4 values in a row are not null

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1.0,2.0,3.0,4.0,5.0
1,1.0,2.0,3.0,4.0,7.0
2,1.0,2.0,3.0,4.0,


### fillna()- Deprecated with method attribute

In [60]:
# Series
df.loc[::, 'Forth'].fillna(method= 'bfill')

  df.loc[::, 'Forth'].fillna(method= 'bfill')


0    4.0
1    4.0
2    4.0
3    4.0
4    4.0
5    4.0
6    NaN
Name: Forth, dtype: float64

In [61]:
# Dataframe
df.fillna({'Forth': 0, 'Fifth': 100})

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1.0,2.0,3.0,4.0,5.0
1,1.0,2.0,3.0,4.0,7.0
2,1.0,2.0,3.0,4.0,100.0
3,1.0,2.0,,4.0,100.0
4,1.0,2.0,3.0,0.0,100.0
5,1.0,,3.0,4.0,100.0
6,,,,0.0,100.0


In [62]:
df.fillna(0, limit=1, axis=1) # how many values has to get fill specified by limit

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1.0,2.0,3.0,4.0,5.0
1,1.0,2.0,3.0,4.0,7.0
2,1.0,2.0,3.0,4.0,0.0
3,1.0,2.0,0.0,4.0,
4,1.0,2.0,3.0,0.0,
5,1.0,0.0,3.0,4.0,
6,0.0,,,,


**ffill and bfill**

In [65]:
df.loc[::, 'Fifth'].ffill()

0    5.0
1    7.0
2    7.0
3    7.0
4    7.0
5    7.0
6    7.0
Name: Fifth, dtype: float64

**Boolean Arrays and Dataframe**

In [66]:
df.loc[::, 'Forth'].isnull()

0    False
1    False
2    False
3    False
4     True
5    False
6     True
Name: Forth, dtype: bool

In [67]:
df.loc[::, 'Forth'].notnull()

0     True
1     True
2     True
3     True
4    False
5     True
6    False
Name: Forth, dtype: bool

In [68]:
df.loc[::, 'Forth'].notna()

0     True
1     True
2     True
3     True
4    False
5     True
6    False
Name: Forth, dtype: bool

### Handling Duplicate values

In [76]:
dict = [
    [1,2,3,4,5],
    [1,2,3,4,np.nan],
    [1,2,np.nan,4,np.nan],
    [1,2,3,4,5],
    [1,2,3,4,np.nan],
    [1,2,3,4,np.nan]
]

df = pd.DataFrame(dict, columns=['First', 'Second', 'Third', 'Forth', 'Fifth'])
df

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1,2,3.0,4,5.0
1,1,2,3.0,4,
2,1,2,,4,
3,1,2,3.0,4,5.0
4,1,2,3.0,4,
5,1,2,3.0,4,


**duplicated()**

In [77]:
# duplicated()
df['Fifth'].duplicated()

0    False
1    False
2     True
3     True
4     True
5     True
Name: Fifth, dtype: bool

In [78]:
df.duplicated() # Finds an entire row which is duplicate
# There is np axis attribute

0    False
1    False
2    False
3     True
4     True
5     True
dtype: bool

**drop_duplicate()**

In [79]:
df.drop_duplicates()

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1,2,3.0,4,5.0
1,1,2,3.0,4,
2,1,2,,4,


In [80]:
df.drop_duplicates(['Third', 'Fifth']) # Here the both must be duplicated at same record

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1,2,3.0,4,5.0
1,1,2,3.0,4,
2,1,2,,4,


In [81]:
df.drop_duplicates(keep='last') # Last occurance of duplicated rows

Unnamed: 0,First,Second,Third,Forth,Fifth
2,1,2,,4,
3,1,2,3.0,4,5.0
5,1,2,3.0,4,


### Data Transformation - Mapping

In [86]:
data = {
    'color': ['red', 'blue', 'orange', 'black', 'pink'],
    'opacity': [0.5, 1, 0.6, 0.2, 0.7, ]
}

colors = pd.DataFrame(data)
colors

Unnamed: 0,color,opacity
0,red,0.5
1,blue,1.0
2,orange,0.6
3,black,0.2
4,pink,0.7


In [87]:
obj = {
    'orange': 'orange',
    'blue': 'sea',
    'red': 'apple',
    'black': 'stone',
    'pink': 'flower'
}

In [110]:
clr = colors['color']

In [89]:
# clr.map(lambda x: obj[x])
colors['obj'] = clr.map(obj)

In [90]:
colors

Unnamed: 0,color,opacity,obj
0,red,0.5,apple
1,blue,1.0,sea
2,orange,0.6,orange
3,black,0.2,stone
4,pink,0.7,flower


### Replacing Values - replace()

In [116]:
df.replace(1, 1000)

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1000,2,3.0,4,5.0
1,1000,2,3.0,4,
2,1000,2,,4,
3,1000,2,3.0,4,5.0
4,1000,2,3.0,4,
5,1000,2,3.0,4,


In [117]:
df.replace([1000, 2], [1.1, 2.1])

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1,2.1,3.0,4,5.0
1,1,2.1,3.0,4,
2,1,2.1,,4,
3,1,2.1,3.0,4,5.0
4,1,2.1,3.0,4,
5,1,2.1,3.0,4,


In [118]:
df.replace({1: 1.12, 2.1: 2.12})

Unnamed: 0,First,Second,Third,Forth,Fifth
0,1.12,2,3.0,4,5.0
1,1.12,2,3.0,4,
2,1.12,2,,4,
3,1.12,2,3.0,4,5.0
4,1.12,2,3.0,4,
5,1.12,2,3.0,4,


**Renaming - columns and indices: rename()**

In [123]:
df.index = ['a', 'b', 'c', 'd', 'e',  'f']
df

Unnamed: 0,First,Second,Third,Forth,Fifth
a,1,2,3.0,4,5.0
b,1,2,3.0,4,
c,1,2,,4,
d,1,2,3.0,4,5.0
e,1,2,3.0,4,
f,1,2,3.0,4,


In [129]:
df.rename(index=str.capitalize, columns=str.title)

Unnamed: 0,First,Second,Third,Forth,Fifth
A,1,2,3.0,4,5.0
B,1,2,3.0,4,
C,1,2,,4,
D,1,2,3.0,4,5.0
E,1,2,3.0,4,
F,1,2,3.0,4,


In [130]:
# You can also change perticular index / column
df.rename(index={'C': 'c'}, columns={'Second': 'second'})

Unnamed: 0,First,second,Third,Forth,Fifth
a,1,2,3.0,4,5.0
b,1,2,3.0,4,
c,1,2,,4,
d,1,2,3.0,4,5.0
e,1,2,3.0,4,
f,1,2,3.0,4,


### Discretization and Binning

In [18]:
ages = np.array([20, 56, 78, 30, 44, 78, 90, 23, 45, 77])
bins = [1, 25, 50, 75, 100]

cats = pd.cut(ages, bins)

In [135]:
cats.codes # the number of bin the age were belongs

array([0, 2, 3, 1, 1, 3, 3, 0, 1, 3], dtype=int8)

In [136]:
cats.categories

IntervalIndex([(1, 25], (25, 50], (50, 75], (75, 100]], dtype='interval[int64, right]')

In [137]:
cats.value_counts()

(1, 25]      2
(25, 50]     3
(50, 75]     1
(75, 100]    4
Name: count, dtype: int64

In [3]:
# Providing labels
ages = np.array([20, 56, 78, 30, 44, 78, 90, 23, 45, 77])
bins = [1, 25, 50, 75, 100]
labels = ['first', 'second', 'third', 'forth']

pd.cut(ages, bins = bins, labels=labels)

['first', 'third', 'forth', 'second', 'second', 'forth', 'forth', 'first', 'second', 'forth']
Categories (4, object): ['first' < 'second' < 'third' < 'forth']

**qcut()**

In [20]:
# qcut method - quantile means you have to fit an entire data into n quantiles so that the meaning of data is same but you are counting the data by the quantile number
ages = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

age_cut = pd.cut(ages, 5)
age_qcut = pd.qcut(ages, 5)

In [21]:
print(age_cut.categories)
print(age_qcut.categories)

IntervalIndex([(-0.1, 20.0], (20.0, 40.0], (40.0, 60.0], (60.0, 80.0],
               (80.0, 100.0]],
              dtype='interval[float64, right]')
IntervalIndex([(-0.001, 20.0], (20.0, 40.0], (40.0, 60.0], (60.0, 80.0],
               (80.0, 100.0]],
              dtype='interval[float64, right]')


In [15]:
age_cut.value_counts()

(-0.1, 20.0]     3
(20.0, 40.0]     1
(40.0, 60.0]     3
(60.0, 80.0]     1
(80.0, 100.0]    1
Name: count, dtype: int64

### Filtering Outliers

In [31]:
rnd_df = pd.DataFrame(np.random.randn(1000, 4))
rnd_df.head()

Unnamed: 0,0,1,2,3
0,0.581826,-0.732167,1.352278,1.114524
1,-0.925389,0.265747,0.782785,3.50259
2,-1.020091,-0.519645,0.482268,-1.75563
3,-0.735754,-1.08327,0.558705,0.621777
4,-0.11726,-1.434328,0.748313,0.166132


In [36]:
rnd_df[(np.abs(rnd_df) > 3).any(axis = 1)]  = np.sign(rnd_df) * 3

In [37]:
rnd_df.head()

Unnamed: 0,0,1,2,3
0,0.581826,-0.732167,1.352278,1.114524
1,-3.0,3.0,3.0,3.0
2,-1.020091,-0.519645,0.482268,-1.75563
3,-0.735754,-1.08327,0.558705,0.621777
4,-0.11726,-1.434328,0.748313,0.166132


### Permutation and Random Sampling

In [5]:
# Permutation
prm = np.random.permutation(5) # From 0 to 4
print('The order of indexes: ', prm)
rnd_df.take(prm)

The order of indexes:  [2 0 3 4 1]


Unnamed: 0,0,1,2,3
2,0.278024,-0.317496,1.045805,0.049392
0,-0.642963,0.026687,-0.257372,-0.546315
3,-0.66787,-0.873943,-0.005132,-0.711254
4,-1.047337,0.082897,0.090992,0.469829
1,-0.871513,0.160396,-1.574106,-0.319318


In [38]:
# Random Sampling
rnd_df.sample(5, replace = True)

Unnamed: 0,0,1,2,3
740,1.281924,0.789418,-0.588269,1.132941
146,-1.496027,-0.594747,0.569777,-0.077447
782,-0.787642,-1.118322,0.499504,-1.439848
943,-1.029019,0.76617,1.533129,-1.684954
158,1.269786,-0.650761,-2.665828,0.033779


### Computing Indicator/Dummy Variables

In [73]:
columns = ["movieid", "title", "genre"]
movies = pd.read_table("movies.dat", names = columns, sep = "::", encoding = "latin1", engine = "python")
movies.head()

Unnamed: 0,movieid,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [75]:
genres = movies['genre'].str.split('|', expand=True).stack().unique()
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [29]:
# Creating a dataframe of all 0 just like dummies
dummies = pd.DataFrame(np.zeros((movies.shape[0], len(genres))), columns=genres)
dummies.head(3)

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
for index, value in enumerate(movies['genre']):
    indices = dummies.columns.get_indexer(value.split('|'))
    dummies.iloc[index, indices] = 1 # Used fancy indexing concept

dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# get_dummies method for simple transformation
temp_df = pd.DataFrame({
    'key': ['b', 'a', 'c', 'b', 'c'],
    'data1': range(5)
})

pd.get_dummies(temp_df['key'], dtype=np.int16)

Unnamed: 0,a,b,c
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1
