In [1]:
import numpy as np
import pandas as pd

In [2]:
# Float series containing one missing entry, written as NaN.
float_data = pd.Series(data=[1.2, -3.5, np.nan, 0])

In [3]:
# Bare name as the last expression: the notebook renders the series as the cell output.
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [4]:
# Boolean mask that is True wherever the series holds a missing value.
pd.isna(float_data)

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# String series mixing both missing-value spellings: np.nan and None.
string_data = pd.Series(
    ["aardvark", np.nan, None, "avocado"],
)

In [6]:
# Show the series as stored, including the np.nan and None entries.
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [7]:
# Both np.nan and None are reported as missing.
pd.isna(string_data)

0    False
1     True
2     True
3    False
dtype: bool

In [8]:
# Rebinds float_data: with an explicit float64 dtype, None is stored as NaN.
float_data = pd.Series([1, 2, None], dtype=np.float64)

In [9]:
# Display the rebuilt float64 series.
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [10]:
# Missing-value mask for the rebuilt series.
pd.isna(float_data)

0    False
1    False
2     True
dtype: bool

In [11]:
# Series with two NaN gaps, used to demonstrate filtering out missing data.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])

In [12]:
# Return a new series without the missing entries; `data` itself is unchanged.
data.dropna(axis="index")

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
# Same filtering done by hand: select the entries whose notna() mask is True.
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [23]:
# Five rows with varying amounts of missing data: one complete row,
# one all-NaN row and three partially filled rows.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
        [np.nan, 420, 69],
    ]
)

In [24]:
# Display the frame so the pattern of missing cells is visible.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0
4,,420.0,69.0


In [25]:
# Default row-wise drop: any row containing at least one NaN is removed.
data.dropna(axis="index", how="any")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [28]:
# Append two all-NaN columns. Column 4 is created here explicitly: the rendered
# frame for this step lists columns 4 and 5, but no remaining cell assigned
# column 4, so a fresh kernel would not reproduce that output without it.
data[4] = np.nan
data[5] = np.nan

In [29]:
# Display the frame after the all-NaN column assignment.
data

Unnamed: 0,0,1,2,4,5
0,1.0,6.5,3.0,,
1,1.0,,,,
2,,,,,
3,,6.5,3.0,,
4,,420.0,69.0,,


In [30]:
# Column-wise drop: remove only the columns in which every value is NaN.
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0
4,,420.0,69.0


In [33]:
# 7x3 frame of standard-normal draws.
# NOTE(review): no seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))

In [34]:
# Blank out the first four rows of column position 1 (mutates df in place).
df.iloc[0:4, 1] = np.nan

In [35]:
# Blank out the first two rows of column position 2 (mutates df in place).
df.iloc[0:2, 2] = np.nan

In [36]:
# Display the frame after the two positional NaN assignments.
df

Unnamed: 0,0,1,2
0,0.318494,,
1,-0.432106,,
2,0.940642,,0.066049
3,0.454048,,-0.191919
4,-1.915565,1.233931,0.224561
5,0.935829,-0.016057,0.837446
6,-0.863044,-0.959373,0.669841


In [37]:
# Keep only the rows that have no missing values at all.
df.dropna(axis="index", how="any")

Unnamed: 0,0,1,2
4,-1.915565,1.233931,0.224561
5,0.935829,-0.016057,0.837446
6,-0.863044,-0.959373,0.669841


In [38]:
# Keep the rows that have at least two non-missing values.
df.dropna(axis="index", thresh=2)

Unnamed: 0,0,1,2
2,0.940642,,0.066049
3,0.454048,,-0.191919
4,-1.915565,1.233931,0.224561
5,0.935829,-0.016057,0.837446
6,-0.863044,-0.959373,0.669841


In [39]:
# Filling in missing data
# Replace every NaN with the constant 0; returns a new frame.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.318494,0.0,0.0
1,-0.432106,0.0,0.0
2,0.940642,0.0,0.066049
3,0.454048,0.0,-0.191919
4,-1.915565,1.233931,0.224561
5,0.935829,-0.016057,0.837446
6,-0.863044,-0.959373,0.669841


In [41]:
# Per-column fill values: the dict keys are column labels.
df.fillna(value={1: 0.5, 2: 420})

Unnamed: 0,0,1,2
0,0.318494,0.5,420.0
1,-0.432106,0.5,420.0
2,0.940642,0.5,0.066049
3,0.454048,0.5,-0.191919
4,-1.915565,1.233931,0.224561
5,0.935829,-0.016057,0.837446
6,-0.863044,-0.959373,0.669841


In [42]:
# Rebinds df to a fresh 6x3 frame of standard-normal draws.
# NOTE(review): no seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))

In [43]:
# Blank out column position 1 from row position 2 through the last row.
df.iloc[2:len(df), 1] = np.nan

In [44]:
# Blank out column position 2 from row position 4 through the last row.
df.iloc[4:len(df), 2] = np.nan

In [45]:
# Display the frame with its trailing runs of NaN in columns 1 and 2.
df

Unnamed: 0,0,1,2
0,-1.263069,-1.957746,-0.068525
1,0.049782,-0.661312,0.248013
2,-1.384408,,1.080643
3,0.930467,,-0.962672
4,0.992124,,
5,-1.163337,,


In [46]:
# Forward-fill: carry the last valid value in each column down into the NaN
# cells that follow it. `ffill()` is used because the `method=` keyword of
# `fillna` is deprecated in pandas 2.1 and removed in pandas 3.0.
df.ffill()

Unnamed: 0,0,1,2
0,-1.263069,-1.957746,-0.068525
1,0.049782,-0.661312,0.248013
2,-1.384408,-0.661312,1.080643
3,0.930467,-0.661312,-0.962672
4,0.992124,-0.661312,-0.962672
5,-1.163337,-0.661312,-0.962672


In [47]:
# Forward-fill, but fill at most two consecutive NaN cells per gap.
# `ffill()` is used because the `method=` keyword of `fillna` is deprecated
# in pandas 2.1 and removed in pandas 3.0.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.263069,-1.957746,-0.068525
1,0.049782,-0.661312,0.248013
2,-1.384408,-0.661312,1.080643
3,0.930467,-0.661312,-0.962672
4,0.992124,,-0.962672
5,-1.163337,,-0.962672


In [48]:
# fillna also supports simple data imputation, e.g. filling with the median or mean.
data = pd.Series(data=[1.0, np.nan, 3.5, np.nan, 7])

In [49]:
# Mean imputation: mean() skips NaN, and that mean replaces each missing entry.
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [55]:
# Data Transformation
# Small frame used by the duplicate-detection examples.
data = pd.DataFrame(
    {
        "k1": ["one", "two"] * 3 + ["two"],
        "K2": [1, 1, 2, 3, 3, 4, 4],
        "k3": [420, 69, 420.69, 12, 666, 6, 7],
    }
)

In [56]:
# Display the frame used by the duplicate-detection examples.
data

Unnamed: 0,k1,K2,k3
0,one,1,420.0
1,two,1,69.0
2,one,2,420.69
3,two,3,12.0
4,one,3,666.0
5,two,4,6.0
6,two,4,7.0


In [57]:
# Flag each row that repeats an earlier row across all columns.
data.duplicated(keep="first")

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [58]:
# Drop the rows flagged as full-row duplicates, keeping the first occurrence.
data.drop_duplicates(keep="first")

Unnamed: 0,k1,K2,k3
0,one,1,420.0
1,two,1,69.0
2,one,2,420.69
3,two,3,12.0
4,one,3,666.0
5,two,4,6.0
6,two,4,7.0


In [59]:
# Add a running row counter. Its length is taken from the frame itself so the
# cell keeps working if the number of rows in `data` changes.
data["v1"] = range(len(data))

In [60]:
# Display the frame with the added v1 column.
data

Unnamed: 0,k1,K2,k3,v1
0,one,1,420.0,0
1,two,1,69.0,1
2,one,2,420.69,2
3,two,3,12.0,3
4,one,3,666.0,4
5,two,4,6.0,5
6,two,4,7.0,6


In [61]:
# Judge duplicates on the k1 column only; the first row per k1 value is kept.
data.drop_duplicates(subset=["k1"], keep="first")

Unnamed: 0,k1,K2,k3,v1
0,one,1,420.0,0
1,two,1,69.0,1


In [63]:
# Judge duplicates on the (k1, K2) pair and keep the last row of each group.
data.drop_duplicates(subset=["k1", "K2"], keep="last")

Unnamed: 0,k1,K2,k3,v1
0,one,1,420.0,0
1,two,1,69.0,1
2,one,2,420.69,2
3,two,3,12.0,3
4,one,3,666.0,4
6,two,4,7.0,6


In [64]:
# Transforming Data using a Function or Mapping
# Rebinds `data` to a frame of meat types and weights in ounces.
data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "pastrami",
            "corned beef",
            "bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.7, 8, 3, 5, 6],
    }
)

In [65]:
# Display the food/ounces frame.
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.7
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [66]:
 meat_to_animal = {
      "bacon": "pig",
      "pulled pork": "pig",
      "pastrami": "cow",
      "corned beef": "cow",
      "honey ham": "pig",
      "nova lox": "salmon"
}

In [67]:
# Map each food through the lookup dict and store the result as a new column.
data["animal"] = data["food"].map(meat_to_animal, na_action=None)

In [68]:
# Display the frame with the derived animal column.
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.7,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [69]:
def get_animal(x):
    """Return the source animal for the food name ``x``.

    Looks ``x`` up in the module-level ``meat_to_animal`` table. A food that is
    not in the table (including a missing value such as NaN) yields NaN instead
    of raising ``KeyError``, which matches what ``Series.map`` produces when it
    is given the dict directly.
    """
    return meat_to_animal.get(x, np.nan)

In [70]:
# Same mapping as the dict version, done by calling get_animal on each food value.
data["food"].map(get_animal, na_action=None)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object