# Missing Data

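Let's look at a few convenient methods for dealing with missing data in pandas: `dropna()` to discard rows or columns that contain missing values, and `fillna()` to replace them.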

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Demo frame: columns A and B contain missing values (NaN); column C is complete.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

In [3]:
# Show the frame: columns A and B hold the missing values, C is complete.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [4]:
# Drop every row that contains a missing value; only row 0 is fully populated.
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [5]:
# dropna() returned a new frame; df itself still holds all three rows.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [6]:
# axis=1 drops columns instead of rows: A and B contain NaN, so only C remains.
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [7]:
# df is still unchanged: dropna(axis=1) also returned a new frame.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [8]:
# thresh=k keeps only the rows that have at least k non-missing values.
# Print the result for k = 1, 2, 3 with a divider line between consecutive results.
divider = "========================="
for min_non_null in (1, 2, 3):
    if min_non_null > 1:
        print(divider)
    print(df.dropna(thresh=min_non_null))

     A    B  C
0  1.0  5.0  1
1  2.0  NaN  2
2  NaN  NaN  3
     A    B  C
0  1.0  5.0  1
1  2.0  NaN  2
     A    B  C
0  1.0  5.0  1


In [10]:
# Replace every missing value with 0; the result is a new frame.
df.fillna(value=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,0.0,2
2,0.0,0.0,3


In [11]:
# fillna() did not modify df either; the missing values are still there.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [12]:
# The mean ignores the missing entry: (1 + 2) / 2 = 1.5.
df['A'].mean()

1.5

In [13]:
# Impute column A: fill its missing entries with the column mean and store the
# result back into df (this cell changes df, unlike the calls shown earlier).
df['A'] = df['A'].fillna(df['A'].mean())

In [14]:
# Column A after imputation: the formerly missing entry at index 2 is now 1.5.
df['A']

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [15]:
# The assignment to df['A'] changed df itself: A is complete, B still has missing values.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,1.5,,3


# Groupby

The `groupby` method allows you to group rows of data together based on the values in a column and then call aggregate functions (such as `count()` or `mean()`) on each group.

In [16]:
import pandas as pd
# Raw sales records as column lists: six people, two per company.
data = dict(
    Company=['A', 'A', 'B', 'B', 'C', 'C'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)


In [17]:
# Build the sales frame. Note that this rebinds df: from here on it no longer
# refers to the missing-data frame used in the previous section.
df = pd.DataFrame(data)

In [18]:
# Show the sales frame: one row per person with their company and sales figure.
df

Unnamed: 0,Company,Person,Sales
0,A,Sam,200
1,A,Charlie,120
2,B,Amy,340
3,B,Vanessa,124
4,C,Carl,243
5,C,Sarah,350


In [19]:
# Group the rows by Company and count the entries of every other column in each group.
df.groupby("Company").count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2,2
B,2,2
C,2,2


In [20]:
# Average per company. The frame also contains the string column Person, which
# cannot be averaged; numeric_only=True restricts the aggregation to numeric
# columns (here: Sales). Without it, pandas >= 2.0 raises a TypeError instead of
# silently dropping the non-numeric column as older versions did.
df.groupby('Company').mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
A,160.0
B,232.0
C,296.5
