In [103]:
import numpy as np

In [104]:
# Build the sample series directly as floats: NaN is a float value, so an
# integer column would need an implicit upcast when the gaps are written
# below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Humidity': np.arange(50, 64, dtype=float)})

# Blank out a few readings to simulate missing observations.
frame.loc[[3, 7, 10, 11], 'Humidity'] = np.nan
print(frame)

# Mean over a trailing window of 4 rows. With min_periods=1 a window only
# needs one non-NaN value to produce a result; NaNs in the window are skipped.
rolling_mean = frame.Humidity.rolling(4, min_periods=1).mean()
rolling_mean

    Humidity
0       50.0
1       51.0
2       52.0
3        NaN
4       54.0
5       55.0
6       56.0
7        NaN
8       58.0
9       59.0
10       NaN
11       NaN
12      62.0
13      63.0


0     50.000000
1     50.500000
2     51.000000
3     51.000000
4     52.333333
5     53.666667
6     55.000000
7     55.000000
8     56.333333
9     57.666667
10    58.500000
11    58.500000
12    60.500000
13    62.500000
Name: Humidity, dtype: float64

# Rolling Average Method
Here the time-series (ts) values are filled using a rolling average (RA) with a sliding window of 4. All the averages are calculated in a parallel fashion. This is also known as a moving average, and it helps to smooth the data.
min_periods: the minimum number of observations in the window required to produce a value.

In [105]:
# Build the sample series directly as floats: NaN is a float value, so an
# integer column would need an implicit upcast when the gaps are written
# below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Humidity': np.arange(50, 64, dtype=float)})

# Blank out a few readings to simulate missing observations.
frame.loc[[3, 7, 10, 11], 'Humidity'] = np.nan
print(frame)

# Only the NaN positions are replaced; each takes the mean of the non-NaN
# values in its trailing 4-row window. Observed values are left as they are.
rolling_filled = frame.Humidity.fillna(
    frame.Humidity.rolling(4, min_periods=1).mean()
)
rolling_filled

    Humidity
0       50.0
1       51.0
2       52.0
3        NaN
4       54.0
5       55.0
6       56.0
7        NaN
8       58.0
9       59.0
10       NaN
11       NaN
12      62.0
13      63.0


0     50.0
1     51.0
2     52.0
3     51.0
4     54.0
5     55.0
6     56.0
7     55.0
8     58.0
9     59.0
10    58.5
11    58.5
12    62.0
13    63.0
Name: Humidity, dtype: float64

# Interpolate Method
pandas offers various interpolation techniques to fill the missing values; the example below uses linear interpolation.

In [106]:
# Build the sample series directly as floats: NaN is a float value, so an
# integer column would need an implicit upcast when the gaps are written
# below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Humidity': np.arange(50, 64, dtype=float)})

# Blank out a few readings to simulate missing observations.
frame.loc[[3, 7, 10, 11], 'Humidity'] = np.nan
print(frame)

# Linear interpolation fills each gap on a straight line between the
# nearest valid neighbours on either side.
interpolated = frame['Humidity'].interpolate(method='linear')
interpolated

    Humidity
0       50.0
1       51.0
2       52.0
3        NaN
4       54.0
5       55.0
6       56.0
7        NaN
8       58.0
9       59.0
10       NaN
11       NaN
12      62.0
13      63.0


0     50.0
1     51.0
2     52.0
3     53.0
4     54.0
5     55.0
6     56.0
7     57.0
8     58.0
9     59.0
10    60.0
11    61.0
12    62.0
13    63.0
Name: Humidity, dtype: float64

# Forward filling and Backward filling 
Two approaches to filling missing values:
Forward filling fills each missing value with the previous valid data point.
Backward filling fills each missing value with the next valid data point.
limit: if specified, this is the maximum number of consecutive NaN values to forward/backward fill.
downcast: optionally downcasts the result dtype, e.g. float64 to int64 if possible.

In [107]:
# Build the sample series directly as floats: NaN is a float value, so an
# integer column would need an implicit upcast when the gaps are written
# below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Humidity': np.arange(50, 64, dtype=float)})

# Blank out a few readings to simulate missing observations.
frame.loc[[3, 7, 10, 11], 'Humidity'] = np.nan
print(frame)

# Backward fill: each NaN takes the next valid value. `.bfill()` replaces
# the deprecated `fillna(method='backfill', ...)` spelling; limit=None keeps
# the default of filling runs of any length.
backfilled = frame['Humidity'].bfill(limit=None)
backfilled

    Humidity
0       50.0
1       51.0
2       52.0
3        NaN
4       54.0
5       55.0
6       56.0
7        NaN
8       58.0
9       59.0
10       NaN
11       NaN
12      62.0
13      63.0


0     50.0
1     51.0
2     52.0
3     54.0
4     54.0
5     55.0
6     56.0
7     58.0
8     58.0
9     59.0
10    62.0
11    62.0
12    62.0
13    63.0
Name: Humidity, dtype: float64

In [108]:
# Build the sample series directly as floats: NaN is a float value, so an
# integer column would need an implicit upcast when the gaps are written
# below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Humidity': np.arange(50, 64, dtype=float)})

# Blank out a few readings to simulate missing observations.
frame.loc[[3, 7, 10, 11], 'Humidity'] = np.nan
print(frame)

# Forward fill: each NaN takes the previous valid value. `.ffill()` replaces
# the deprecated `fillna(method='ffill', ...)` spelling; limit=None keeps
# the default of filling runs of any length.
forward_filled = frame['Humidity'].ffill(limit=None)
forward_filled

    Humidity
0       50.0
1       51.0
2       52.0
3        NaN
4       54.0
5       55.0
6       56.0
7        NaN
8       58.0
9       59.0
10       NaN
11       NaN
12      62.0
13      63.0


0     50.0
1     51.0
2     52.0
3     52.0
4     54.0
5     55.0
6     56.0
7     56.0
8     58.0
9     59.0
10    59.0
11    59.0
12    62.0
13    63.0
Name: Humidity, dtype: float64

## ----- Practice -----
This is a practice problem on filling missing values in a time series.

In [110]:
import numpy as np

# Build the practice series (50..99) directly as floats: NaN is a float
# value, so an integer column would need an implicit upcast when the gaps
# are written below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Price': np.arange(50, 100, dtype=float)})

# Blank out single points and short runs to simulate missing observations.
frame.loc[[12, 4, 45, 46, 47, 34, 33], 'Price'] = np.nan
frame


Unnamed: 0,Price
0,50.0
1,51.0
2,52.0
3,53.0
4,
5,55.0
6,56.0
7,57.0
8,58.0
9,59.0


In [111]:
# Trailing 4-row mean of Price; with min_periods=1 a window needs only one
# non-NaN value to produce a result.
frame['Price'].rolling(window=4, min_periods=1).mean()

0     50.000000
1     50.500000
2     51.000000
3     51.500000
4     52.000000
5     53.333333
6     54.666667
7     56.000000
8     56.500000
9     57.500000
10    58.500000
11    59.500000
12    60.000000
13    61.333333
14    62.666667
15    64.000000
16    64.500000
17    65.500000
18    66.500000
19    67.500000
20    68.500000
21    69.500000
22    70.500000
23    71.500000
24    72.500000
25    73.500000
26    74.500000
27    75.500000
28    76.500000
29    77.500000
30    78.500000
31    79.500000
32    80.500000
33    81.000000
34    81.500000
35    83.500000
36    85.500000
37    86.000000
38    86.500000
39    87.500000
40    88.500000
41    89.500000
42    90.500000
43    91.500000
44    92.500000
45    93.000000
46    93.500000
47    94.000000
48    98.000000
49    98.500000
Name: Price, dtype: float64

# Rolling Average
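Here the rolling average (RA) of the time-series values is computed with a window of 3. Note that a window containing only NaN values (index 47) still yields NaN.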

In [117]:
# Trailing 3-row mean of Price; with min_periods=1 a window needs only one
# non-NaN value, so a window made up entirely of NaN still yields NaN.
frame['Price'].rolling(window=3, min_periods=1).mean()

0     50.0
1     50.5
2     51.0
3     52.0
4     52.5
5     54.0
6     55.5
7     56.0
8     57.0
9     58.0
10    59.0
11    60.0
12    60.5
13    62.0
14    63.5
15    64.0
16    65.0
17    66.0
18    67.0
19    68.0
20    69.0
21    70.0
22    71.0
23    72.0
24    73.0
25    74.0
26    75.0
27    76.0
28    77.0
29    78.0
30    79.0
31    80.0
32    81.0
33    81.5
34    82.0
35    85.0
36    85.5
37    86.0
38    87.0
39    88.0
40    89.0
41    90.0
42    91.0
43    92.0
44    93.0
45    93.5
46    94.0
47     NaN
48    98.0
49    98.5
Name: Price, dtype: float64

In [113]:
# Build the practice series (50..99) directly as floats: NaN is a float
# value, so an integer column would need an implicit upcast when the gaps
# are written below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Price': np.arange(50, 100, dtype=float)})

# Blank out single points and short runs to simulate missing observations.
frame.loc[[12, 4, 45, 46, 47, 34, 33], 'Price'] = np.nan
print(frame)

# Linear interpolation fills each gap on a straight line between the
# nearest valid neighbours on either side.
price_interpolated = frame['Price'].interpolate(method='linear')
price_interpolated

    Price
0    50.0
1    51.0
2    52.0
3    53.0
4     NaN
5    55.0
6    56.0
7    57.0
8    58.0
9    59.0
10   60.0
11   61.0
12    NaN
13   63.0
14   64.0
15   65.0
16   66.0
17   67.0
18   68.0
19   69.0
20   70.0
21   71.0
22   72.0
23   73.0
24   74.0
25   75.0
26   76.0
27   77.0
28   78.0
29   79.0
30   80.0
31   81.0
32   82.0
33    NaN
34    NaN
35   85.0
36   86.0
37   87.0
38   88.0
39   89.0
40   90.0
41   91.0
42   92.0
43   93.0
44   94.0
45    NaN
46    NaN
47    NaN
48   98.0
49   99.0


0     50.0
1     51.0
2     52.0
3     53.0
4     54.0
5     55.0
6     56.0
7     57.0
8     58.0
9     59.0
10    60.0
11    61.0
12    62.0
13    63.0
14    64.0
15    65.0
16    66.0
17    67.0
18    68.0
19    69.0
20    70.0
21    71.0
22    72.0
23    73.0
24    74.0
25    75.0
26    76.0
27    77.0
28    78.0
29    79.0
30    80.0
31    81.0
32    82.0
33    83.0
34    84.0
35    85.0
36    86.0
37    87.0
38    88.0
39    89.0
40    90.0
41    91.0
42    92.0
43    93.0
44    94.0
45    95.0
46    96.0
47    97.0
48    98.0
49    99.0
Name: Price, dtype: float64

In [114]:
# Build the practice series (50..99) directly as floats: NaN is a float
# value, so an integer column would need an implicit upcast when the gaps
# are written below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Price': np.arange(50, 100, dtype=float)})

# Blank out single points and short runs to simulate missing observations.
frame.loc[[12, 4, 45, 46, 47, 34, 33], 'Price'] = np.nan
print(frame)

# Backward fill: each NaN takes the next valid value. `.bfill()` replaces
# the deprecated `fillna(method='backfill', ...)` spelling; limit=None keeps
# the default of filling runs of any length.
price_backfilled = frame['Price'].bfill(limit=None)
price_backfilled

    Price
0    50.0
1    51.0
2    52.0
3    53.0
4     NaN
5    55.0
6    56.0
7    57.0
8    58.0
9    59.0
10   60.0
11   61.0
12    NaN
13   63.0
14   64.0
15   65.0
16   66.0
17   67.0
18   68.0
19   69.0
20   70.0
21   71.0
22   72.0
23   73.0
24   74.0
25   75.0
26   76.0
27   77.0
28   78.0
29   79.0
30   80.0
31   81.0
32   82.0
33    NaN
34    NaN
35   85.0
36   86.0
37   87.0
38   88.0
39   89.0
40   90.0
41   91.0
42   92.0
43   93.0
44   94.0
45    NaN
46    NaN
47    NaN
48   98.0
49   99.0


0     50.0
1     51.0
2     52.0
3     53.0
4     55.0
5     55.0
6     56.0
7     57.0
8     58.0
9     59.0
10    60.0
11    61.0
12    63.0
13    63.0
14    64.0
15    65.0
16    66.0
17    67.0
18    68.0
19    69.0
20    70.0
21    71.0
22    72.0
23    73.0
24    74.0
25    75.0
26    76.0
27    77.0
28    78.0
29    79.0
30    80.0
31    81.0
32    82.0
33    85.0
34    85.0
35    85.0
36    86.0
37    87.0
38    88.0
39    89.0
40    90.0
41    91.0
42    92.0
43    93.0
44    94.0
45    98.0
46    98.0
47    98.0
48    98.0
49    99.0
Name: Price, dtype: float64

In [115]:
# Inspect the frame's dimensions as (rows, columns).
frame.shape

(50, 1)

In [116]:
# Build the practice series (50..99) directly as floats: NaN is a float
# value, so an integer column would need an implicit upcast when the gaps
# are written below, and pandas deprecates that silent upcast on assignment.
frame = pd.DataFrame({'Price': np.arange(50, 100, dtype=float)})

# Blank out single points and short runs to simulate missing observations.
frame.loc[[12, 4, 45, 46, 47, 34, 33], 'Price'] = np.nan
print(frame)

# Forward fill: each NaN takes the previous valid value. `.ffill()` replaces
# the deprecated `fillna(method='ffill', ...)` spelling; limit=None keeps
# the default of filling runs of any length.
price_forward_filled = frame['Price'].ffill(limit=None)
price_forward_filled

    Price
0    50.0
1    51.0
2    52.0
3    53.0
4     NaN
5    55.0
6    56.0
7    57.0
8    58.0
9    59.0
10   60.0
11   61.0
12    NaN
13   63.0
14   64.0
15   65.0
16   66.0
17   67.0
18   68.0
19   69.0
20   70.0
21   71.0
22   72.0
23   73.0
24   74.0
25   75.0
26   76.0
27   77.0
28   78.0
29   79.0
30   80.0
31   81.0
32   82.0
33    NaN
34    NaN
35   85.0
36   86.0
37   87.0
38   88.0
39   89.0
40   90.0
41   91.0
42   92.0
43   93.0
44   94.0
45    NaN
46    NaN
47    NaN
48   98.0
49   99.0


0     50.0
1     51.0
2     52.0
3     53.0
4     53.0
5     55.0
6     56.0
7     57.0
8     58.0
9     59.0
10    60.0
11    61.0
12    61.0
13    63.0
14    64.0
15    65.0
16    66.0
17    67.0
18    68.0
19    69.0
20    70.0
21    71.0
22    72.0
23    73.0
24    74.0
25    75.0
26    76.0
27    77.0
28    78.0
29    79.0
30    80.0
31    81.0
32    82.0
33    82.0
34    82.0
35    85.0
36    86.0
37    87.0
38    88.0
39    89.0
40    90.0
41    91.0
42    92.0
43    93.0
44    94.0
45    94.0
46    94.0
47    94.0
48    98.0
49    99.0
Name: Price, dtype: float64