<h1>Detecting Missing Values</h1>

In [6]:
import numpy as np
import pandas as pd

# Load the weather dataset and summarise each column's dtype and non-null count.
df = pd.read_csv('./Data/weather_m4.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 14 columns):
YYYYMMDD           8784 non-null int64
TIME               8784 non-null int64
TEMP               8784 non-null int64
MIN_TEMP_GROUND    1464 non-null float64
PRESSURE           8784 non-null int64
HUMIDITY           8784 non-null int64
VIEW_RANGE         8743 non-null float64
CLOUD              8743 non-null float64
MIST               8741 non-null float64
RAIN               8741 non-null float64
SNOW               8741 non-null float64
THUNDER            8741 non-null float64
ICE                8741 non-null float64
WEATHER_CODE       3862 non-null float64
dtypes: float64(9), int64(5)
memory usage: 960.8 KB


In [7]:
# Preview the first 15 rows of four of the columns that have missing values.
df.loc[:, ['MIN_TEMP_GROUND', 'VIEW_RANGE', 'CLOUD', 'WEATHER_CODE']].head(15)

Unnamed: 0,MIN_TEMP_GROUND,VIEW_RANGE,CLOUD,WEATHER_CODE
0,,56.0,4.0,10.0
1,,57.0,2.0,10.0
2,,56.0,7.0,23.0
3,,57.0,7.0,51.0
4,,56.0,7.0,22.0
5,3.0,9.0,0.0,34.0
6,,5.0,0.0,34.0
7,,1.0,0.0,34.0
8,,2.0,0.0,32.0
9,,5.0,0.0,32.0


In [8]:
# Display a boolean DataFrame that marks every NaN cell with True by calling isnull():
df.isnull()

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE,WEATHER_CODE
0,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,True,False,False,False,False,False,False,False,False,False,False
7,False,False,False,True,False,False,False,False,False,False,False,False,False,False
8,False,False,False,True,False,False,False,False,False,False,False,False,False,False
9,False,False,False,True,False,False,False,False,False,False,False,False,False,False


In [11]:
# Call any() to reduce each column to a single boolean: True if that column contains at least one NaN value:
df.isnull().any()

YYYYMMDD           False
TIME               False
TEMP               False
MIN_TEMP_GROUND     True
PRESSURE           False
HUMIDITY           False
VIEW_RANGE          True
CLOUD               True
MIST                True
RAIN                True
SNOW                True
THUNDER             True
ICE                 True
WEATHER_CODE        True
dtype: bool

In [17]:
# Reduce across the columns instead, giving one boolean per row:
df.isna().any(axis='columns')

0        True
1        True
2        True
3        True
4        True
5       False
6        True
7        True
8        True
9        True
10       True
11      False
12       True
13       True
14       True
15       True
16       True
17      False
18       True
19       True
20       True
21       True
22       True
23       True
24       True
25       True
26       True
27       True
28       True
29      False
        ...  
8754     True
8755     True
8756     True
8757     True
8758     True
8759    False
8760     True
8761     True
8762     True
8763     True
8764     True
8765    False
8766     True
8767     True
8768     True
8769     True
8770     True
8771    False
8772     True
8773     True
8774     True
8775     True
8776     True
8777    False
8778     True
8779     True
8780     True
8781     True
8782     True
8783    False
Length: 8784, dtype: bool

In [18]:
# Because the above is a Series of boolean values, it can be used as a mask to select the rows of the dataset that contain NaN values:
type(df.isnull().any(axis=1))

pandas.core.series.Series

In [20]:
# Keep only the rows in which at least one cell is NaN.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE,WEATHER_CODE
0,20160101,1,68,,10207,87,56.0,4.0,0.0,0.0,0.0,0.0,0.0,10.0
1,20160101,2,58,,10214,92,57.0,2.0,0.0,0.0,0.0,0.0,0.0,10.0
2,20160101,3,57,,10220,92,56.0,7.0,0.0,1.0,0.0,0.0,0.0,23.0
3,20160101,4,60,,10225,93,57.0,7.0,0.0,1.0,0.0,0.0,0.0,51.0
4,20160101,5,45,,10230,95,56.0,7.0,0.0,1.0,0.0,0.0,0.0,22.0
6,20160101,7,11,,10242,99,5.0,0.0,1.0,0.0,0.0,0.0,0.0,34.0
7,20160101,8,7,,10245,100,1.0,0.0,1.0,0.0,0.0,0.0,1.0,34.0
8,20160101,9,2,,10248,99,2.0,0.0,1.0,0.0,0.0,0.0,1.0,32.0
9,20160101,10,20,,10249,100,5.0,0.0,1.0,0.0,0.0,0.0,0.0,32.0
10,20160101,11,72,,10245,88,64.0,0.0,1.0,0.0,0.0,0.0,0.0,20.0


In [24]:
# Scenario: are there any rows that contain only null values?
# all(axis=1) flags the rows whose cells are all NaN; any() then reports whether at least one such row exists:
df.isnull().all(axis=1).any()

False

In [25]:
# Above means no row contains all NaN values

In [26]:
# To find the columns that have no NaN values at all, combine notnull() with all():
df.notnull().all()

YYYYMMDD            True
TIME                True
TEMP                True
MIN_TEMP_GROUND    False
PRESSURE            True
HUMIDITY            True
VIEW_RANGE         False
CLOUD              False
MIST               False
RAIN               False
SNOW               False
THUNDER            False
ICE                False
WEATHER_CODE       False
dtype: bool

In [56]:
# Column MIN_TEMP_GROUND looks like it only has a non-NaN number every 6th entry:
df['MIN_TEMP_GROUND']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        3.0
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11     -17.0
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17      47.0
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23      47.0
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29      48.0
        ... 
8754     NaN
8755     NaN
8756     NaN
8757     NaN
8758     NaN
8759    32.0
8760     NaN
8761     NaN
8762     NaN
8763     NaN
8764     NaN
8765    29.0
8766     NaN
8767     NaN
8768     NaN
8769     NaN
8770     NaN
8771    29.0
8772     NaN
8773     NaN
8774     NaN
8775     NaN
8776     NaN
8777    37.0
8778     NaN
8779     NaN
8780     NaN
8781     NaN
8782     NaN
8783    26.0
Name: MIN_TEMP_GROUND, Length: 8784, dtype: float64

In [71]:
# How to check that every 6th value really is filled in?
# Start by making a list containing all positions in the column where I suspect there should be a number:
every_6th_row_list = list(range(5, len(df), 6))
every_6th_row_list

[5,
 11,
 17,
 23,
 29,
 35,
 41,
 47,
 53,
 59,
 65,
 71,
 77,
 83,
 89,
 95,
 101,
 107,
 113,
 119,
 125,
 131,
 137,
 143,
 149,
 155,
 161,
 167,
 173,
 179,
 185,
 191,
 197,
 203,
 209,
 215,
 221,
 227,
 233,
 239,
 245,
 251,
 257,
 263,
 269,
 275,
 281,
 287,
 293,
 299,
 305,
 311,
 317,
 323,
 329,
 335,
 341,
 347,
 353,
 359,
 365,
 371,
 377,
 383,
 389,
 395,
 401,
 407,
 413,
 419,
 425,
 431,
 437,
 443,
 449,
 455,
 461,
 467,
 473,
 479,
 485,
 491,
 497,
 503,
 509,
 515,
 521,
 527,
 533,
 539,
 545,
 551,
 557,
 563,
 569,
 575,
 581,
 587,
 593,
 599,
 605,
 611,
 617,
 623,
 629,
 635,
 641,
 647,
 653,
 659,
 665,
 671,
 677,
 683,
 689,
 695,
 701,
 707,
 713,
 719,
 725,
 731,
 737,
 743,
 749,
 755,
 761,
 767,
 773,
 779,
 785,
 791,
 797,
 803,
 809,
 815,
 821,
 827,
 833,
 839,
 845,
 851,
 857,
 863,
 869,
 875,
 881,
 887,
 893,
 899,
 905,
 911,
 917,
 923,
 929,
 935,
 941,
 947,
 953,
 959,
 965,
 971,
 977,
 983,
 989,
 995,
 1001,
 1007,
 1013,


In [40]:
every_6th_row = pd.Series(range(5, len(df), 6))


In [73]:
# Recall that a collection of index labels can be used to pull the matching rows out of a DataFrame;
# here .loc selects those rows and the column in a single step:
df.loc[every_6th_row, 'MIN_TEMP_GROUND']

5         3.0
11      -17.0
17       47.0
23       47.0
29       48.0
35       47.0
41       62.0
47       64.0
53       71.0
59       71.0
65       56.0
71       50.0
77       42.0
83       51.0
89       69.0
95       64.0
101      54.0
107      61.0
113      66.0
119      62.0
125      54.0
131      38.0
137      37.0
143      37.0
149      42.0
155      51.0
161      53.0
167      62.0
173      56.0
179      56.0
        ...  
8609     81.0
8615     84.0
8621     85.0
8627     93.0
8633    107.0
8639    100.0
8645     74.0
8651     75.0
8657     61.0
8663     61.0
8669     62.0
8675     40.0
8681     39.0
8687    -21.0
8693    -45.0
8699    -35.0
8705     32.0
8711    -21.0
8717     -5.0
8723     -8.0
8729     -4.0
8735    -17.0
8741    -17.0
8747     -8.0
8753     11.0
8759     32.0
8765     29.0
8771     29.0
8777     37.0
8783     26.0
Name: MIN_TEMP_GROUND, Length: 1464, dtype: float64

In [74]:
# Check all of the above rows are filled:


df	 every_6th_row	 every_6th_row_list	 np	 pd	 


In [75]:
# True only if none of the selected every-6th rows is null.
df.loc[every_6th_row, 'MIN_TEMP_GROUND'].notna().all()

True

In [78]:
# Are all other rows empty?
# Remove the rows listed in every_6th_row using drop(), then check with isnull() that everything left is null:
df['MIN_TEMP_GROUND'].drop(every_6th_row).isnull().all()

True

In [86]:
# Can you rewrite this line to use df.loc?

# df.loc[rows, columns]  (square brackets: loc is an indexer, not a method)
# A boolean row selector needs exactly one entry per row of the frame being indexed;
# a shorter list raises an IndexError. So apply the three-element mask to a three-row slice:
df.head(3).loc[[False, True, True], 'MIN_TEMP_GROUND']

1   NaN
2   NaN
Name: MIN_TEMP_GROUND, dtype: float64

<h1>Handling Missing Values</h1>

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 14 columns):
YYYYMMDD           8784 non-null int64
TIME               8784 non-null int64
TEMP               8784 non-null int64
MIN_TEMP_GROUND    1464 non-null float64
PRESSURE           8784 non-null int64
HUMIDITY           8784 non-null int64
VIEW_RANGE         8743 non-null float64
CLOUD              8743 non-null float64
MIST               8741 non-null float64
RAIN               8741 non-null float64
SNOW               8741 non-null float64
THUNDER            8741 non-null float64
ICE                8741 non-null float64
WEATHER_CODE       3862 non-null float64
dtypes: float64(9), int64(5)
memory usage: 960.8 KB


In [91]:
df.drop(columns='WEATHER_CODE').head(5)

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE
0,20160101,1,68,,10207,87,56.0,4.0,0.0,0.0,0.0,0.0,0.0
1,20160101,2,58,,10214,92,57.0,2.0,0.0,0.0,0.0,0.0,0.0
2,20160101,3,57,,10220,92,56.0,7.0,0.0,1.0,0.0,0.0,0.0
3,20160101,4,60,,10225,93,57.0,7.0,0.0,1.0,0.0,0.0,0.0
4,20160101,5,45,,10230,95,56.0,7.0,0.0,1.0,0.0,0.0,0.0


In [92]:
# Drop WEATHER_CODE from df in place. Per df.info() it has only 3862 non-null values
# out of 8784 rows -- presumably the reason for discarding the whole column.
df.drop(columns='WEATHER_CODE', inplace=True)

In [94]:
df['MIN_TEMP_GROUND'].head(12)

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
5      3.0
6      NaN
7      NaN
8      NaN
9      NaN
10     NaN
11   -17.0
Name: MIN_TEMP_GROUND, dtype: float64

In [95]:
df['MIN_TEMP_GROUND'].head(12).fillna(0)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      3.0
6      0.0
7      0.0
8      0.0
9      0.0
10     0.0
11   -17.0
Name: MIN_TEMP_GROUND, dtype: float64

In [96]:
# Forward fill: each NaN takes the most recent earlier non-null value.
# Leading NaNs stay NaN because nothing precedes them.
# (ffill() replaces the deprecated fillna(method='ffill') spelling.)
df['MIN_TEMP_GROUND'].head(12).ffill()

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
5      3.0
6      3.0
7      3.0
8      3.0
9      3.0
10     3.0
11   -17.0
Name: MIN_TEMP_GROUND, dtype: float64

In [97]:
# Backward fill: each NaN takes the next later non-null value.
# (bfill() replaces the deprecated fillna(method='bfill') spelling.)
df['MIN_TEMP_GROUND'].head(12).bfill()

0      3.0
1      3.0
2      3.0
3      3.0
4      3.0
5      3.0
6    -17.0
7    -17.0
8    -17.0
9    -17.0
10   -17.0
11   -17.0
Name: MIN_TEMP_GROUND, dtype: float64

In [100]:
# Backward-fill the whole column and assign the result back to the frame.
# Assigning explicitly (rather than calling fillna(..., inplace=True) on the
# df['MIN_TEMP_GROUND'] selection) guarantees that df itself is updated, and
# bfill() replaces the deprecated fillna(method='bfill') spelling.
df['MIN_TEMP_GROUND'] = df['MIN_TEMP_GROUND'].bfill()

In [102]:
# MIN_TEMP_GROUND no longer contains null values:
df.isnull().any()

YYYYMMDD           False
TIME               False
TEMP               False
MIN_TEMP_GROUND    False
PRESSURE           False
HUMIDITY           False
VIEW_RANGE          True
CLOUD               True
MIST                True
RAIN                True
SNOW                True
THUNDER             True
ICE                 True
dtype: bool

In [103]:
# pandas.DataFrame.interpolate offers a range of interpolation-based methods for filling in missing values.

In [106]:
# Now look for rows with missing values
df.isnull().any(axis=1)

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
8754    False
8755    False
8756    False
8757    False
8758    False
8759    False
8760    False
8761    False
8762    False
8763    False
8764    False
8765    False
8766    False
8767    False
8768    False
8769    False
8770    False
8771    False
8772    False
8773    False
8774    False
8775    False
8776    False
8777    False
8778    False
8779    False
8780    False
8781    False
8782    False
8783    False
Length: 8784, dtype: bool

In [107]:
df[df.isnull().any(axis=1)]

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE
5306,20160809,3,115,95.0,10207,90,,,,,,,
5307,20160809,4,107,95.0,10206,93,,,,,,,
5308,20160809,5,104,95.0,10210,80,,,,,,,
5309,20160809,6,115,95.0,10210,80,,,,,,,
5310,20160809,7,125,110.0,10211,70,,,,,,,
5311,20160809,8,144,110.0,10217,75,,,,,,,
5312,20160809,9,148,110.0,10222,70,,,,,,,
5313,20160809,10,156,110.0,10228,70,,,,,,,
5314,20160809,11,153,110.0,10230,65,,,,,,,
5315,20160809,12,161,110.0,10231,60,,,,,,,


In [109]:
# Note that in these rows the missing data sits in the last 7 columns.
# To look at the dates on which this occurs, I can use indexing with loc to select only the dates of these rows:
df.loc[df.isnull().any(axis=1), 'YYYYMMDD']

5306    20160809
5307    20160809
5308    20160809
5309    20160809
5310    20160809
5311    20160809
5312    20160809
5313    20160809
5314    20160809
5315    20160809
5316    20160809
5317    20160809
5596    20160821
5597    20160821
5598    20160821
5599    20160821
5600    20160821
5601    20160821
5602    20160821
5603    20160821
5604    20160821
5605    20160821
5606    20160821
5607    20160821
5608    20160821
5609    20160821
5610    20160821
5611    20160821
5612    20160821
5613    20160821
5614    20160821
5615    20160821
5616    20160822
5617    20160822
5618    20160822
5619    20160822
5620    20160822
5621    20160822
5622    20160822
5623    20160822
5624    20160822
5625    20160822
5626    20160822
Name: YYYYMMDD, dtype: int64

In [110]:
# Now list the unique dates, and how many affected rows fall on each, using value_counts():
df.loc[df.isnull().any(axis=1), 'YYYYMMDD'].value_counts()

20160821    20
20160809    12
20160822    11
Name: YYYYMMDD, dtype: int64

In [111]:
# The above shows three days on which values were null.
# To drop all rows containing null values, use dropna()

nulls_dropped = df.dropna()
nulls_dropped

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE
0,20160101,1,68,3.0,10207,87,56.0,4.0,0.0,0.0,0.0,0.0,0.0
1,20160101,2,58,3.0,10214,92,57.0,2.0,0.0,0.0,0.0,0.0,0.0
2,20160101,3,57,3.0,10220,92,56.0,7.0,0.0,1.0,0.0,0.0,0.0
3,20160101,4,60,3.0,10225,93,57.0,7.0,0.0,1.0,0.0,0.0,0.0
4,20160101,5,45,3.0,10230,95,56.0,7.0,0.0,1.0,0.0,0.0,0.0
5,20160101,6,20,3.0,10237,99,9.0,0.0,1.0,0.0,0.0,0.0,0.0
6,20160101,7,11,-17.0,10242,99,5.0,0.0,1.0,0.0,0.0,0.0,0.0
7,20160101,8,7,-17.0,10245,100,1.0,0.0,1.0,0.0,0.0,0.0,1.0
8,20160101,9,2,-17.0,10248,99,2.0,0.0,1.0,0.0,0.0,0.0,1.0
9,20160101,10,20,-17.0,10249,100,5.0,0.0,1.0,0.0,0.0,0.0,0.0


In [113]:
nulls_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8741 entries, 0 to 8783
Data columns (total 13 columns):
YYYYMMDD           8741 non-null int64
TIME               8741 non-null int64
TEMP               8741 non-null int64
MIN_TEMP_GROUND    8741 non-null float64
PRESSURE           8741 non-null int64
HUMIDITY           8741 non-null int64
VIEW_RANGE         8741 non-null float64
CLOUD              8741 non-null float64
MIST               8741 non-null float64
RAIN               8741 non-null float64
SNOW               8741 non-null float64
THUNDER            8741 non-null float64
ICE                8741 non-null float64
dtypes: float64(8), int64(5)
memory usage: 956.0 KB


In [114]:
# The frame now holds 8741 rows, yet its index still runs up to 8783: dropping rows left gaps
# in the index labels. A positional slice makes the jump visible:
nulls_dropped.iloc[5300:5310]

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE
5300,20160808,21,131,124.0,10211,96,65.0,8.0,0.0,0.0,0.0,0.0,0.0
5301,20160808,22,133,124.0,10209,97,65.0,8.0,0.0,0.0,0.0,0.0,0.0
5302,20160808,23,135,124.0,10209,96,65.0,8.0,0.0,0.0,0.0,0.0,0.0
5303,20160808,24,136,124.0,10208,94,65.0,8.0,0.0,0.0,0.0,0.0,0.0
5304,20160809,1,141,95.0,10205,89,70.0,8.0,0.0,0.0,0.0,0.0,0.0
5305,20160809,2,123,95.0,10199,92,65.0,1.0,0.0,0.0,0.0,1.0,0.0
5318,20160809,15,143,135.0,10234,85,59.0,8.0,0.0,1.0,0.0,0.0,0.0
5319,20160809,16,160,135.0,10231,65,80.0,2.0,0.0,1.0,0.0,0.0,0.0
5320,20160809,17,156,135.0,10231,62,80.0,0.0,0.0,0.0,0.0,0.0,0.0
5321,20160809,18,140,135.0,10227,72,75.0,1.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# A useful parameter when using dropna() is thresh; it sets the minimum number of non-null values a row must contain to be kept in my
# dataset
drop_thresh = df.dropna(thresh=7)

In [117]:
drop_thresh[drop_thresh.isnull().any(axis=1)]

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE
5317,20160809,14,167,135.0,10233,65,80.0,2.0,,,,,
5626,20160822,11,193,175.0,10233,88,65.0,8.0,,,,,


In [118]:
# The following are ways to fill in missing data instead of dropping it

In [121]:
rows_to_fill = df.isnull().any(axis=1)
rows_to_fill.head(5)

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [122]:
df[rows_to_fill]

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE
5306,20160809,3,115,95.0,10207,90,,,,,,,
5307,20160809,4,107,95.0,10206,93,,,,,,,
5308,20160809,5,104,95.0,10210,80,,,,,,,
5309,20160809,6,115,95.0,10210,80,,,,,,,
5310,20160809,7,125,110.0,10211,70,,,,,,,
5311,20160809,8,144,110.0,10217,75,,,,,,,
5312,20160809,9,148,110.0,10222,70,,,,,,,
5313,20160809,10,156,110.0,10228,70,,,,,,,
5314,20160809,11,153,110.0,10230,65,,,,,,,
5315,20160809,12,161,110.0,10231,60,,,,,,,


In [124]:
# To fill the missing cells with the mean value of their column:
nulls_filled = df.fillna(df.mean())

In [125]:
nulls_filled[rows_to_fill]

Unnamed: 0,YYYYMMDD,TIME,TEMP,MIN_TEMP_GROUND,PRESSURE,HUMIDITY,VIEW_RANGE,CLOUD,MIST,RAIN,SNOW,THUNDER,ICE
5306,20160809,3,115,95.0,10207,90,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5307,20160809,4,107,95.0,10206,93,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5308,20160809,5,104,95.0,10210,80,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5309,20160809,6,115,95.0,10210,80,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5310,20160809,7,125,110.0,10211,70,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5311,20160809,8,144,110.0,10217,75,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5312,20160809,9,148,110.0,10222,70,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5313,20160809,10,156,110.0,10228,70,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5314,20160809,11,153,110.0,10230,65,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016
5315,20160809,12,161,110.0,10231,60,61.916962,5.476267,0.072989,0.20318,0.001602,0.007093,0.016016


In [127]:
# Alternatively, fill each remaining missing cell with its column's most frequent value
# (the first row of df.mode()); inplace=True modifies df itself.
df.fillna(df.mode().iloc[0], inplace=True)

<h1>Removing Outliers</h1>