### Import your libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import warnings
# NOTE(review): this silences every warning for the rest of the session, which also hides
# any deprecation warnings pandas/NumPy may emit in later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

### Load your dataset

In [2]:
# Read the Seattle weather CSV into the working DataFrame and preview it.
df = pd.read_csv(filepath_or_buffer='seattle_weather.csv')
df

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,0.47,51,42,True
1,1948-01-02,0.59,45,36,True
2,1948-01-03,0.42,45,35,True
3,1948-01-04,0.31,45,34,True
4,1948-01-05,0.17,45,32,True
...,...,...,...,...,...
25546,2017-12-10,0.00,49,34,False
25547,2017-12-11,0.00,49,29,False
25548,2017-12-12,0.00,46,32,False
25549,2017-12-13,0.00,48,34,False


In [3]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

DATE    0
PRCP    3
TMAX    0
TMIN    0
RAIN    3
dtype: int64

In [4]:
# Show the rows where the precipitation reading is missing.
df.loc[df['PRCP'].isna()]

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
18415,1998-06-02,,72,52,
18416,1998-06-03,,66,51,
21067,2005-09-05,,70,52,


### Handle missing data in PRCP and RAIN columns

In [5]:
# The data is time dependent, so a missing value is filled with the previous day's value
# (forward fill).
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')` because newer pandas
# releases deprecate the `method=` keyword of `fillna`.
df = df.ffill()

### Repeat the following steps for each column with outliers

* Visualize the outliers
* Detect if your data set contains any outliers
* Identify the lower & upper limits for outliers
* Drop all outliers
* Visualize the column after processing


In [6]:
# Summary statistics for the numeric columns (quartiles spelled out explicitly).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PRCP,TMAX,TMIN
count,25551.0,25551.0,25551.0
mean,0.106209,59.544206,44.514226
std,0.23902,12.772984,8.892836
min,0.0,4.0,0.0
25%,0.0,50.0,38.0
50%,0.0,58.0,45.0
75%,0.1,69.0,52.0
max,5.02,103.0,71.0


In [7]:
# Box plot of the current PRCP values, used to eyeball potential outliers.
box1 = px.box(
    x=df['PRCP'],
    labels=dict(x='PRCP'),
)
box1.show()

In [8]:
# IQR fences for PRCP: values more than 1.5 * IQR below Q1 or above Q3 count as outliers.
# Series.quantile replaces np.percentile(..., interpolation=...) because NumPy deprecated
# that keyword (renamed to `method` in 1.22); the midpoint quantiles are the same.
Q1, Q3 = df['PRCP'].quantile([0.25, 0.75], interpolation='midpoint')
IQR = Q3 - Q1
lb = Q1 - 1.5 * IQR
hb = Q3 + 1.5 * IQR
print('lower bound outliers ', lb)
print('higher bound outliers', hb)

lower bound outliers  -0.15000000000000002
higher bound outliers 0.25


In [9]:
# Remove every row whose PRCP falls outside the current `lb`/`hb` bounds,
# then redraw the box plot on the filtered data.
prcp_outlier_mask = (df['PRCP'] < lb) | (df['PRCP'] > hb)
df = df.drop(index=df.loc[prcp_outlier_mask].index)
box1 = px.box(x=df['PRCP'], labels=dict(x='PRCP'))
box1.show()

In [10]:
# Box plot of the current TMAX values, used to eyeball potential outliers.
box2 = px.box(
    x=df['TMAX'],
    labels=dict(x='TMAX'),
)
box2.show()

In [11]:
# IQR fences for TMAX: values more than 1.5 * IQR below Q1 or above Q3 count as outliers.
# Series.quantile replaces np.percentile(..., interpolation=...) because NumPy deprecated
# that keyword (renamed to `method` in 1.22); the midpoint quantiles are the same.
Q1, Q3 = df['TMAX'].quantile([0.25, 0.75], interpolation='midpoint')
IQR = Q3 - Q1
lb = Q1 - 1.5 * IQR
hb = Q3 + 1.5 * IQR
print('lower bound outliers ', lb)
print('higher bound outliers', hb)

lower bound outliers  20.0
higher bound outliers 100.0


In [12]:
# Remove every row whose TMAX falls outside the current `lb`/`hb` bounds,
# then redraw the box plot on the filtered data.
tmax_outlier_mask = (df['TMAX'] < lb) | (df['TMAX'] > hb)
df = df.drop(index=df.loc[tmax_outlier_mask].index)
box2 = px.box(x=df['TMAX'], labels=dict(x='TMAX'))
box2.show()

In [13]:
# Box plot of the current TMIN values, used to eyeball potential outliers.
box3 = px.box(
    x=df['TMIN'],
    labels=dict(x='TMIN'),
)
box3.show()

In [14]:
# IQR fences for TMIN: values more than 1.5 * IQR below Q1 or above Q3 count as outliers.
# Series.quantile replaces np.percentile(..., interpolation=...) because NumPy deprecated
# that keyword (renamed to `method` in 1.22); the midpoint quantiles are the same.
Q1, Q3 = df['TMIN'].quantile([0.25, 0.75], interpolation='midpoint')
IQR = Q3 - Q1
lb = Q1 - 1.5 * IQR
hb = Q3 + 1.5 * IQR
print('lower bound outliers ', lb)
print('higher bound outliers', hb)

lower bound outliers  17.0
higher bound outliers 73.0


In [15]:
# Remove every row whose TMIN falls outside the current `lb`/`hb` bounds,
# then redraw the box plot on the filtered data.
tmin_outlier_mask = (df['TMIN'] < lb) | (df['TMIN'] > hb)
df = df.drop(index=df.loc[tmin_outlier_mask].index)
box3 = px.box(x=df['TMIN'], labels=dict(x='TMIN'))
box3.show()