# Data Exploration
The goal of this notebook is to understand the Istanbul weather dataset, clean it, remove unnecessary attributes, and prepare it for training the deep learning model.

In [48]:
# Import libraries
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [49]:
# Read the raw weather CSV into a DataFrame and preview the first rows
raw_csv_path = '/content/istanbul_weather_last_10_years.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2015-06-22,20.6,18.3,22.9,,,,25.9,,,
1,2015-06-23,21.8,19.0,25.0,,,,20.2,,,
2,2015-06-24,21.8,19.0,25.8,,,,10.2,,,
3,2015-06-25,21.9,17.0,26.1,,,,11.9,,,
4,2015-06-26,22.3,19.5,26.0,,,,11.1,,,


In [50]:
# Keep only the attributes needed from here on; .copy() detaches the result
# from `df`, so later edits to new_df never touch the raw frame
needed_columns = ['time', 'tavg', 'tmin', 'tmax', 'wspd']
new_df = df.loc[:, needed_columns].copy()
new_df.head()

Unnamed: 0,time,tavg,tmin,tmax,wspd
0,2015-06-22,20.6,18.3,22.9,25.9
1,2015-06-23,21.8,19.0,25.0,20.2
2,2015-06-24,21.8,19.0,25.8,10.2
3,2015-06-25,21.9,17.0,26.1,11.9
4,2015-06-26,22.3,19.5,26.0,11.1


In [51]:
# Give the columns descriptive names (old name -> new name)
column_names = {
    'time': 'date',
    'tavg': 'avg_temp',
    'tmin': 'min_temp',
    'tmax': 'max_temp',
    'wspd': 'wind_speed',
}
new_df = new_df.rename(columns=column_names)
new_df.head()

Unnamed: 0,date,avg_temp,min_temp,max_temp,wind_speed
0,2015-06-22,20.6,18.3,22.9,25.9
1,2015-06-23,21.8,19.0,25.0,20.2
2,2015-06-24,21.8,19.0,25.8,10.2
3,2015-06-25,21.9,17.0,26.1,11.9
4,2015-06-26,22.3,19.5,26.0,11.1


In [52]:
# Count the missing values in each column
new_df.isna().sum()

Unnamed: 0,0
date,0
avg_temp,0
min_temp,3
max_temp,3
wind_speed,0


In [53]:
# Forward fill: replace each missing value with the last known value before it,
# then re-count the missing values to confirm none remain
new_df = new_df.ffill()
new_df.isna().sum()

Unnamed: 0,0
date,0
avg_temp,0
min_temp,0
max_temp,0
wind_speed,0


In [54]:
# Count the rows that repeat an earlier row across all columns
duplicate_rows = new_df.duplicated()
duplicate_rows.sum()

np.int64(0)

In [55]:
# Check data types: list the dtype of every column so that any column
# needing conversion can be spotted
new_df.dtypes

Unnamed: 0,0
date,object
avg_temp,float64
min_temp,float64
max_temp,float64
wind_speed,float64


In [56]:
# Parse the `date` strings into datetime values, replacing the object-typed
# column, then show the dtypes again to confirm the conversion
new_df = new_df.assign(date=pd.to_datetime(new_df['date']))
new_df.dtypes

Unnamed: 0,0
date,datetime64[ns]
avg_temp,float64
min_temp,float64
max_temp,float64
wind_speed,float64


In [57]:
# Use the `date` column as the row index
new_df = new_df.set_index('date')
new_df.head()

Unnamed: 0_level_0,avg_temp,min_temp,max_temp,wind_speed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-06-22,20.6,18.3,22.9,25.9
2015-06-23,21.8,19.0,25.0,20.2
2015-06-24,21.8,19.0,25.8,10.2
2015-06-25,21.9,17.0,26.1,11.9
2015-06-26,22.3,19.5,26.0,11.1


In [58]:
# Save the cleaned dataset as CSV text; the date index is written as the first column.
# NOTE(review): the output name has no `.csv` extension — confirm what any
# downstream reader expects before renaming it.
output_path = 'cleaned_weather_data'
new_df.to_csv(output_path, index=True)