In [2]:
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

dataset = pd.read_csv("weather-data.csv")
# change the "valid" column value type from object to datetime
dataset['valid'] = pd.to_datetime(dataset['valid'])
dataset.head() # Show the first five results.


Unnamed: 0,station,valid,tmpc,dwpc,relh,sknt,gust,peak_wind_drct
0,NZAA,2015-01-01 00:00:00,21.0,11.0,52.77,15.0,,
1,NZAA,2015-01-01 00:30:00,21.0,10.0,49.37,16.0,,
2,NZAA,2015-01-01 01:00:00,21.0,12.0,56.38,16.0,,
3,NZAA,2015-01-01 01:30:00,21.0,13.0,60.21,16.0,,
4,NZAA,2015-01-01 02:00:00,21.0,12.0,56.38,16.0,,


In [4]:
# We do not need the peak_wind_drct so remove them.
# We also dont need gust because it didnot happend very often based on EDA so we will remove this colomn as well
dataset.drop(columns=["peak_wind_drct", "gust"], inplace=True)
dataset.head()

Unnamed: 0,station,valid,tmpc,dwpc,relh,sknt
0,NZAA,2015-01-01 00:00:00,21.0,11.0,52.77,15.0
1,NZAA,2015-01-01 00:30:00,21.0,10.0,49.37,16.0
2,NZAA,2015-01-01 01:00:00,21.0,12.0,56.38,16.0
3,NZAA,2015-01-01 01:30:00,21.0,13.0,60.21,16.0
4,NZAA,2015-01-01 02:00:00,21.0,12.0,56.38,16.0


In [18]:
#For our future prediction we dont need temperature each 30 minutes so to reduce these ammount of data we should split "valid" column into 2 columns: date and time
#dataset['date'] = pd.to_datetime(dataset['valid']).dt.date
#dataset['time'] = pd.to_datetime(dataset['valid']).dt.time
dataset.head()
# Now we can remove"valid" column as well
#dataset.drop(columns=["valid"], inplace=True)
#dataset.head()

Unnamed: 0,station,tmpc,sknt,Date,Time
0,NZAA,21.0,15.0,2015-01-01,00:00:00
1,NZAA,21.0,16.0,2015-01-01,00:30:00
2,NZAA,21.0,16.0,2015-01-01,01:00:00
3,NZAA,21.0,16.0,2015-01-01,01:30:00
4,NZAA,21.0,16.0,2015-01-01,02:00:00


In [19]:
#To predict weather I think should be enought Temperature of the environment in celsius, Wind Speed in knots and date time, so I will remove other columns
dataset.drop(columns=["dwpc", "relh"], inplace=True)


In [21]:
dataset.head()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103713 entries, 0 to 103712
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0    station  103713 non-null  object 
 1   tmpc      103678 non-null  float64
 2   sknt      103704 non-null  float64
 3   Date      103713 non-null  object 
 4   Time      103713 non-null  object 
dtypes: float64(2), object(3)
memory usage: 4.0+ MB


In [26]:
# I want predict only based temperature and wind on midday so we will drop other rows
#dataset = dataset.loc[dataset["Time"] != "12:00:00"]  

In [30]:
dataset['Time'] = dataset['Time'].astype("string")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103713 entries, 0 to 103712
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0    station  103713 non-null  object 
 1   tmpc      103678 non-null  float64
 2   sknt      103704 non-null  float64
 3   Date      103713 non-null  object 
 4   Time      103713 non-null  string 
dtypes: float64(2), object(2), string(1)
memory usage: 4.7+ MB


In [31]:
# I want predict only based temperature and wind on midday so we will drop other rows
dataset = dataset.loc[dataset["Time"] != "12:00:00"]  

In [32]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101542 entries, 0 to 103712
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0    station  101542 non-null  object 
 1   tmpc      101508 non-null  float64
 2   sknt      101533 non-null  float64
 3   Date      101542 non-null  object 
 4   Time      101542 non-null  string 
dtypes: float64(2), object(2), string(1)
memory usage: 4.6+ MB


In [33]:
dataset = pd.DataFrame(dataset)
dataset=dataset.dropna()


In [35]:
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101499 entries, 0 to 103712
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0    station  101499 non-null  object 
 1   tmpc      101499 non-null  float64
 2   sknt      101499 non-null  float64
 3   Date      101499 non-null  object 
 4   Time      101499 non-null  string 
dtypes: float64(2), object(2), string(1)
memory usage: 4.6+ MB


Unnamed: 0,station,tmpc,sknt,Date,Time
0,NZAA,21.0,15.0,2015-01-01,00:00:00
1,NZAA,21.0,16.0,2015-01-01,00:30:00
2,NZAA,21.0,16.0,2015-01-01,01:00:00
3,NZAA,21.0,16.0,2015-01-01,01:30:00
4,NZAA,21.0,16.0,2015-01-01,02:00:00


In [37]:
# I want predict only based temperature and wind on midday so we will drop other rows
dataset = dataset.loc[dataset["Time"] != "12:00:00"] 

In [40]:

dataset.drop(columns=[" station"], inplace=True)

In [41]:
dataset.info()
dataset.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 101499 entries, 0 to 103712
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   tmpc    101499 non-null  float64
 1   sknt    101499 non-null  float64
 2   Date    101499 non-null  object 
 3   Time    101499 non-null  string 
dtypes: float64(2), object(1), string(1)
memory usage: 3.9+ MB


Unnamed: 0,tmpc,sknt,Date,Time
0,21.0,15.0,2015-01-01,00:00:00
1,21.0,16.0,2015-01-01,00:30:00
2,21.0,16.0,2015-01-01,01:00:00
3,21.0,16.0,2015-01-01,01:30:00
4,21.0,16.0,2015-01-01,02:00:00


In [48]:
# I choose only 14:30 time for predict weather
dataset = dataset.loc[dataset["Time"] == "14:30:00"] 

dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2165 entries, 29 to 103694
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tmpc    2165 non-null   float64
 1   sknt    2165 non-null   float64
 2   Date    2165 non-null   object 
 3   Time    2165 non-null   string 
dtypes: float64(2), object(1), string(1)
memory usage: 84.6+ KB


Unnamed: 0,tmpc,sknt,Date,Time
29,14.0,3.0,2015-01-01,14:30:00
77,17.0,7.0,2015-01-02,14:30:00
125,15.0,1.0,2015-01-03,14:30:00
173,17.0,0.0,2015-01-04,14:30:00
221,17.0,4.0,2015-01-05,14:30:00


In [51]:
#dataset.drop(columns=["Time"], inplace=True)
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2165 entries, 29 to 103694
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tmpc    2165 non-null   float64
 1   sknt    2165 non-null   float64
 2   Date    2165 non-null   object 
dtypes: float64(2), object(1)
memory usage: 67.7+ KB


Unnamed: 0,tmpc,sknt,Date
29,14.0,3.0,2015-01-01
77,17.0,7.0,2015-01-02
125,15.0,1.0,2015-01-03
173,17.0,0.0,2015-01-04
221,17.0,4.0,2015-01-05
