In [34]:
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
%matplotlib inline

# Read the raw weather observations, then parse the "valid" column into
# datetime values so date/time parts can be extracted from it.
dataset = pd.read_csv("weather-data.csv")
dataset = dataset.assign(valid=lambda frame: pd.to_datetime(frame["valid"]))
dataset.head()  # preview the first five rows


Unnamed: 0,station,valid,tmpc,dwpc,relh,sknt,gust,peak_wind_drct
0,NZAA,2015-01-01 00:00:00,21.0,11.0,52.77,15.0,,
1,NZAA,2015-01-01 00:30:00,21.0,10.0,49.37,16.0,,
2,NZAA,2015-01-01 01:00:00,21.0,12.0,56.38,16.0,,
3,NZAA,2015-01-01 01:30:00,21.0,13.0,60.21,16.0,,
4,NZAA,2015-01-01 02:00:00,21.0,12.0,56.38,16.0,,


In [35]:
# Only the timestamp ("valid") and the temperature ("tmpc") are needed to
# predict temperature by day. Everything else is discarded: peak_wind_drct and
# gust (gusts were rare according to the EDA), station, dwpc, relh and sknt.
#
# The columns to keep are selected explicitly rather than dropping the others
# by name. This does not depend on the exact spelling of the discarded labels
# and, unlike an in-place drop, the cell can be re-run without a KeyError.
# .copy() gives an independent frame so later column assignments do not act
# on a slice of the raw data.
dataset = dataset[["valid", "tmpc"]].copy()
dataset.head()

Unnamed: 0,valid,tmpc
0,2015-01-01 00:00:00,21.0
1,2015-01-01 00:30:00,21.0
2,2015-01-01 01:00:00,21.0
3,2015-01-01 01:30:00,21.0
4,2015-01-01 02:00:00,21.0


In [36]:
# A reading every 30 minutes is more than the prediction needs. Split "valid"
# into separate "date" and "time" columns so the rows can be thinned out by
# time of day later on.
dataset = dataset.assign(
    date=lambda frame: pd.to_datetime(frame["valid"]).dt.date,
    time=lambda frame: pd.to_datetime(frame["valid"]).dt.time,
)
dataset.head()


Unnamed: 0,valid,tmpc,date,time
0,2015-01-01 00:00:00,21.0,2015-01-01,00:00:00
1,2015-01-01 00:30:00,21.0,2015-01-01,00:30:00
2,2015-01-01 01:00:00,21.0,2015-01-01,01:00:00
3,2015-01-01 01:30:00,21.0,2015-01-01,01:30:00
4,2015-01-01 02:00:00,21.0,2015-01-01,02:00:00


In [37]:
# "valid" has been split into "date" and "time", so the original column can go.
# Rebinding to the returned frame (instead of inplace=True) avoids mutating a
# frame that earlier cells already displayed.
dataset = dataset.drop(columns="valid")
# Only the last expression of a cell is rendered, so the summary is the single
# output here; a preceding head() call would be discarded.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103713 entries, 0 to 103712
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   tmpc    103678 non-null  float64
 1   date    103713 non-null  object 
 2   time    103713 non-null  object 
dtypes: float64(1), object(2)
memory usage: 2.4+ MB


In [38]:
# Goal: predict the midday temperature from midday readings only, so every
# other row will be dropped. After the split, "time" holds object-dtype
# values; cast it to the pandas string dtype first so rows can be matched
# against a time written as text.
dataset = dataset.astype({"time": "string"})
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103713 entries, 0 to 103712
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   tmpc    103678 non-null  float64
 1   date    103713 non-null  object 
 2   time    103713 non-null  string 
dtypes: float64(1), object(1), string(1)
memory usage: 2.4+ MB


In [39]:
# Keep a single reading per day: the one taken at 12:00:00.
dataset = dataset.loc[lambda frame: frame["time"].eq("12:00:00")]
dataset.head()

Unnamed: 0,tmpc,date,time
24,15.0,2015-01-01,12:00:00
72,17.0,2015-01-02,12:00:00
120,16.0,2015-01-03,12:00:00
168,18.0,2015-01-04,12:00:00
216,18.0,2015-01-05,12:00:00


In [40]:
# Split the "date" column into three numeric columns: year, month and day.
# The dates are parsed once and reused for all three parts.
parsed_dates = pd.to_datetime(dataset["date"])

# assign() returns a new frame, so nothing is written onto the filtered frame
# produced by the previous step. The "date" column itself is kept here.
dataset = dataset.assign(
    yyyy=parsed_dates.dt.year,
    mm=parsed_dates.dt.month,
    dd=parsed_dates.dt.day,
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2171 entries, 24 to 103689
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tmpc    2170 non-null   float64
 1   date    2171 non-null   object 
 2   time    2171 non-null   string 
 3   yyyy    2171 non-null   int64  
 4   mm      2171 non-null   int64  
 5   dd      2171 non-null   int64  
dtypes: float64(1), int64(3), object(1), string(1)
memory usage: 118.7+ KB


In [41]:
# Year, month and day now live in their own columns, so "date" is redundant,
# and "time" is no longer needed either.
obsolete_columns = ["date", "time"]
dataset = dataset.drop(columns=obsolete_columns)
dataset.head()

Unnamed: 0,tmpc,yyyy,mm,dd
24,15.0,2015,1,1
72,17.0,2015,1,2
120,16.0,2015,1,3
168,18.0,2015,1,4
216,18.0,2015,1,5


In [42]:
# The prediction is made per day, so the year column is not used.
dataset = dataset.drop("yyyy", axis="columns")
dataset.head()

Unnamed: 0,tmpc,mm,dd
24,15.0,1,1
72,17.0,1,2
120,16.0,1,3
168,18.0,1,4
216,18.0,1,5


In [43]:
# Restrict the data to a single calendar month; August is used as the example.
# Change TARGET_MONTH to study another month (values match the "mm" column).
TARGET_MONTH = 8  # August

# .copy() makes the result independent of the full frame, so later in-place
# edits act on this subset only.
dataset = dataset.loc[dataset["mm"] == TARGET_MONTH].copy()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 185 entries, 9885 to 97889
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tmpc    185 non-null    float64
 1   mm      185 non-null    int64  
 2   dd      185 non-null    int64  
dtypes: float64(1), int64(2)
memory usage: 5.8 KB


In [45]:
# All remaining rows belong to the same month, so the month column carries no
# information any more and can be removed as well.
dataset = dataset.drop(columns="mm")
dataset.head()

Unnamed: 0,tmpc,dd
9885,10.0,1
9933,14.0,2
9977,14.0,3
10058,14.0,5
10098,11.0,6


In [46]:
# Final check of the prepared frame: column names, dtypes and non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 185 entries, 9885 to 97889
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tmpc    185 non-null    float64
 1   dd      185 non-null    int64  
dtypes: float64(1), int64(1)
memory usage: 4.3 KB
