# Train / Validation / Test Split

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo 

# Let pandas render up to 1000 rows before truncating displayed frames/series.
pd.set_option("display.max_rows", 1000)

In [2]:
# Fetch UCI repository dataset id=360 and work on a private copy of its
# feature table so later column assignments do not touch the fetched object.
air_quality = fetch_ucirepo(id=360)
df = air_quality.data.features.copy(deep=True)

# Quick look at the first rows and the overall dimensions.
display(df.head(5))
print("Shape:", df.shape)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


Shape: (9357, 15)


In [3]:
# Parse the calendar date and build a full per-row timestamp.
#
# The raw 'Date' strings are month-first (e.g. "3/10/2004" is 10 March 2004),
# so the format is stated explicitly instead of relying on inference.
DATE_FORMAT = "%m/%d/%Y"

# Guard for re-runs: after the first execution 'Date' is already datetime and
# must not be parsed (or string-concatenated) again.
if not pd.api.types.is_datetime64_any_dtype(df['Date']):
    df['Date'] = pd.to_datetime(df['Date'], format=DATE_FORMAT, errors='coerce')

# 'Date' holds midnight of each day; adding the "H:MM:SS" time of day as a
# timedelta yields the timestamp without parsing the date part a second time.
df['date_time'] = df['Date'] + pd.to_timedelta(df['Time'], errors='coerce')

# Values that could not be parsed were coerced to NaT; show how many there are.
df[['Date','date_time']].isnull().sum()

Date         0
date_time    0
dtype: int64

In [4]:
# Put the rows in chronological order and renumber the index from 0, so the
# date-based split below cuts contiguous blocks of rows.
df = (
    df
    .sort_values('date_time')
    .reset_index(drop=True)
)

# Show the earliest and latest three records.
display(df.head(3), df.tail(3))

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
0,2004-03-10,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578,2004-03-10 18:00:00
1,2004-03-10,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255,2004-03-10 19:00:00
2,2004-03-10,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502,2004-03-10 20:00:00


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
9354,2005-04-04,12:00:00,2.4,1142,-200,12.4,1063,293,603,175,1241,1092,26.9,18.3,0.6406,2005-04-04 12:00:00
9355,2005-04-04,13:00:00,2.1,1003,-200,9.5,961,235,702,156,1041,770,28.3,13.5,0.5139,2005-04-04 13:00:00
9356,2005-04-04,14:00:00,2.2,1071,-200,11.9,1047,265,654,168,1129,816,28.5,13.1,0.5028,2005-04-04 14:00:00


In [5]:
# Chronological split on the calendar date.
# Each boundary is the last day (inclusive) of its partition and is defined
# once, so the three masks cannot drift apart when a date is edited.
TRAIN_END = pd.Timestamp("2004-10-31")
VALID_END = pd.Timestamp("2004-12-31")

is_train = df['Date'] <= TRAIN_END
is_valid = (df['Date'] > TRAIN_END) & (df['Date'] <= VALID_END)
is_test = df['Date'] > VALID_END

df_train = df[is_train].reset_index(drop = True)
df_valid = df[is_valid].reset_index(drop = True)
df_test = df[is_test].reset_index(drop = True)

# A row whose 'Date' is NaT matches none of the masks and would be dropped
# silently; fail loudly instead of losing data.
assert len(df_train) + len(df_valid) + len(df_test) == len(df), \
    "train/valid/test do not cover every row (missing 'Date' values?)"

In [6]:
# Training partition: first/last three rows and its dimensions.
display(df_train.head(3), df_train.tail(3))
print("Shape: ", df_train.shape)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
0,2004-03-10,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578,2004-03-10 18:00:00
1,2004-03-10,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255,2004-03-10 19:00:00
2,2004-03-10,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502,2004-03-10 20:00:00


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
5643,2004-10-31,21:00:00,4.1,1318,-200,17.5,1226,521,543,83,1848,1365,20.0,73.2,1.6965,2004-10-31 21:00:00
5644,2004-10-31,22:00:00,2.7,1229,-200,13.9,1114,352,598,76,1723,1247,19.7,73.7,1.6708,2004-10-31 22:00:00
5645,2004-10-31,23:00:00,2.6,1187,-200,13.6,1105,321,611,68,1700,1183,19.9,72.6,1.6681,2004-10-31 23:00:00


Shape:  (5646, 16)


In [7]:
# Validation partition: first/last three rows and its dimensions.
display(df_valid.head(3), df_valid.tail(3))
print("Shape: ", df_valid.shape)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
0,2004-11-01,0:00:00,3.2,1353,-200,15.9,1176,318,584,69,1723,2150,20.1,71.3,1.6564,2004-11-01 00:00:00
1,2004-11-01,1:00:00,3.7,1407,-200,17.8,1235,338,548,77,1780,2519,20.1,71.1,1.6498,2004-11-01 01:00:00
2,2004-11-01,2:00:00,3.5,1333,-200,16.7,1201,353,552,68,1767,1925,19.6,73.2,1.6543,2004-11-01 02:00:00


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
1461,2004-12-31,21:00:00,-200.0,974,-200,5.5,790,-200,915,-200,895,951,9.1,32.5,0.3766,2004-12-31 21:00:00
1462,2004-12-31,22:00:00,-200.0,1055,-200,5.6,791,-200,845,-200,936,1195,9.1,37.2,0.431,2004-12-31 22:00:00
1463,2004-12-31,23:00:00,-200.0,1003,-200,4.6,744,-200,882,-200,899,1138,7.8,38.4,0.4085,2004-12-31 23:00:00


Shape:  (1464, 16)


In [8]:
# Test partition: first/last three rows and its dimensions.
display(df_test.head(3), df_test.tail(3))
print("Shape: ", df_test.shape)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
0,2005-01-01,0:00:00,-200.0,1046,-200,4.2,724,-200,848,-200,898,1201,8.2,40.1,0.4375,2005-01-01 00:00:00
1,2005-01-01,1:00:00,1.6,1275,-200,8.8,930,215,649,106,1024,1617,5.3,50.7,0.4564,2005-01-01 01:00:00
2,2005-01-01,2:00:00,2.5,1173,-200,7.5,878,300,738,129,1002,1355,5.9,50.0,0.4689,2005-01-01 02:00:00


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
2244,2005-04-04,12:00:00,2.4,1142,-200,12.4,1063,293,603,175,1241,1092,26.9,18.3,0.6406,2005-04-04 12:00:00
2245,2005-04-04,13:00:00,2.1,1003,-200,9.5,961,235,702,156,1041,770,28.3,13.5,0.5139,2005-04-04 13:00:00
2246,2005-04-04,14:00:00,2.2,1071,-200,11.9,1047,265,654,168,1129,816,28.5,13.1,0.5028,2005-04-04 14:00:00


Shape:  (2247, 16)


In [9]:
# Report each partition's share of all rows, as a fraction rounded to 4 decimals.
total_rows = len(df)
for label, part in (
    ("%Training Dataset            :", df_train),
    ("%Validation Dataset Shape    :", df_valid),
    ("%Test Dataset Shape          :", df_test),
):
    print(label, round(len(part) / total_rows, 4))

%Training Dataset            : 0.6034
%Validation Dataset Shape    : 0.1565
%Test Dataset Shape          : 0.2401


In [10]:
# Write the partitions to CSV in the working directory, without the index column.
# NOTE(review): df_test is not written in the visible part of this cell —
# confirm whether it is saved further down or intentionally held back.
for frame, path in (
    (df_train, "train_dataset.csv"),
    (df_valid, "validation_dataset.csv"),
):
    frame.to_csv(path, index=False)