# Train Validation Test Split

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo 

# Let pandas render up to 1000 rows before truncating a displayed frame/series.
pd.set_option("display.max_rows", 1000)

In [2]:
# Fetch the dataset with UCI ML Repository id 360 and work on a private copy of its
# feature table, so later column edits never touch the object returned by the fetcher.
air_quality = fetch_ucirepo(id=360)
raw_features = air_quality.data.features
df = raw_features.copy()

# Quick sanity check: first five rows plus the overall (rows, columns) shape.
display(df.head(5))
print("Shape:", df.shape)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


Shape: (9357, 15)


In [3]:
# Build a full timestamp from the raw 'Date' (month/day/year) and 'Time' (h:mm:ss) strings.
# An explicit format keeps parsing deterministic: without it pandas has to guess whether
# a value such as "3/10/2004" means March 10 or October 3.
# NOTE: this cell is meant to run once per kernel -- once 'Date' has been converted to
# datetimes below, concatenating it with a string is expected to fail on a re-run.
df['date_time'] = pd.to_datetime(
    df['Date'] + " " + df['Time'],
    format="%m/%d/%Y %H:%M:%S",
    errors='coerce',
)
df['Date'] = pd.to_datetime(df['Date'], format="%m/%d/%Y", errors='coerce')

# errors='coerce' turns unparseable values into NaT; report how many there are per column.
df[['Date','date_time']].isnull().sum()

Date         0
date_time    0
dtype: int64

In [4]:
# Put the rows in chronological order (ascending is the default) and renumber the index from 0.
df = df.sort_values('date_time').reset_index(drop=True)

# Peek at both ends of the ordered frame to confirm the covered time span.
display(df.head(3), df.tail(3))

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
0,2004-03-10,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578,2004-03-10 18:00:00
1,2004-03-10,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255,2004-03-10 19:00:00
2,2004-03-10,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502,2004-03-10 20:00:00


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
9354,2005-04-04,12:00:00,2.4,1142,-200,12.4,1063,293,603,175,1241,1092,26.9,18.3,0.6406,2005-04-04 12:00:00
9355,2005-04-04,13:00:00,2.1,1003,-200,9.5,961,235,702,156,1041,770,28.3,13.5,0.5139,2005-04-04 13:00:00
9356,2005-04-04,14:00:00,2.2,1071,-200,11.9,1047,265,654,168,1129,816,28.5,13.1,0.5028,2005-04-04 14:00:00


In [5]:
# Chronological split boundaries: the last calendar day (inclusive) of the training
# and validation partitions. Everything after VALID_END goes to the test partition.
TRAIN_END = pd.Timestamp("2004-11-30")
VALID_END = pd.Timestamp("2005-01-31")

# Boolean row masks, one per partition, based on the calendar-day 'Date' column.
is_train = df['Date'] <= TRAIN_END
is_valid = (df['Date'] > TRAIN_END) & (df['Date'] <= VALID_END)
is_test = df['Date'] > VALID_END

df_train = df[is_train].reset_index(drop=True)
df_valid = df[is_valid].reset_index(drop=True)
df_test = df[is_test].reset_index(drop=True)

# Rows whose 'Date' is missing satisfy none of the comparisons above and would be
# dropped silently; fail loudly instead of producing an incomplete split.
assert len(df_train) + len(df_valid) + len(df_test) == len(df), \
    "train/valid/test partitions do not cover every row of df"

In [6]:
# Training partition: first and last three rows, followed by its (rows, columns) shape.
display(df_train.head(3), df_train.tail(3))
train_shape = df_train.shape
print("Shape: ", train_shape)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
0,2004-03-10,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578,2004-03-10 18:00:00
1,2004-03-10,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255,2004-03-10 19:00:00
2,2004-03-10,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502,2004-03-10 20:00:00


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
6363,2004-11-30,21:00:00,4.1,1157,-200,12.4,1063,491,905,160,1394,1223,10.7,73.9,0.9502,2004-11-30 21:00:00
6364,2004-11-30,22:00:00,2.3,1061,-200,8.0,899,284,1067,117,1277,939,9.6,83.0,0.9899,2004-11-30 22:00:00
6365,2004-11-30,23:00:00,1.9,1006,-200,6.5,836,245,1226,113,1210,902,9.6,79.9,0.9579,2004-11-30 23:00:00


Shape:  (6366, 16)


In [7]:
# Validation partition: first and last three rows, followed by its (rows, columns) shape.
display(df_valid.head(3), df_valid.tail(3))
valid_shape = df_valid.shape
print("Shape: ", valid_shape)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
0,2004-12-01,0:00:00,2.2,1039,-200,7.5,880,273,1198,113,1251,933,9.8,79.4,0.9626,2004-12-01 00:00:00
1,2004-12-01,1:00:00,1.3,886,-200,3.8,699,157,1900,94,1106,766,10.1,75.3,0.9322,2004-12-01 01:00:00
2,2004-12-01,2:00:00,1.2,900,-200,3.8,701,117,1955,76,1127,749,9.1,80.0,0.9299,2004-12-01 02:00:00


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
1485,2005-01-31,21:00:00,3.7,1283,-200,15.0,1149,586,587,235,1148,1799,6.3,39.4,0.3795,2005-01-31 21:00:00
1486,2005-01-31,22:00:00,2.1,1109,-200,7.1,863,360,737,208,960,1527,5.4,43.5,0.3922,2005-01-31 22:00:00
1487,2005-01-31,23:00:00,1.4,1055,-200,5.4,782,231,798,161,940,1377,5.1,45.6,0.4056,2005-01-31 23:00:00


Shape:  (1488, 16)


In [8]:
# Test partition: first and last three rows, followed by its (rows, columns) shape.
display(df_test.head(3), df_test.tail(3))
test_shape = df_test.shape
print("Shape: ", test_shape)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
0,2005-02-01,0:00:00,1.3,1038,-200,4.8,756,200,813,146,921,1317,4.2,49.4,0.412,2005-02-01 00:00:00
1,2005-02-01,1:00:00,1.1,996,-200,3.8,698,153,871,123,894,1228,4.1,50.1,0.415,2005-02-01 01:00:00
2,2005-02-01,2:00:00,1.0,995,-200,3.5,680,135,879,109,904,1201,4.1,51.5,0.4263,2005-02-01 02:00:00


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,date_time
1500,2005-04-04,12:00:00,2.4,1142,-200,12.4,1063,293,603,175,1241,1092,26.9,18.3,0.6406,2005-04-04 12:00:00
1501,2005-04-04,13:00:00,2.1,1003,-200,9.5,961,235,702,156,1041,770,28.3,13.5,0.5139,2005-04-04 13:00:00
1502,2005-04-04,14:00:00,2.2,1071,-200,11.9,1047,265,654,168,1129,816,28.5,13.1,0.5028,2005-04-04 14:00:00


Shape:  (1503, 16)


In [9]:
# Report each partition's share of the full dataset as a fraction of its row count,
# rounded to four decimals (values are fractions in [0, 1], not percentages).
total_rows = len(df)
for label, part in (
    ("%Training Dataset            :", df_train),
    ("%Validation Dataset Shape    :", df_valid),
    ("%Test Dataset Shape          :", df_test),
):
    print(label, round(len(part) / total_rows, 4))

%Training Dataset            : 0.6803
%Validation Dataset Shape    : 0.159
%Test Dataset Shape          : 0.1606


In [10]:
# Persist the partitions as CSV files without the positional index column.
# The validation file goes to the sibling 'phase_3_model_prediction' directory.
# Forward slashes are used because Python accepts them on Windows as well as POSIX;
# a backslash-separated path would be treated as one literal file name on POSIX systems.
df_train.to_csv("train_dataset.csv", index=False)
df_valid.to_csv("../phase_3_model_prediction/validation_dataset.csv", index=False)