In [2]:
import pandas as pd

In [4]:
# Load the raw semicolon-separated file; rows the parser cannot handle are skipped.
data = pd.read_csv('AirQuality.csv', sep=';', on_bad_lines='skip')

In [6]:
# Drop the placeholder "Unnamed" columns when they are present; errors='ignore'
# makes this a no-op for any of them that is missing.
spurious_columns = ["Unnamed: 15", "Unnamed: 16"]
data_cleaned = data.drop(columns=spurious_columns, errors='ignore')

In [8]:
# Fix the decimal separator in the listed columns: rewrite ',' as '.' and
# convert the result to floating-point numbers.
columns_to_fix = ["CO(GT)", "C6H6(GT)", "T", "RH", "AH"]

for col in columns_to_fix:
    # Columns that are absent from the frame are skipped.
    if col not in data_cleaned.columns:
        continue
    # Go through str so the replacement also works if the column was already
    # parsed as numeric; ',' is replaced literally (regex=False).
    as_text = data_cleaned[col].astype(str).str.replace(',', '.', regex=False)
    # errors='coerce' turns unparsable entries into NaN. The earlier
    # astype(float, errors='ignore') handed the column back unchanged (as text)
    # as soon as a single value failed to convert, without any warning.
    data_cleaned[col] = pd.to_numeric(as_text, errors='coerce').astype(float)

In [28]:
# Remove rows with a missing value in the "Date" or "Time" column.
# The result is assigned back rather than using inplace=True, so this cell
# rebinds data_cleaned the same way the other cleaning cells do.
data_cleaned = data_cleaned.dropna(subset=["Date", "Time"])

In [30]:
# Discard rows that are exact repeats of an earlier row, keeping the first occurrence.
data_cleaned = data_cleaned.drop_duplicates(keep='first')

In [32]:
# Parse 'Date' as day/month/year ('%d/%m/%Y').
# errors='coerce' turns entries that do not match into NaT (Not a Time).
parsed_dates = pd.to_datetime(
    data_cleaned['Date'],
    format='%d/%m/%Y',
    errors='coerce',
)
data_cleaned['Date'] = parsed_dates

In [34]:
# Parse 'Time' with the format '%H.%M.%S' and keep only the time-of-day part.
# Entries that cannot be parsed become NaT because of errors='coerce'.
#
# The values are first rendered as text with ':' rewritten to '.'. After one
# run the column holds time objects that print as 'HH:MM:SS'; without this
# normalisation, re-running the cell would fail to match the format and
# coerce every row to NaT without any warning.
time_text = data_cleaned['Time'].astype(str).str.replace(':', '.', regex=False)
data_cleaned['Time'] = pd.to_datetime(time_text, format='%H.%M.%S', errors='coerce').dt.time

In [36]:
# Write the cleaned frame to 'CleanedAirQuality.csv'; index=False leaves the row index out of the file.
data_cleaned.to_csv(path_or_buf='CleanedAirQuality.csv', index=False)

In [26]:
# Preview the first five rows of the cleaned frame.
data_cleaned.head(n=5)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,2004-03-10,19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,2004-03-10,20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,2004-03-10,21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,2004-03-10,22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
