In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw train and test tables from the local dataset directory.
train_df = pd.read_csv(filepath_or_buffer='dataset/train.csv')
test_df = pd.read_csv(filepath_or_buffer='dataset/test.csv')

In [3]:
# Preview the first five rows of the training table.
train_df.head(n=5)

Unnamed: 0,ID,date,device,site_latitude,site_longitude,humidity,temp_mean,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,pm2_5
0,ID_JBYXJEUZ8X,2020-03-18,ANQ16PZJ,0.353465,32.560322,0.846844,22.038542,-0.000138,1.042629,-0.000143,...,4181.792969,69589.453125,3181.793457,7.799846,0.278111,76.618553,9.970293,-93.110069,19.214193,63.853333
1,ID_PEJJNLBUPR,2019-10-13,AB6051M4,0.390741,32.582257,0.874493,21.545833,,,,...,,,,,,,,,,32.507083
2,ID_21L8UZQO1T,2019-12-21,A0WN66FH,0.285751,32.578325,0.72934,23.568655,,,,...,4440.391113,67341.109375,3440.390869,62.49659,0.416631,-97.300758,63.690605,-127.856804,40.857922,80.290833
3,ID_GNW2G8J4T8,2020-02-20,aq_41,-1.244985,29.989236,0.919271,16.338542,-0.000111,0.750228,-8.3e-05,...,,,,,,,,,,42.601687
4,ID_4XOX2V6SK0,2019-04-17,aq_41,-1.244985,29.989236,0.858365,16.602083,-0.000127,0.792634,-0.000101,...,4842.057488,64347.788635,3842.057585,4.971117,0.277157,74.786025,32.58307,-57.273638,21.864529,89.642229


In [4]:
# Number of missing values in every column of the training table.
train_df.isnull().sum()

ID                               0
date                             0
device                           0
site_latitude                    0
site_longitude                   0
                              ... 
Cloud_sensor_azimuth_angle    1509
Cloud_sensor_zenith_angle     1509
Cloud_solar_azimuth_angle     1509
Cloud_solar_zenith_angle      1509
pm2_5                            0
Length: 71, dtype: int64

In [5]:
# Preview the first five rows of the test table.
test_df.head(n=5)

Unnamed: 0,ID,date,device,site_latitude,site_longitude,humidity,temp_mean,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,...,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
0,ID_UOH62J0XHX,2020-10-02,aq_43,0.28904,32.58958,0.75684,23.849653,0.000124,0.839054,0.000104,...,36716.141728,8325.718024,41969.524699,7325.718482,2.632129,0.21043,74.431975,35.44977,-101.546713,21.00606
1,ID_4OPWSB0UHJ,2020-12-07,aq_54,0.3564,32.573,0.810854,22.075347,,,,...,63086.023438,4060.879639,70871.984375,3060.879639,2.773685,0.344436,-97.300392,63.489811,-125.937134,41.709042
2,ID_3SLMNNG1Z3,2020-11-16,aq_59,0.381576,32.647109,0.883253,20.050694,,,,...,46783.82082,6456.160528,53084.224103,5456.160528,3.862119,0.326882,-97.699519,58.82412,-120.683819,40.18368
3,ID_MFUHTXYPM4,2020-09-05,AW66FF7V,0.379562,32.553714,0.782583,22.248958,0.000176,0.730853,0.000129,...,45528.261719,6682.570801,51503.203125,5682.570801,1.937542,0.212181,75.361595,24.835907,-72.224319,20.675266
4,ID_TY1DAND8ZP,2020-12-15,aq_45,0.344,32.553,0.770116,22.565217,,,,...,65512.797785,3757.637022,73461.827324,2757.637022,3.733908,0.262021,76.53708,11.026513,-139.07262,32.04636


In [6]:
# Number of missing values in every column of the test table.
test_df.isnull().sum()

ID                              0
date                            0
device                          0
site_latitude                   0
site_longitude                  0
                             ... 
Cloud_surface_albedo          515
Cloud_sensor_azimuth_angle    515
Cloud_sensor_zenith_angle     515
Cloud_solar_azimuth_angle     515
Cloud_solar_zenith_angle      515
Length: 70, dtype: int64

# Create date features

In [7]:
def create_date_features(df: pd.DataFrame, date_feature_column: str = 'date', drop_features: list = None) -> pd.DataFrame:
    ''' Replace a date column with numeric "year", "month" and "day" features.

        The date column is parsed with pandas datetime parsing, its calendar
        components are stored as float64 columns, and the date column itself is
        removed. The input dataframe is not modified: a new dataframe is
        returned, so the function can be called repeatedly on the same input
        (e.g. when a notebook cell is re-run).

        Parameters:
        - df: pd.DataFrame
            dataframe that contains the date column
        - date_feature_column: str
            name of the date column; it is used to create the new date features
            and is not present in the returned dataframe
        - drop_features: list, optional
            additional column names to remove from the returned dataframe;
            None (the default) or an empty list removes nothing extra

        Returns:
        - pd.DataFrame
            a new dataframe with the "year", "month" and "day" columns and
            without the date column and the columns in drop_features
    '''
    parsed_dates = pd.to_datetime(df[date_feature_column])

    # assign() works on a copy, so the caller's dataframe keeps its date column.
    with_date_parts = df.assign(
        year=parsed_dates.dt.year.astype(dtype='float64'),
        month=parsed_dates.dt.month.astype(dtype='float64'),
        day=parsed_dates.dt.day.astype(dtype='float64')
    )

    # Normalise drop_features without relying on its truthiness, so a single
    # column name or any list-like of names is accepted.
    if drop_features is None:
        extra_columns = []
    elif isinstance(drop_features, str):
        extra_columns = [drop_features]
    else:
        extra_columns = list(drop_features)

    # The new columns are created before dropping, so the drop list may refer
    # to them as well (same order of operations as before).
    return with_date_parts.drop(columns=[date_feature_column, *extra_columns])

In [8]:
# Expand the date column into year/month/day and remove the 'ID' and 'device'
# columns, for the train and the test table alike.
train_data = create_date_features(train_df, drop_features=['ID', 'device'])
test_data = create_date_features(test_df, drop_features=['ID', 'device'])

In [9]:
def fill_na_values(df: pd.DataFrame, target: str = '') -> pd.DataFrame:
    ''' Fill the missing values in the categorical and numerical features of a dataframe.

        Categorical features (object or string dtype) get their missing values
        replaced with the string "None".
        Every other feature is treated as numerical and gets its missing values
        replaced with the median of that column, computed on this dataframe.
        For each numerical feature that has at least one missing value, a new
        column named "<feature>_na" is added, holding 1 where the value was
        missing and 0 otherwise. A column that is entirely missing still gets
        the indicator, but stays unfilled because its median is undefined.

        The target column is neither filled nor flagged. The input dataframe is
        not modified; a new dataframe is returned.

        Parameters:
        - df: pd.DataFrame
        - target: str
            target column name in the dataframe, only use this for a dataframe
            with a target column

        Returns:
        - pd.DataFrame
            a new dataframe with the missing values filled and the indicator
            columns appended
    '''
    excluded = [target]
    result = df.copy()

    feature_columns = [column for column in result.columns if column not in excluded]

    categorical_features = [
        column for column in feature_columns
        if pd.api.types.is_object_dtype(result[column].dtype)
        or pd.api.types.is_string_dtype(result[column].dtype)
    ]
    for cf in categorical_features:
        # Assign the filled column back: an inplace fillna on `result[cf]` is
        # chained assignment and does not reliably update the dataframe.
        result[cf] = result[cf].fillna(value='None')

    numerical_features = [
        column for column in feature_columns
        if column not in categorical_features
    ]
    for nf in numerical_features:
        is_missing = result[nf].isna()
        # A single missing value is enough to require imputation.
        if not is_missing.any():
            continue
        median_value = result[nf].median()
        result[f'{nf}_na'] = np.where(is_missing, 1, 0)
        result[nf] = result[nf].fillna(value=median_value)

    return result

In [10]:
# Impute each table separately; only the train table carries the 'pm2_5' target,
# which is excluded from imputation.
train_data = fill_na_values(train_data, target='pm2_5')
test_data = fill_na_values(test_data)

In [11]:
# (rows, columns) of the train and test tables after imputation.
tuple(frame.shape for frame in (train_data, test_data))

((9923, 135), (4254, 133))

In [12]:
# List the columns that exist in the train table but not in the test table.
train_only_columns = [
    column_name
    for column_name in train_data.columns
    if column_name not in test_data.columns
]
for column_name in train_only_columns:
    print(column_name)

pm2_5
temp_mean_na


# Note

The `temp_mean` feature has missing values in the train data but not in the test data, so the missing-value indicator column `temp_mean_na` is created only for the train data. That indicator column is dropped from the train data so that both datasets contain the same features.

In [13]:
# Remove the indicator column that exists only in the train table.
train_data.drop(columns=['temp_mean_na'], inplace=True)

In [14]:
# (rows, columns) of the train and test tables after dropping the train-only indicator.
tuple(frame.shape for frame in (train_data, test_data))

((9923, 134), (4254, 133))

In [15]:
import os

# Resolve the output directory under the current working directory and make
# sure it exists before anything is written to it.
cwd = os.getcwd()
cleaned_dataset_dir = os.path.join(cwd, 'cleaned_dataset')
os.makedirs(name=cleaned_dataset_dir, exist_ok=True)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible.
# NOTE: train_data is both the input and an output of this cell, so running the
# cell a second time splits the already reduced train_data again.
train_data, valid_data = train_test_split(train_data, test_size=0.2, random_state=2022)

In [17]:
# (rows, columns) of the train and validation tables after the split.
tuple(frame.shape for frame in (train_data, valid_data))

((7938, 134), (1985, 134))

In [18]:
# Write the train split to the cleaned dataset directory, without the row index.
train_csv_path = os.path.join(cleaned_dataset_dir, 'train.csv')
train_data.to_csv(train_csv_path, index=False)

In [19]:
# Write the validation split to the cleaned dataset directory, without the row index.
valid_csv_path = os.path.join(cleaned_dataset_dir, 'valid.csv')
valid_data.to_csv(valid_csv_path, index=False)

In [20]:
# Write the test table to the cleaned dataset directory, without the row index.
test_csv_path = os.path.join(cleaned_dataset_dir, 'test.csv')
test_data.to_csv(test_csv_path, index=False)