In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Load the raw readings (the file is read as Latin-1), parse the 'Date Time'
# strings with the explicit day.month.year format, and index the frame by the
# parsed timestamps.
data = (
    pd.read_csv("../Code/Dataset/mpi_roof.csv", encoding='latin-1')
    .assign(**{'Date Time': lambda frame: pd.to_datetime(frame['Date Time'], format='%d.%m.%Y %H:%M:%S')})
    .set_index("Date Time")
)

# Report whether any cell of the loaded frame is missing.
is_nan = data.isnull().values.any()
print("There exist null values" if is_nan else "No null values")

# Consistency checks between the recorded rain amount and the raining duration.
rain_mm = data['rain (mm)']
raining_s = data['raining (s)']

# Is there any row with a positive raining duration but zero recorded rain?
checker = bool(((rain_mm == 0) & (raining_s > 0)).any())
print(
    "There exist a case when having no record of rain (0mm) but the raining(s)"
    if checker
    else "There is no case when having no record of rain (0mm) but the raining(s)"
)

# The opposite situation: rain was recorded although the raining duration is zero.
checker_2 = bool(((rain_mm > 0) & (raining_s == 0)).any())
print("There exist a case" if checker_2 else "There is no case")

# Aggregate the readings into 6-hour windows using the mean of every column.
# A window that contains no readings comes out of resample().mean() as NaN.
resample_data = data.resample('6h').mean()

# Potential temperature converted from Kelvin to degrees Celsius.
resample_data['Tpot (degC)'] = resample_data['Tpot (K)'] - 273.15
# To keep only the Celsius version, drop the Kelvin column:
# resample_data = resample_data.drop(columns=['Tpot (K)'])

# Rain intensity while it was raining: rain amount (mm) divided by the raining
# duration (s), scaled to mm/h. Because both columns are window means, their
# ratio equals total rain over total raining time for the window.
mean_rain_mm = resample_data['rain (mm)']
mean_raining_s = resample_data['raining (s)']
has_rain_duration = mean_raining_s > 0
rain_rate_mm_h = (
    (mean_rain_mm * 3600)
    .div(mean_raining_s.where(has_rain_duration))  # never divide by a zero duration
    .where(has_rain_duration, 0.0)                 # no raining time -> rate 0
    .mask(mean_raining_s.isna())                   # no observation -> keep NaN, do not invent 0
)
resample_data['Rain Rate (mm/h)'] = rain_rate_mm_h
# NOTE(review): windows with rain (mm) > 0 but raining (s) == 0 still get rate 0
# and are labelled as dry; the raw-data check reports such rows - confirm this
# is the intended treatment.

# Lower edges (mm/h) of the rain classes; the first edge is also the
# rain / no-rain threshold, so both labels always agree.
rain_rate_edges = [0.5, 2, 6, 10, 18, 30]
rain_condition = ['No Rain', 'Weak Rain', 'Moderate Rain', 'Heavy Rain', 'Very Heavy Rain', 'Shower', 'Cloudburst']

# NaN compares False on both sides, so windows without data fall through to 'Unknown'.
resample_data['Is Rain'] = np.select(
    [rain_rate_mm_h >= rain_rate_edges[0], rain_rate_mm_h < rain_rate_edges[0]],
    ['Yes', 'No'],
    default='Unknown',
)

# One boolean mask per class: below the first edge, between consecutive edges
# (left-closed, right-open), and at or above the last edge.
rain_rate = (
    [rain_rate_mm_h < rain_rate_edges[0]]
    + [(rain_rate_mm_h >= low) & (rain_rate_mm_h < high)
       for low, high in zip(rain_rate_edges, rain_rate_edges[1:])]
    + [rain_rate_mm_h >= rain_rate_edges[-1]]
)
resample_data['Rain Type'] = np.select(rain_rate, rain_condition, default='Unknown')

resample_data.to_csv("resampled.csv")

# Preprocessing: standardise the numeric columns and one-hot encode the rain labels.
# NOTE(review): both transformers are fitted on the complete frame; if a
# train/test split happens later, fitting here leaks test statistics - confirm.
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False)

# Name the label columns once and use the same list for both the drop and the
# encoding. Re-discovering them with select_dtypes(include=['object']) depends on
# how the installed pandas version stores strings and can disagree with the
# hard-coded drop list.
categorical_columns = ['Is Rain', 'Rain Type']
regression_data = resample_data.drop(columns=categorical_columns)

# One indicator column per category, aligned to the original time index.
one_hot_encoded = encoder.fit_transform(resample_data[categorical_columns])
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=resample_data.index,
)

# Scale every remaining column and restore its labels and index.
regression_data_scaler = scaler.fit_transform(regression_data)
regression_data_df = pd.DataFrame(
    regression_data_scaler,
    columns=regression_data.columns,
    index=regression_data.index,
)

# Scaled numeric features first, one-hot label columns after them.
preprocessed_data = pd.concat([regression_data_df, one_hot_df], axis=1)
preprocessed_data.to_csv("preprocessed.csv")
preprocessed_data.head()


No null values
There exist a case when having no record of rain (0mm) but the raining(s)
There exist a case


Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),...,Rain Rate (mm/h),Is Rain_No,Is Rain_Yes,Rain Type_Cloudburst,Rain Type_Heavy Rain,Rain Type_Moderate Rain,Rain Type_No Rain,Rain Type_Shower,Rain Type_Very Heavy Rain,Rain Type_Weak Rain
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-01 00:00:00,-1.03891,-0.607535,-0.495156,-0.753116,-0.172449,-0.683671,-0.816243,-0.354583,-0.79787,-0.798365,...,-0.216147,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2024-01-01 06:00:00,-0.97722,-0.653649,-0.546302,-0.73843,-0.025738,-0.713803,-0.806097,-0.41065,-0.788777,-0.788717,...,-0.216147,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2024-01-01 12:00:00,-0.864557,-0.569601,-0.474746,-0.667503,-0.089449,-0.657749,-0.752443,-0.36999,-0.736995,-0.737199,...,-0.216147,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2024-01-01 18:00:00,-0.635749,-0.875138,-0.796193,-0.725994,0.610058,-0.850614,-0.795767,-0.632994,-0.783864,-0.784152,...,-0.005845,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2024-01-02 00:00:00,-0.864705,-0.928105,-0.826135,-0.643314,0.972406,-0.88221,-0.734726,-0.735488,-0.719308,-0.719306,...,-0.119468,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
