In [45]:
import dask as da
import dask.dataframe as daskdf
import dask.array as daa
import dask.distributed as dd
import dask.datasets as ds
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import seaborn as sns
import sklearn as sk
import numpy as np
import dask_ml.preprocessing as dm_pre
import dask_ml.cluster as dm_cluster
import dask_geopandas as dg
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import geopandas as gpd
from scipy.stats import pearsonr
import time

In [50]:
"""data look like this: ;mapped_veh_id;timestamps_UTC;lat;lon;RS_E_InAirTemp_PC1;RS_E_InAirTemp_PC2;RS_E_OilPress_PC1;RS_E_OilPress_PC2;RS_E_RPM_PC1;RS_E_RPM_PC2;RS_E_WatTemp_PC1;RS_E_WatTemp_PC2;RS_T_OilTemp_PC1;RS_T_OilTemp_PC2
0;181;2023-08-01 03:44:12;50.7698183;3.8721144;27.0;23.0;255.0;238.0;794.0;801.0;83.0;81.0;76.0;77.0
1;143;2023-08-01 06:36:29;51.0399934;3.6934285;33.0;32.0;272.0;324.0;802.0;804.0;78.0;78.0;73.0;74.0
2;183;2023-08-24 06:53:54;50.7422026;3.6020347;31.0;33.0;234.0;182.0;799.0;802.0;82.0;82.0;85.0;87.0"""
# Load data
def load_data():
    print("Loading data...")
    start = time.time()
    df = daskdf.read_csv("ar41_for_ulb.csv", sep=';')
    end = time.time()
    print("Data loaded in {} seconds".format(end-start))
    return df

ddf = load_data()
ddf = ddf.drop(columns=['Unnamed: 0'])
# convert all temperatures from celcius to kelvin
ddf['RS_E_InAirTemp_PC1'] = ddf['RS_E_InAirTemp_PC1'] + 273.15
ddf['RS_E_InAirTemp_PC2'] = ddf['RS_E_InAirTemp_PC2'] + 273.15
ddf['RS_E_WatTemp_PC1'] = ddf['RS_E_WatTemp_PC1'] + 273.15
ddf['RS_E_WatTemp_PC2'] = ddf['RS_E_WatTemp_PC2'] + 273.15
ddf['RS_T_OilTemp_PC1'] = ddf['RS_T_OilTemp_PC1'] + 273.15
ddf['RS_T_OilTemp_PC2'] = ddf['RS_T_OilTemp_PC2'] + 273.15


Loading data...
Data loaded in 0.045198917388916016 seconds


# Phase 3: Data Preparation
This phase covers constructing the final data set for modeling tools. Steps include selecting data, cleansing data, constructing data, integrating data, and formatting data.

Select Data:

Decide on data based on relevance, quality, and technical constraints.
Explain data inclusions/exclusions and prioritize attributes.
Clean Data:

Ensure data cleanliness, address missing values using appropriate techniques.
Document how quality problems were addressed.
Construct Data:

Develop new records or derived attributes, considering modeling needs.
Example: Create "income per head" as a derived attribute.
Integrate Data:

Combine information from multiple tables or records.
Perform aggregations to summarize information.
Format Data:

Change data format or design to suit modeling tools.
Example: Trim strings, reorganize information.

In [52]:
# Select Data: Filtering out all data before january 2023
def select_data(df):
    ddf = df[df['timestamps_UTC'] >= '2023-01-01']
    # to drop the erratic data and keep only those in 10-90 percentile for every feature
    for col in ddf.columns:
        if col not in ['timestamps_UTC', 'lat', 'lon', 'mapped_veh_id']:
            ddf = ddf[(ddf[col] > ddf[col].quantile(0.001)) & (ddf[col] < ddf[col].quantile(0.999))]

    return ddf

ddfprep = select_data(ddf)
#index fix 
ddfprep = ddfprep.reset_index(drop=True)



In [53]:
# AS seen in exploratory analysis, we have some no duplicates at global level and nan values only for PC2 on 12726 rows over 17M rows so we can drop them
#ddf = ddf.drop_duplicates()
#ddf = ddf.dropna()

print (len (ddfprep)) 
#print(ddf.head())

## TODO: Construct Data if needed here 

15589036


In [57]:
# we will cluster the lan and lon and associate to each row the name of a city centroid of the cluster 

# first city = Charleroi 
# if lat < 50.522345 city_centroid = Charleroi
ddf = ddf.set_index('lat')  # Set 'lat' as the index

ddf['city'] = daskdf.from_array(np.where(ddf.index < 50.522345,
                                     np.where(ddf['lon'] < 4.586875, 'Charleroi', 'Namur'),
                                     np.where(ddf['lon'] < 4.073960, 'Gent',
                                              np.where((4.073960 < ddf['lon']) & (ddf['lon'] < 4.786476),
                                                       np.where(ddf.index < 50.522345, 'Antwerp', 'Brussels'),
                                                       'Hasselt'))))

ValueError: operands could not be broadcast together with shapes (nan,) (17679273,) (nan,)

In [None]:
# Integrate external data 
# here we will join external weather data with our data 
# for that we will perform join over the city and timestamp.

# Load external data
def load_external_data():
    print("Loading external data...")
    start = time.time()
    df = daskdf.read_csv("weather.csv")
    end = time.time()
    print("External data loaded in {} seconds".format(end-start))
    return df

external_df = load_external_data()

# Convert 'timestamps_UTC' to a datetime object for better handling
external_df['timestamps_UTC'] = dd.to_datetime(external_df['timestamps_UTC'])

# create index for join
ddf['timestamps_UTC'] = dd.to_datetime(ddf['timestamps_UTC'])
ddf['date'] = ddf['timestamps_UTC'].dt.date
ddf['date'] = dd.to_datetime(ddf['date'])
ddf = ddf.set_index(['city', 'date'])

external_df['date'] = dd.to_datetime(external_df['date'])
external_df = external_df.set_index(['city', 'date'])

# Join external data with our data
ddf = ddf.join(external_df, how='left') # this will join data from external_df to ddf on the index 

# we will fill the nan values with the mean of the column
if ddf.isnull().values.any():
    print(" nan finded  will be filled with the mean of the column")
    ddf = ddf.fillna(ddf.mean())

# we will drop the columns that we don't need anymore
print(ddf.head())
ddf = ddf.drop(columns=['lat', 'lon', 'date', 'hour', 'city','longitude', 'latitude' ])
print(ddf.head())

# we will drop the duplicates
if ddf.duplicated().values.any():
    print("duplicates finded  will be dropped")
    ddf = ddf.drop_duplicates()



Loading external data...


NameError: name 'time' is not defined

# Phase Four: Modeling. 
This phase involves selecting, applying, and tuning various statistical or machine learning models to your prepared data. Here's what typically happens in this phase:

In [None]:
# Modeling techniques selection

