In [93]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.linear_model import LinearRegression
from datetime import datetime
import holidays

In [94]:
def clean(df, df_name):
    """Report and remove incomplete rows.

    Prints how many rows of ``df`` contain at least one missing value
    (labelled with ``df_name``), then returns a new frame without those rows.
    The frame passed in is left untouched.
    """
    has_gap = df.isna().any(axis="columns")
    n_incomplete = has_gap.sum()
    print(f"({df_name}) : Number of lines with missing values: {n_incomplete}")
    return df.dropna()

def _encode_dates_bike(df):
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["hour"] = df["date"].dt.hour
    df["weekday"] = df["date"].dt.weekday
    df['IsWeekend'] = df['weekday'].apply(lambda x: 1 if x >= 5 else 0)

    vacances = holidays.CountryHoliday('France', years=[2020, 2021])
    vacances_dates = pd.to_datetime(list(vacances.keys())).date
    df["IsHolidays"] = df["date"].isin(vacances_dates).astype(int)

    return df

def _encode_dates_meteo(df, min_date, max_date):   
    df = df[(df['DATE'] > min_date) & (df['DATE'] < max_date)]
    df['DATE'] = pd.to_datetime(df['DATE'])
    df["year"] = df["DATE"].dt.year
    df["month"] = df["DATE"].dt.month
    df["day"] = df["DATE"].dt.day
    return df

In [95]:
def transform_data(raw_bike_counter_data, raw_data_meteo):
    """Join bike counter records with daily weather into one feature table.

    Steps, in order:
      1. Copy the bike data, drop incomplete rows and add calendar features.
      2. Copy the weather data, remove the ``TEMPERATURE_NIGHT_C``,
         ``SUNRISE`` and ``SUNSET`` columns, drop incomplete rows and restrict
         it to the date span of the bike data.
      3. Merge both on ``year``/``month``/``day`` (``pd.merge`` default, i.e.
         an inner join).
      4. Remove identifier/location columns, plus ``bike_count`` when present.

    Neither input frame is modified; the merged frame is returned.
    """
    day_keys = ['year', 'month', 'day']
    iso_day = '%Y-%m-%d'

    # --- bike side -------------------------------------------------------
    bikes = clean(raw_bike_counter_data.copy(), 'bike_counter_data')
    bikes = _encode_dates_bike(bikes)

    span_start = bikes['date'].min().strftime(iso_day)
    span_end = bikes['date'].max().strftime(iso_day)

    # --- weather side ----------------------------------------------------
    weather = raw_data_meteo.copy().drop(
        columns=['TEMPERATURE_NIGHT_C', 'SUNRISE', 'SUNSET']
    )
    weather = clean(weather, 'data_meteo')
    weather = _encode_dates_meteo(weather, span_start, span_end)

    # --- join and prune --------------------------------------------------
    joined = pd.merge(bikes, weather, on=day_keys)

    discard = [
        "day",
        "counter_name",
        "site_name",
        "DATE",
        "counter_installation_date",
        "coordinates",
        "counter_technical_id",
        "latitude",
        "longitude",
    ]
    # ``bike_count`` is optional in the input, so only drop it when it exists.
    if 'bike_count' in joined.columns:
        discard.insert(0, 'bike_count')

    return joined.drop(columns=discard)



In [96]:
# Raw inputs: two bike counter extracts (parquet) and one weather export (CSV).
bike_dir = Path("data")
raw_bike_counter_data = pd.read_parquet(bike_dir / "train.parquet")
raw_meteo_data = pd.read_csv('external_data/export-paris0.csv')
raw_bike_counter_test_data = pd.read_parquet(bike_dir / "final_test.parquet")

# Both extracts go through the same pipeline against the same weather table.
train_data = transform_data(
    raw_bike_counter_data=raw_bike_counter_data,
    raw_data_meteo=raw_meteo_data,
)
public_test_data = transform_data(
    raw_bike_counter_data=raw_bike_counter_test_data,
    raw_data_meteo=raw_meteo_data,
)

(bike_counter_data) : Number of lines with missing values: 0
(data_meteo) : Number of lines with missing values: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DATE'] = pd.to_datetime(df['DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"] = df["DATE"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["DATE"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

(bike_counter_data) : Number of lines with missing values: 0
(data_meteo) : Number of lines with missing values: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DATE'] = pd.to_datetime(df['DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"] = df["DATE"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["DATE"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

In [97]:
#train_data.info()
#public_test_data.info()