In [60]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.linear_model import LinearRegression
from datetime import datetime
import holidays


In [61]:
def clean(df, df_name):
    """Report how many rows have missing values, then drop those rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    df_name : str
        Label inserted in the printed report.

    Returns
    -------
    pandas.DataFrame
        New frame without the rows that contain at least one missing value.
    """
    # A row counts as incomplete as soon as any of its cells is null.
    incomplete_rows = df.isnull().any(axis=1)
    n_incomplete = incomplete_rows.sum()
    print(f"({df_name}) : Number of lines with missing values: {n_incomplete}")
    return df.dropna()


def _encode_dates_bike(df):
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["hour"] = df["date"].dt.hour
    df["weekday"] = df["date"].dt.weekday
    df['IsWeekend'] = df['weekday'].apply(lambda x: 1 if x >= 5 else 0)

    vacances = holidays.CountryHoliday('France', years=[2020, 2021])
    vacances_dates = pd.to_datetime(list(vacances.keys())).date
    df["IsHolidays"] = df["date"].isin(vacances_dates).astype(int)

    return df


def _encode_dates_meteo(df, min_date, max_date):   
    df = df[(df['DATE'] > min_date) & (df['DATE'] < max_date)]
    df['DATE'] = pd.to_datetime(df['DATE'])
    df["year"] = df["DATE"].dt.year
    df["month"] = df["DATE"].dt.month
    df["day"] = df["DATE"].dt.day
    return df



In [62]:
# Raw inputs for transform_data: a parquet file read as the bike-counter
# frame and a CSV file read as the weather frame.
train_parquet_path = Path("data") / "train.parquet"
weather_csv_path = 'external_data/export-paris0.csv'

dataaaaaaa = pd.read_parquet(train_parquet_path)
data_meteooooooooooo = pd.read_csv(weather_csv_path)


def transform_data(bike_counter_data, data_meteo):
    """Build one table joining bike-counter rows with weather rows.

    Steps:
      1. drop incomplete bike rows and add the calendar features;
      2. drop three unused weather columns, drop incomplete weather rows and
         restrict the weather to the period spanned by the bike data;
      3. inner-join both frames on ``year``/``month``/``day``;
      4. remove identifier, location and raw-count columns.

    Parameters
    ----------
    bike_counter_data : pandas.DataFrame
        Bike-counter frame with a datetime ``date`` column.
    data_meteo : pandas.DataFrame
        Weather frame with a ``DATE`` column.

    Returns
    -------
    pandas.DataFrame
        The merged frame without the dropped columns.
    """
    bikes = _encode_dates_bike(clean(bike_counter_data, 'bike_counter_data'))

    # The weather window is expressed as day strings derived from the bike data.
    day_format = '%Y-%m-%d'
    first_day = bikes['date'].min().strftime(day_format)
    last_day = bikes['date'].max().strftime(day_format)

    unused_weather_columns = ['TEMPERATURE_NIGHT_C', 'SUNRISE', 'SUNSET']
    weather = clean(data_meteo.drop(columns=unused_weather_columns), 'data_meteo')
    weather = _encode_dates_meteo(weather, first_day, last_day)

    # Default (inner) join: bike rows without a weather row for their day are dropped.
    joined = bikes.merge(weather, on=['year', 'month', 'day'])

    return joined.drop(
        columns=[
            "day",
            "counter_name",
            "site_name",
            "bike_count",
            "DATE",
            "counter_installation_date",
            "coordinates",
            "counter_technical_id",
            "latitude",
            "longitude",
        ]
    )



In [63]:
# Run the preparation pipeline on the loaded frames and print the schema
# of the merged result.
merged_preview = transform_data(dataaaaaaa, data_meteooooooooooo)
merged_preview.info()

(bike_counter_data) : Number of lines with missing values: 0
(data_meteo) : Number of lines with missing values: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DATE'] = pd.to_datetime(df['DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"] = df["DATE"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["DATE"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

<class 'pandas.core.frame.DataFrame'>
Int64Index: 494241 entries, 0 to 494240
Data columns (total 31 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   counter_id              494241 non-null  category      
 1   site_id                 494241 non-null  int64         
 2   date                    494241 non-null  datetime64[ns]
 3   log_bike_count          494241 non-null  float64       
 4   year                    494241 non-null  int64         
 5   month                   494241 non-null  int64         
 6   hour                    494241 non-null  int64         
 7   weekday                 494241 non-null  int64         
 8   IsWeekend               494241 non-null  int64         
 9   IsHolidays              494241 non-null  int64         
 10  MAX_TEMPERATURE_C       494241 non-null  int64         
 11  MIN_TEMPERATURE_C       494241 non-null  int64         
 12  WINDSPEED_MAX_KMH       494241