### Описание задачи

Из нейросети получены данные о заруженности в виде количества машин на перекрестке.
Задача спрогнозировать количество траффика в нужный момент времени

### Импорты

In [None]:
# pip install ephem

In [56]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
import ephem

### Генерация данных

In [75]:
# Set the desired date range
start_date = '2023-01-01'
end_date = '2023-12-31'
date_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Generate fake data with peak hours
peak_hours = pd.date_range(start='07:00', end='10:00', freq='H').append(
    pd.date_range(start='17:00', end='20:00', freq='H')
)

# Generate data with higher car counts during peak hours on working days
data = {
    'date': date_range,
    'number_of_cars': [
        np.random.randint(50, 100) if date.hour in peak_hours and date.weekday() < 5 else np.random.randint(0, 50)
        for date in date_range
    ]
}

df = pd.DataFrame(data)

# Convert the 'date' column to datetime index
df.set_index('date', inplace=True)


In [76]:


def generate_temperature(date):
    observer = ephem.Observer()
    observer.lat = '45.0355'  # Latitude of Krasnodar, Russia
    observer.date = date

    sunrise = observer.previous_rising(ephem.Sun()).datetime()
    sunset = observer.next_setting(ephem.Sun()).datetime()

    temperature_range = {
        'spring': (-5, 20),    # Temperature range for spring
        'summer': (20, 35),    # Temperature range for summer
        'autumn': (5, 20),     # Temperature range for autumn
        'winter': (-5, 10)     # Temperature range for winter
    }

    if sunrise < date < sunset:  # Daytime temperature
        if 80 <= date.timetuple().tm_yday <= 171:  # Spring
            return np.random.randint(*temperature_range['spring'])
        elif 172 <= date.timetuple().tm_yday <= 264:  # Summer
            return np.random.randint(*temperature_range['summer'])
        elif 265 <= date.timetuple().tm_yday <= 355:  # Autumn
            return np.random.randint(*temperature_range['autumn'])
        else:  # Winter
            return np.random.randint(*temperature_range['winter'])
    else:  # Nighttime temperature
        return np.random.randint(-10, 5)  # Assume slightly colder temperature during nighttime

# Add temperature column to the dataframe
df['temperature'] = [generate_temperature(date) for date in df.index]


### Вcпомогательные функции

In [77]:
def make_features(data, max_lag, rolling_mean_size):
    """makes additional features for timeseries dataset"""
    data['year'] = data.index.year
    data['month'] = data.index.month
    data['day'] = data.index.day
    data['dayofweek'] = data.index.dayofweek
    for i in range(1, max_lag + 1):
        column_name = "lag_" + str(i)
        data[column_name] = data['number_of_cars'].shift(i)
    data['rolling_mean'] = data['lag_1'].rolling(rolling_mean_size).mean()
    
    return data


### Разделение выборки на тренировочную и тестовую

Применяем специальный сплит для временных рядов, чтобы не было утечки данных

In [78]:
tscv = TimeSeriesSplit(n_splits=5)
scoring_metric='neg_mean_squared_error'

In [79]:
def make_features(data, max_lag, rolling_mean_size):
    data['year'] = data.index.year
    data['month'] = data.index.month
    data['day'] = data.index.day
    data['dayofweek'] = data.index.dayofweek
    
    lag_cols = []
    for i in range(1, max_lag + 1):
        column_name = "lag_" + str(i)
        data[column_name] = data['number_of_cars'].shift(i)
        lag_cols.append(column_name)
    
    data['rolling_mean'] = data[lag_cols].rolling(rolling_mean_size).mean().values[:, -1]
    
    return data


### Машинное обучение
### Подбор параметров модели

In [81]:
best_rmse = 60
lag = 0
window = 0

for i in range(5, 100, 5):
    for k in range(2, 50, 5):
        data = df.copy(deep=True)
        make_features(data, i, k)
        data = data.dropna()
        target = data['number_of_cars']
        features = data.drop(['number_of_cars'], axis=1)
        
        model_lin_regression = LinearRegression()
        scores = cross_val_score(model_lin_regression, features, target, cv=tscv, scoring=scoring_metric)
        rmse = np.sqrt(np.abs(np.mean(scores)))
        
        if rmse < best_rmse:
            best_rmse = rmse
            lag = i
            window = k

print('Best RMSE:', best_rmse)
print('Lag:', lag)
print('Window:', window)


Best RMSE: 14.296389589604933
Lag: 5
Window: 12


In [None]:


best_rmse = 60
lag = 0
window = 0

for i in range(5, 100, 5):
    for k in range(2, 50, 5):
        data = df.copy(deep=True)
        make_features(data, i, k)
        data = data.dropna()
        target = data['number_of_cars']
        features = data.drop(['number_of_cars'], axis=1)
        
        model_lgbm = lgb.LGBMRegressor()
        scores = cross_val_score(model_lgbm, features, target, cv=tscv, scoring=scoring_metric)
        rmse = np.sqrt(np.abs(np.mean(scores)))
        
        if rmse < best_rmse:
            best_rmse = rmse
            lag = i
            window = k

print('Best RMSE:', best_rmse)
print('Lag:', lag)
print('Window:', window)


### Выводы

1. Можем улучшать качество модели за счет подбора гиперпараметров

2. Можем улучшать качество модели за счет добавления новых признаков

3. В дальнейшем при добавлении данных с других соседних перекрестков, мы можем использовать более сложные алгоритмы 

  Autoregressive Integrated Moving Average (ARIMA), 

  Seasonal ARIMA (SARIMA), 

  and Exponential Smoothing (ETS) models 


4. Recurrent Neural Networks (RNNs) and Long Short-Term Memory (LSTM) networks, по мере добавления данных с новых перекрестков


5.Подключение api погоды 

In [71]:
pip install ephem

Defaulting to user installation because normal site-packages is not writeable
Collecting ephem
  Downloading ephem-4.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m327.2 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[0mInstalling collected packages: ephem
Successfully installed ephem-4.1.4
Note: you may need to restart the kernel to use updated packages.


In [72]:
import ephem

def generate_temperature(date):
    observer = ephem.Observer()
    observer.lat = '45.0355'  # Latitude of Krasnodar, Russia
    observer.date = date

    sunrise = observer.previous_rising(ephem.Sun()).datetime()
    sunset = observer.next_setting(ephem.Sun()).datetime()

    temperature_range = {
        'spring': (-5, 20),    # Temperature range for spring
        'summer': (20, 35),    # Temperature range for summer
        'autumn': (5, 20),     # Temperature range for autumn
        'winter': (-5, 10)     # Temperature range for winter
    }

    if sunrise < date < sunset:  # Daytime temperature
        if 80 <= date.timetuple().tm_yday <= 171:  # Spring
            return np.random.randint(*temperature_range['spring'])
        elif 172 <= date.timetuple().tm_yday <= 264:  # Summer
            return np.random.randint(*temperature_range['summer'])
        elif 265 <= date.timetuple().tm_yday <= 355:  # Autumn
            return np.random.randint(*temperature_range['autumn'])
        else:  # Winter
            return np.random.randint(*temperature_range['winter'])
    else:  # Nighttime temperature
        return np.random.randint(-10, 5)  # Assume slightly colder temperature during nighttime

# Add temperature column to the dataframe
df['temperature'] = [generate_temperature(date) for date in df.index]
