In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, classification_report
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import math, time, os, datetime, psutil
import random

In [82]:
#DIFFERENCE
def set_seed(seed):
   torch.manual_seed(seed)
   torch.cuda.manual_seed_all(seed)
   np.random.seed(seed)
   random.seed(seed)
   torch.backends.cudnn.deterministic = True
   torch.backends.cudnn.benchmark = False


def data_transfo(path, spl_frac, evaluation=False, grids=None):
    if evaluation is True and grids is None :
        raise ValueError("Please enter the list of grids to be used")

    df = pd.read_parquet(path)
    df.columns=df.columns=['gridID', 'Time', 'Destination', 'SmsIn', 'SmsOut', 'CallIn','CallOut', 'Internet']

    if grids is not None:
        df=df[df['gridID'].isin(grids)]
        
    df['date'] = df.index.date  # extract date for daily grouping
    
    cols_selected = ['gridID', 'SmsIn', 'SmsOut', 'CallIn','CallOut', 'Internet']

    #### Regrouping data
    df = df.groupby(['gridID', 'Time']).agg({
        'Internet': ['mean'],
        'SmsIn': ['mean'],
        'SmsOut': ['mean'],
        'CallIn': ['mean'],
        'CallOut': ['mean'],
    }).reset_index()
    
    df.index = df['Time']
    df.drop(columns=['Time'], inplace=True)
    df.columns=cols_selected
    
    
    #### Imputation of missing values
    df['date']=df.index.date
    
    for col in cols_selected:
        df[col] = df.groupby(['gridID', 'date'])[col].transform(
            lambda x: x.fillna(x.mean())
        )
    
    df.drop(columns=['date'], axis=1, inplace=True)
    
    
    # df.index=df['Time']
    
    # df_copy = df.copy()
    
    df = df[cols_selected]
    
    display(df)
    
    ### Sampling
    # sample_fct=0.5
    grid_ids = np.random.choice(df['gridID'].unique(), int(len(df["gridID"].unique())*spl_frac))
    
    df=df[df['gridID'].isin(grid_ids)]
    # df = df[["gridID",col]]
    df.index.name='startTime'
    
    ### Data scaling
    scaler = StandardScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df[cols_selected]), columns=cols_selected, index=df.index)
    
    ## Adding contextual information
    data_scaled['gridID'] = df['gridID']
    
    # Create time-based features
    data_scaled['month'] = data_scaled.index.month 
    data_scaled['day_of_month'] = data_scaled.index.day
    data_scaled['day_of_week'] = data_scaled.index.dayofweek
    data_scaled['hour'] = data_scaled.index.hour
    data_scaled['minute'] = data_scaled.index.minute
    
    data_scaled.sort_index(inplace=True)
    
    display(data_scaled)

    return(data_scaled)

In [83]:
set_seed(5757)

In [114]:
data_orig.iloc[2:]

Unnamed: 0_level_0,gridID,Time,SmsIn,SmsOut,CallIn,CallOut,Internet
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-11-01 00:20:00,195,2013-11-01 00:20:00,0.083467,0.040774,0.040774,0.161408,11.482387
2013-11-01 00:30:00,195,2013-11-01 00:30:00,0.181795,0.040774,0.079859,0.080704,11.750915
2013-11-01 00:40:00,195,2013-11-01 00:40:00,0.120634,0.000000,0.080704,0.081549,13.019984
2013-11-01 00:50:00,195,2013-11-01 00:50:00,0.079859,0.120634,0.161408,0.122323,5.676654
2013-11-01 01:00:00,195,2013-11-01 01:00:00,0.040774,0.079859,0.040774,0.085385,9.849951
...,...,...,...,...,...,...,...
2013-11-16 23:10:00,9898,2013-11-16 23:10:00,0.171990,1.728157,0.366510,0.475037,57.424349
2013-11-16 23:20:00,9898,2013-11-16 23:20:00,0.434727,1.687225,0.475037,0.864078,43.642535
2013-11-16 23:30:00,9898,2013-11-16 23:30:00,0.626560,2.375183,0.128992,0.343979,19.471304
2013-11-16 23:40:00,9898,2013-11-16 23:40:00,1.020979,1.707691,0.237518,0.286649,44.217204


In [117]:
data_path = '../data/data_sampled_16d_2025-03-28.parquet'
data_orig = pd.read_parquet(data_path)
data_orig.columns=data_orig.columns=['gridID', 'Time', 'Destination', 'SmsIn', 'SmsOut', 'CallIn','CallOut', 'Internet']
data_orig = data_orig.drop('Destination', axis=1).groupby(["gridID", "Time"]).mean().fillna(0).reset_index()
data_orig.index = data_orig['Time']
# Data scaling
scaler = StandardScaler()
data_orig.iloc[:, 2:] = pd.DataFrame(scaler.fit_transform(data_orig.iloc[:, 2:]), columns=data_orig.columns[2:], index=data_orig.index)
display(data_orig)

Unnamed: 0_level_0,gridID,Time,SmsIn,SmsOut,CallIn,CallOut,Internet
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-11-01 00:00:00,195,2013-11-01 00:00:00,-0.412230,-0.406036,-0.472435,-0.492591,-0.426531
2013-11-01 00:10:00,195,2013-11-01 00:10:00,-0.447886,-0.414317,-0.512969,-0.467714,-0.348303
2013-11-01 00:20:00,195,2013-11-01 00:20:00,-0.437361,-0.414317,-0.502730,-0.467714,-0.367334
2013-11-01 00:30:00,195,2013-11-01 00:30:00,-0.413120,-0.414317,-0.492914,-0.492855,-0.364299
2013-11-01 00:40:00,195,2013-11-01 00:40:00,-0.428198,-0.422599,-0.492702,-0.492591,-0.349955
...,...,...,...,...,...,...,...
2013-11-16 23:10:00,9898,2013-11-16 23:10:00,-0.415537,-0.071599,-0.420928,-0.370011,0.151929
2013-11-16 23:20:00,9898,2013-11-16 23:20:00,-0.350763,-0.079913,-0.393674,-0.248816,-0.003841
2013-11-16 23:30:00,9898,2013-11-16 23:30:00,-0.303470,0.059816,-0.480576,-0.410839,-0.277039
2013-11-16 23:40:00,9898,2013-11-16 23:40:00,-0.206232,-0.075756,-0.453322,-0.428698,0.002654


In [118]:
data_scaled = data_transfo(path=data_path, spl_frac=1)
display(data_scaled[data_scaled['gridID']==195])

  df.drop(columns=['Time'], inplace=True)


Unnamed: 0_level_0,gridID,SmsIn,SmsOut,CallIn,CallOut,Internet
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-11-01 00:00:00,195,6.244960,0.185402,0.081549,0.161408,0.081549
2013-11-01 00:10:00,195,13.166190,0.040774,0.040774,0.254464,0.161408
2013-11-01 00:20:00,195,11.482387,0.083467,0.040774,0.040774,0.161408
2013-11-01 00:30:00,195,11.750915,0.181795,0.040774,0.079859,0.080704
2013-11-01 00:40:00,195,13.019984,0.120634,0.220933,0.080704,0.081549
...,...,...,...,...,...,...
2013-11-16 23:10:00,9898,57.424349,0.171990,1.728157,0.366510,0.475037
2013-11-16 23:20:00,9898,43.642535,0.434727,1.687225,0.475037,0.864078
2013-11-16 23:30:00,9898,19.471304,0.626560,2.375183,0.128992,0.343979
2013-11-16 23:40:00,9898,44.217204,1.020979,1.707691,0.237518,0.286649


Unnamed: 0_level_0,gridID,SmsIn,SmsOut,CallIn,CallOut,Internet,month,day_of_month,day_of_week,hour,minute
startTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-11-01 00:00:00,216,-0.398874,-0.430598,-0.386095,-0.587443,-0.607907,11,1,4,0,0
2013-11-01 00:00:00,9898,-0.086355,-0.364289,-0.205399,-0.419340,-0.269375,11,1,4,0,0
2013-11-01 00:00:00,7221,-0.400118,-0.380105,-0.317375,-0.543270,-0.622884,11,1,4,0,0
2013-11-01 00:00:00,7484,0.224454,-0.314975,-0.035175,-0.452000,-0.491661,11,1,4,0,0
2013-11-01 00:00:00,7488,0.300071,-0.386383,-0.204231,-0.516220,-0.531128,11,1,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2013-11-16 23:50:00,6568,1.899384,-0.042275,0.523472,0.191368,0.006477,11,16,5,23,50
2013-11-16 23:50:00,5892,-0.404579,-0.414909,-0.363578,-0.553635,-0.545548,11,16,5,23,50
2013-11-16 23:50:00,5873,-0.131706,0.041208,0.073740,-0.109992,-0.203798,11,16,5,23,50
2013-11-16 23:50:00,7933,0.174613,-0.343622,-0.336400,-0.436655,-0.476958,11,16,5,23,50


Unnamed: 0_level_0,gridID,SmsIn,SmsOut,CallIn,CallOut,Internet,month,day_of_month,day_of_week,hour,minute
startTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [63]:
grid_ids = data_scaled['gridID'].unique()
grid_ids

array([ 195,  794, 4513, 8773, 4560, 5063, 7839, 5216, 1935, 7488, 1717,
        726, 5447, 5697, 1667, 1515, 7226, 5763, 1367, 6470, 2002, 7984,
       1009, 3236, 3338, 2570, 8504, 8454, 3541, 3704, 2524, 8369,  927,
       3778, 2021, 8658, 8011, 3882, 4467,  317,  596,  216])

In [64]:
### Train test splitting
train_grid_ids, test_grid_ids = train_test_split(np.unique(grid_ids), test_size=0.4, shuffle=False)

In [120]:
data_eval_path = '../data/data_full_16_to_32_days.parquet'
data_eval_scaled = data_transfo(data_eval_path, spl_frac=1, evaluation=False, grids=train_grid_ids)

  df.drop(columns=['Time'], inplace=True)


Unnamed: 0_level_0,gridID,SmsIn,SmsOut,CallIn,CallOut,Internet
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-12-18 00:00:00,195,0.725444,0.067957,0.212366,0.122323,0.203871
2013-12-18 00:10:00,195,0.750246,0.081549,0.122323,0.087074,0.242956
2013-12-18 00:20:00,195,2.058255,0.040774,0.122323,0.040774,0.122323
2013-12-18 00:30:00,195,1.113933,0.040774,0.212366,0.081549,0.122323
2013-12-18 00:40:00,195,0.924216,0.040774,0.081549,0.122323,0.040774
...,...,...,...,...,...,...
2014-01-01 23:10:00,4513,11.406033,0.299860,0.164625,0.105123,0.145032
2014-01-01 23:20:00,4513,11.752381,0.169204,0.383782,0.200450,0.095327
2014-01-01 23:30:00,4513,7.240947,0.419606,0.334076,0.095327,0.095327
2014-01-01 23:40:00,4513,11.424141,0.217548,0.259951,0.302113,0.082312


Unnamed: 0_level_0,gridID,SmsIn,SmsOut,CallIn,CallOut,Internet,month,day_of_month,day_of_week,hour,minute
startTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-12-18 00:00:00,195,-0.664013,-0.592025,-0.354883,-0.538771,-0.440707,12,18,2,0,0
2013-12-18 00:00:00,216,-0.430013,-0.503932,-0.500486,-0.452446,-0.433752,12,18,2,0,0
2013-12-18 00:00:00,726,-0.141639,-0.548710,-0.421116,-0.592480,-0.611652,12,18,2,0,0
2013-12-18 00:00:00,794,-0.592913,-0.445224,-0.258781,-0.527422,-0.607618,12,18,2,0,0
2013-12-18 00:00:00,927,-0.143994,-0.536830,-0.421686,-0.592625,-0.611652,12,18,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2014-01-01 23:50:00,596,-0.689519,-0.648918,-0.548800,-0.647548,-0.574144,1,1,2,23,50
2014-01-01 23:50:00,3778,3.103546,0.315203,1.512240,0.181584,-0.458663,1,1,2,23,50
2014-01-01 23:50:00,195,,,,,-0.586340,1,1,2,23,50
2014-01-01 23:50:00,2524,-0.291693,-0.524038,-0.380299,-0.536473,-0.606966,1,1,2,23,50


In [129]:
train_grid_ids
test_grid_ids
data_eval_scaled['gridID'].unique()

array([ 195,  216,  317,  596,  726,  794,  927, 1009, 1367, 1515, 1667,
       1717, 1935, 2002, 2021, 2524, 2570, 3236, 3338, 3541, 3704, 3778,
       3882, 4467, 4513])

In [125]:
train_data = data_scaled[data_scaled['gridID'].isin(train_grid_ids)]
test_data = data_scaled[data_scaled['gridID'].isin(test_grid_ids)]
eval_data = data_eval_scaled

In [126]:
timestamp1 = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [127]:
## Saving
train_data.to_parquet(f'../data/train_data_{timestamp1}.parquet')
test_data.to_parquet(f'../data/test_data_scaled_{timestamp1}.parquet')
eval_data.to_parquet(f'../data/eval_data_scaled_{timestamp1}.parquet')

In [128]:
data_eval_scaled['gridID'].unique()

array([ 195,  216,  726,  794,  927, 1009, 1367, 1515, 1667, 2021, 2524,
       2570, 3338, 3541, 3778, 3882, 1717,  596])