In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
# Location of the raw dataset, relative to this notebook.
# Kept under its own name so that `datas` only ever refers to the DataFrame.
DATA_PATH = "../dataset/true_datas.csv"

# Load the raw records and discard the 'Exit' and 'Entry' columns.
datas = pd.read_csv(DATA_PATH).drop(columns=['Exit','Entry'])

In [3]:
# Quick sanity check: per-column dtypes together with the number of rows.
(datas.dtypes, datas.shape[0])

(Days                                 float64
 Inverted                             float64
 PerfMean                             float64
 PerfStd                              float64
 EntryLoad                            float64
 ExitLoad                             float64
 EntryDemand                          float64
 ExitDemand                           float64
 Distance                             float64
 ContainerQuantity                    float64
 Stage                                  int64
 Entry_Working_Day                      int64
 Exit_Working_Day                       int64
 Entry_precipitation_sum              float64
 Entry_rain_sum                       float64
 Entry_snowfall_sum                   float64
 Entry_wind_speed_10m_max             float64
 Entry_wind_gusts_10m_max             float64
 Entry_wind_direction_10m_dominant    float64
 Exit_precipitation_sum               float64
 Exit_rain_sum                        float64
 Exit_snowfall_sum                

In [4]:
# Helper functions used to perform data augmentation
def distribution_percentage(x):
    """Inflate a non-zero value by a random relative amount.

    Zero is returned untouched (no random number is drawn). Any other value
    is multiplied by ``1 + |e|``, where ``e`` is sampled from a normal
    distribution with mean 0 and standard deviation 0.1, so the magnitude of
    the result is never smaller than the magnitude of ``x``.
    """
    if x != 0:
        relative_noise = abs(np.random.normal(0, 0.1))
        return x * (1 + relative_noise)
    return x

def weather_rain_distribution_percentage(x):
    """Augment a single rainfall value.

    * Non-zero input: scaled by ``1 + |e|`` with ``e ~ Normal(0, 0.1)``.
    * Zero input: kept at zero with probability 0.90; otherwise a synthetic
      rainfall amount is drawn from ``Normal(3.5, 7)`` and clamped at 0.

    The clamp matters: an unclamped ``Normal(3.5, 7)`` draw is negative
    roughly 31% of the time, which would produce negative rainfall (and a
    negative precipitation sum downstream). A clamped draw simply means
    "no rain after all".

    Returns:
        A value that is >= 0 whenever ``x`` is >= 0.
    """
    # Drawn unconditionally so the `random` stream advances once per call,
    # regardless of which branch is taken.
    r = random.random()
    if x == 0:
        # A single threshold covers every value of r (no gap at r == 0.90).
        if r < 0.90:
            return x
        return max(0.0, np.random.normal(3.5, 7))
    # Add a random relative increase to an existing rainfall amount.
    return x * (1 + abs(np.random.normal(0, 0.1)))


def weather_snow_distribution_percentage(x):
    """Augment a single snowfall value.

    A zero stays zero (no random number is drawn). Any other value is
    multiplied by ``1 + |e|`` with ``e`` sampled from a normal distribution
    of mean 0 and standard deviation 5.
    """
    return x if x == 0 else x * (1 + abs(np.random.normal(0, 5)))


def _augment_once(df):
    """Return one noisy copy of ``df`` with the same index and column order."""
    n_rows = len(df)

    # Days only ever increase: half-normal jitter plus a Poisson(0.075) count.
    # (A Poisson draw is already non-negative, so no abs() is needed on it.)
    days = (
        df['Days']
        + abs(np.random.normal(0, 0.1, n_rows))
        + np.random.poisson(lam=0.075, size=n_rows)
    )

    perf_mean = df['PerfMean'].apply(distribution_percentage)
    # TODO (original author's note): PerfStd should really be recomputed
    # rather than jittered independently.
    perf_std = df['PerfStd'].apply(distribution_percentage)
    distance = df['Distance'].apply(distribution_percentage)

    entry_rain = df['Entry_rain_sum'].apply(weather_rain_distribution_percentage)
    exit_rain = df['Exit_rain_sum'].apply(weather_rain_distribution_percentage)
    entry_snow = df['Entry_snowfall_sum'].apply(weather_snow_distribution_percentage)
    exit_snow = df['Exit_snowfall_sum'].apply(weather_snow_distribution_percentage)

    # Additive Gaussian noise on wind speed / gusts, clipped so the result
    # can never be a negative speed.
    entry_wind_speed = (df['Entry_wind_speed_10m_max'] + np.random.normal(0, 0.6, n_rows)).clip(lower=0)
    entry_wind_gusts = (df['Entry_wind_gusts_10m_max'] + np.random.normal(0, 1.2, n_rows)).clip(lower=0)
    # Directions are angles: wrap into [0, 360) so that e.g. 357 + 8 becomes 5
    # instead of an out-of-range 365.
    entry_wind_dir = (df['Entry_wind_direction_10m_dominant'] + np.random.normal(0, 10, n_rows)) % 360
    exit_wind_speed = (df['Exit_wind_speed_10m_max'] + np.random.normal(0, 0.6, n_rows)).clip(lower=0)
    exit_wind_gusts = (df['Exit_wind_gusts_10m_max'] + np.random.normal(0, 1.2, n_rows)).clip(lower=0)
    exit_wind_dir = (df['Exit_wind_direction_10m_dominant'] + np.random.normal(0, 10, n_rows)) % 360

    # assign() returns a new frame: listed columns are replaced, every other
    # column is carried over unchanged.
    return df.assign(**{
        'Days': days,
        'PerfMean': perf_mean,
        'PerfStd': perf_std,
        'Distance': distance,
        # Precipitation is rebuilt from the augmented rain and snow values.
        'Entry_precipitation_sum': entry_rain + entry_snow,
        'Exit_precipitation_sum': exit_rain + exit_snow,
        'Entry_snowfall_sum': entry_snow,
        'Exit_snowfall_sum': exit_snow,
        'Entry_rain_sum': entry_rain,
        'Exit_rain_sum': exit_rain,
        'Entry_wind_speed_10m_max': entry_wind_speed,
        'Entry_wind_gusts_10m_max': entry_wind_gusts,
        'Entry_wind_direction_10m_dominant': entry_wind_dir,
        'Exit_wind_speed_10m_max': exit_wind_speed,
        'Exit_wind_gusts_10m_max': exit_wind_gusts,
        'Exit_wind_direction_10m_dominant': exit_wind_dir,
    })


def augment_data(df, augmentation_factor=1):
    """Enlarge ``df`` by appending randomly perturbed copies of its rows.

    Args:
        df: DataFrame holding at least the columns perturbed here ('Days',
            'PerfMean', 'PerfStd', 'Distance', the Entry_/Exit_ rain,
            snowfall and wind columns). Any other column is copied into the
            augmented rows unchanged.
        augmentation_factor: Target size multiplier. ``int(factor - 1)``
            perturbed copies are generated, so 1 (the default) adds nothing
            and 2 doubles the number of rows.

    Returns:
        A new DataFrame with the original rows first, followed by the
        augmented copies, re-indexed from 0. ``df`` itself is not modified.

    Raises:
        KeyError: if ``df`` lacks one of the perturbed columns and at least
            one copy is requested.
    """
    n_copies = int(augmentation_factor - 1)
    augmented_data = [_augment_once(df) for _ in range(n_copies)]

    # Concatenate the original DataFrame with the augmented data
    return pd.concat([df] + augmented_data, ignore_index=True)

# Seed both random sources used by the augmentation helpers (the stdlib
# `random` module and NumPy's global generator) so that re-running the
# notebook reproduces the same augmented dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Size multiplier for the dataset: 2 means one augmented copy is appended to
# the original rows.
augmentation_factor = 2 # int

# Perform data augmentation
augmented_df = augment_data(datas, augmentation_factor)

print(len(augmented_df))
augmented_df[:20]


510


Unnamed: 0,Days,Inverted,PerfMean,PerfStd,EntryLoad,ExitLoad,EntryDemand,ExitDemand,Distance,ContainerQuantity,...,Entry_snowfall_sum,Entry_wind_speed_10m_max,Entry_wind_gusts_10m_max,Entry_wind_direction_10m_dominant,Exit_precipitation_sum,Exit_rain_sum,Exit_snowfall_sum,Exit_wind_speed_10m_max,Exit_wind_gusts_10m_max,Exit_wind_direction_10m_dominant
0,15.0,0.0,24.0,7.348469,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,26.980793,63.0,247.3075,0.0,0.0,0.0,21.897945,42.12,343.37125
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.7,2.0,...,0.0,21.897945,42.12,343.37125,0.0,0.0,0.0,21.897945,42.12,343.37125
2,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,21.897945,42.12,343.37125,3.7,3.7,0.0,19.174856,36.36,242.8069
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.42,2.0,...,0.0,19.174856,36.36,242.8069,3.7,3.7,0.0,19.174856,36.36,242.8069
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,19.174856,36.36,242.8069,17.6,17.6,0.0,21.096123,39.96,251.1156
5,11.0,0.0,10.375,0.599479,0.0,2.0,0.0,2.0,6698.14,2.0,...,0.0,21.096123,39.96,251.1156,0.0,0.0,0.0,12.144331,24.84,351.42484
6,2.0,0.0,2.0,0.0,2.0,2.0,2.0,2.0,0.0,2.0,...,0.0,12.144331,24.84,351.42484,2.0,2.0,0.0,16.73516,33.12,280.13187
7,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,15.06,2.0,...,0.0,16.73516,33.12,280.13187,2.0,2.0,0.0,16.73516,33.12,280.13187
8,15.0,0.0,15.9375,3.630922,2.0,2.0,2.0,2.0,0.0,2.0,...,0.0,16.73516,33.12,280.13187,0.6,0.6,0.0,18.416384,38.519997,253.16681
9,0.0,1.0,0.0,0.0,2.0,2.0,2.0,2.0,15.06,2.0,...,0.0,18.416384,38.519997,253.16681,0.6,0.6,0.0,18.416384,38.519997,253.16681


In [5]:
# Peek at the rows starting at position 255 (positions 255 through 275).
augmented_df.iloc[255:276]

Unnamed: 0,Days,Inverted,PerfMean,PerfStd,EntryLoad,ExitLoad,EntryDemand,ExitDemand,Distance,ContainerQuantity,...,Entry_snowfall_sum,Entry_wind_speed_10m_max,Entry_wind_gusts_10m_max,Entry_wind_direction_10m_dominant,Exit_precipitation_sum,Exit_rain_sum,Exit_snowfall_sum,Exit_wind_speed_10m_max,Exit_wind_gusts_10m_max,Exit_wind_direction_10m_dominant
255,15.138795,0.0,25.671582,7.947787,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,26.954283,64.118754,255.947951,0.0,0.0,0.0,21.129324,45.158582,351.711143
256,0.093348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.884164,2.0,...,0.0,21.675189,44.792613,345.037788,0.0,0.0,0.0,20.921226,43.265916,304.013891
257,5.07714,0.0,5.584062,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,21.522002,42.306644,356.265319,3.709581,3.709581,0.0,19.102522,34.591205,230.118527
258,0.01635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.68658,2.0,...,0.0,19.126955,34.415329,259.963818,3.748324,3.748324,0.0,18.518545,36.848461,247.053913
259,1.066308,0.0,1.196895,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,18.940808,35.844556,240.937531,19.134801,19.134801,0.0,20.818795,38.462565,246.820986
260,11.114215,0.0,11.541454,0.660173,0.0,2.0,0.0,2.0,7173.645428,2.0,...,0.0,21.036083,40.06865,238.348076,0.0,0.0,0.0,12.196349,23.190721,349.436539
261,2.025321,0.0,2.124897,0.0,2.0,2.0,2.0,2.0,0.0,2.0,...,0.0,13.018672,23.203644,336.985157,2.131138,2.131138,0.0,17.101421,35.133594,277.044845
262,0.097119,0.0,0.0,0.0,2.0,2.0,2.0,2.0,15.559424,2.0,...,0.0,16.483184,31.190554,292.042834,2.009079,2.009079,0.0,16.683943,32.808037,271.614803
263,15.152309,0.0,16.437378,3.662292,2.0,2.0,2.0,2.0,0.0,2.0,...,0.0,16.428174,33.895305,278.537939,0.618772,0.618772,0.0,17.559058,37.041505,236.814567
264,1.194671,1.0,0.0,0.0,2.0,2.0,2.0,2.0,17.745752,2.0,...,0.0,18.246998,38.651988,248.490823,0.692927,0.692927,0.0,17.903945,39.070968,242.90742


In [6]:
# Summary statistics of the original (un-augmented) entry wind direction.
datas.loc[:, 'Entry_wind_direction_10m_dominant'].describe()

count    255.000000
mean     191.643722
std       99.753448
min        3.212133
25%      121.333197
50%      208.099290
75%      269.268275
max      357.857060
Name: Entry_wind_direction_10m_dominant, dtype: float64

In [7]:
# Raw values of the original entry wind direction column.
datas.loc[:, 'Entry_wind_direction_10m_dominant']

0      247.30750
1      343.37125
2      343.37125
3      242.80690
4      242.80690
         ...    
250    173.40240
251    173.40240
252    173.40240
253    173.40240
254    173.40240
Name: Entry_wind_direction_10m_dominant, Length: 255, dtype: float64

In [8]:
# Persist the combined original + augmented rows; the index is not written.
augmented_df.to_csv(path_or_buf='augmented_data.csv', index=False)