In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [10]:
data = pd.read_csv('./SynopDateEdit.csv', sep=';')

data.head(5)

data.drop(['Direction du vent moyen 10 mn',
           'Point de rosée','Direction du vent moyen 10 mn','Précipitations dans les 3 dernières heures',
           'Précipitations dans les 6 dernières heures','Précipitations dans les 12 dernières heures','Précipitations dans la dernière heure','Pression au niveau mer','Variation de pression en 3 heures','mois_de_l_annee'] , axis=1, inplace=True)

#supprimer les lignes des données outre mer
data = data[data['Latitude'] < 50]
data = data[data['Longitude'] > -10]
data = data[data['Longitude'] < 10]

#supprimer les valeurs aberrantes
data = data[data['Température (°C)'] < 50]
data = data[data['Température (°C)'] > -50]

data.drop(['Température'], axis=1, inplace=True)

data = data.sort_values(['ID OMM station', 'Date'])

#store the station column
stationColumns = data['ID OMM station']

#drop the station column
data = data.drop(['ID OMM station'], axis=1)

for col in data.columns:
    # if some columns dont have values in 90% of the rows, we drop them
    if data[col].count() < 0.9 * len(data):
        data.drop(col, axis=1, inplace=True)


#dataframe format : yyyy-mm-ddThh:mm:ss+hh:mm
data['Date'] = pd.to_datetime(data['Date'])


data = data.select_dtypes(exclude='object')



#supprimer les lignes avec des valeurs manquantes
data = data.dropna()





data.head()

Unnamed: 0,Date,Type de tendance barométrique,Vitesse du vent moyen 10 mn,Humidité,Visibilité horizontale,Temps présent,Pression station,Variation de pression en 24 heures,Rafale sur les 10 dernières minutes,Rafales sur une période,Periode de mesure de la rafale,Précipitations dans les 24 dernières heures,Température (°C),Latitude,Longitude,Altitude,EPCI (code),region (code)
12083,2022-01-01 07:00:00,8.0,11.0,83.0,12000.0,3.0,101700.0,170.0,15.8,18.5,-10.0,0.2,12.6,49.725167,-1.939833,6,200067205.0,28.0
102312,2022-01-01 10:00:00,3.0,10.8,78.0,12000.0,2.0,101760.0,30.0,15.4,16.0,-10.0,0.0,12.6,49.725167,-1.939833,6,200067205.0,28.0
161767,2022-01-01 13:00:00,8.0,9.7,80.0,12000.0,3.0,101750.0,-140.0,14.4,15.5,-10.0,0.0,12.9,49.725167,-1.939833,6,200067205.0,28.0
81395,2022-01-01 16:00:00,6.0,8.4,83.0,12000.0,3.0,101670.0,-290.0,12.0,15.6,-10.0,0.0,13.4,49.725167,-1.939833,6,200067205.0,28.0
160692,2022-01-01 19:00:00,8.0,6.6,90.0,12000.0,3.0,101640.0,-420.0,9.3,15.1,-10.0,0.6,12.5,49.725167,-1.939833,6,200067205.0,28.0


In [11]:
#supression de données 
data.drop(['region (code)','EPCI (code)','Visibilité horizontale'], axis=1, inplace=True)
data.drop(['Temps présent','Rafales sur une période','Periode de mesure de la rafale','Pression station'], axis=1, inplace=True)
data.head()
#note : drop la tendance barométrique ou la variation de pression

Unnamed: 0,Date,Type de tendance barométrique,Vitesse du vent moyen 10 mn,Humidité,Variation de pression en 24 heures,Rafale sur les 10 dernières minutes,Précipitations dans les 24 dernières heures,Température (°C),Latitude,Longitude,Altitude
12083,2022-01-01 07:00:00,8.0,11.0,83.0,170.0,15.8,0.2,12.6,49.725167,-1.939833,6
102312,2022-01-01 10:00:00,3.0,10.8,78.0,30.0,15.4,0.0,12.6,49.725167,-1.939833,6
161767,2022-01-01 13:00:00,8.0,9.7,80.0,-140.0,14.4,0.0,12.9,49.725167,-1.939833,6
81395,2022-01-01 16:00:00,6.0,8.4,83.0,-290.0,12.0,0.0,13.4,49.725167,-1.939833,6
160692,2022-01-01 19:00:00,8.0,6.6,90.0,-420.0,9.3,0.6,12.5,49.725167,-1.939833,6


In [12]:
#see types of data
data.describe()

Unnamed: 0,Type de tendance barométrique,Vitesse du vent moyen 10 mn,Humidité,Variation de pression en 24 heures,Rafale sur les 10 dernières minutes,Précipitations dans les 24 dernières heures,Température (°C),Latitude,Longitude,Altitude
count,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0
mean,4.327561,3.532488,72.97731,-1.789905,5.660549,1.549897,14.060343,46.144702,2.429671,171.554089
std,2.711322,2.352273,19.313641,508.13642,3.632207,4.602568,7.976007,2.294073,3.33921,188.279077
min,0.0,0.0,1.0,-2490.0,0.0,-0.1,-12.5,41.918,-4.412,2.0
25%,2.0,1.9,60.0,-300.0,3.0,0.0,8.5,43.909833,0.0,42.0
50%,4.0,3.0,77.0,-10.0,4.9,0.0,13.7,46.593833,2.359833,112.0
75%,7.0,4.7,89.0,280.0,7.5,0.6,19.3,48.444167,5.077833,235.0
max,8.0,23.3,100.0,2690.0,35.7,115.4,41.8,49.725167,9.485167,833.0


In [13]:
data2 = data.copy()

#turn 1st date into 1 and count the number of days since
#add hours as decimals
data['Date'] = data['Date'] - data['Date'].min()

data['Date'] = data['Date'].dt.days + data['Date'].dt.components['hours']/24



#normalize the date
# data = (data - data.min()) / (data.max() - data.min())

#normalize every row except the date
data[data.columns[1:]] = (data[data.columns[1:]] - data[data.columns[1:]].min()) / (data[data.columns[1:]].max() - data[data.columns[1:]].min())

#sort by date
data = data.sort_values(['Date'])

#limit amount of decimals ?
data = data.round(5)
data.describe()
data.head(20)

Unnamed: 0,Date,Type de tendance barométrique,Vitesse du vent moyen 10 mn,Humidité,Variation de pression en 24 heures,Rafale sur les 10 dernières minutes,Précipitations dans les 24 dernières heures,Température (°C),Latitude,Longitude,Altitude
13352,0.0,1.0,0.02575,0.9697,0.53089,0.02801,0.00087,0.33517,0.21813,0.41669,0.1793
43562,0.0,0.875,0.09442,0.9697,0.54826,0.09524,0.00087,0.31123,0.25513,0.28148,0.06859
111589,0.0,1.0,0.18884,0.52525,0.54054,0.18768,0.00087,0.41621,0.16267,0.31747,0.43081
155027,0.0,0.375,0.03433,0.93939,0.54826,0.03922,0.0026,0.32781,0.84941,0.86725,0.1781
160687,0.0,1.0,0.07725,0.9697,0.52896,0.10364,0.00087,0.30939,0.72963,0.85787,0.31408
48733,0.0,1.0,0.05579,0.78788,0.55405,0.06723,0.00087,0.32965,0.49555,0.54409,0.39591
138493,0.0,1.0,0.07725,0.9798,0.55985,0.08964,0.00087,0.44567,0.83609,0.3254,0.16968
102303,0.0,1.0,0.103,0.9899,0.5695,0.07843,0.00087,0.33149,0.65852,0.48728,0.19134
88772,0.0,1.0,0.04292,0.9899,0.53861,0.04202,0.00433,0.35727,0.2125,0.60265,0.0
88771,0.0,1.0,0.12446,0.63636,0.53861,0.10644,0.00087,0.38674,0.28186,0.53475,0.85439


In [14]:
data.describe()

Unnamed: 0,Date,Type de tendance barométrique,Vitesse du vent moyen 10 mn,Humidité,Variation de pression en 24 heures,Rafale sur les 10 dernières minutes,Précipitations dans les 24 dernières heures,Température (°C),Latitude,Longitude,Altitude
count,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0
mean,181.758325,0.540945,0.151609,0.727043,0.480349,0.158559,0.014287,0.489141,0.541388,0.492306,0.204036
std,105.30195,0.338915,0.100956,0.195087,0.098096,0.101743,0.039848,0.146888,0.293842,0.24028,0.22657
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,90.375,0.25,0.08155,0.59596,0.42278,0.08403,0.00087,0.38674,0.25513,0.31747,0.04813
50%,181.25,0.5,0.12876,0.76768,0.47876,0.13725,0.00087,0.4825,0.59892,0.48728,0.13237
75%,273.125,0.875,0.20172,0.88889,0.53475,0.21008,0.00606,0.58564,0.83592,0.68286,0.28039
max,364.875,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
#restore the station column
data['ID OMM station'] = stationColumns

#export to csv
data.to_csv('NormalizedWeatherDataS.csv', index=False, sep=';')

essayer https://developers.google.com/machine-learning/data-prep/transform/normalization?hl=fr
- scaling à une plage
- rognage
- scaling du journal
- Score Z

In [18]:
data2 = data.copy()
# #check if the repartition of the data is gaussian with a histogram
# for col in data2.columns[1:]:
#     data2[col].hist()
#     plt.title(col)
#     plt.show()
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() 
data_scaled = scaler.fit_transform(data2)
data2.describe()


Unnamed: 0,Date,Type de tendance barométrique,Vitesse du vent moyen 10 mn,Humidité,Variation de pression en 24 heures,Rafale sur les 10 dernières minutes,Précipitations dans les 24 dernières heures,Température (°C),Latitude,Longitude,Altitude,ID OMM station
count,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0,100659.0
mean,181.758325,0.540945,0.151609,0.727043,0.480349,0.158559,0.014287,0.489141,0.541388,0.492306,0.204036,7379.135497
std,105.30195,0.338915,0.100956,0.195087,0.098096,0.101743,0.039848,0.146888,0.293842,0.24028,0.22657,233.636534
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7020.0
25%,90.375,0.25,0.08155,0.59596,0.42278,0.08403,0.00087,0.38674,0.25513,0.31747,0.04813,7168.0
50%,181.25,0.5,0.12876,0.76768,0.47876,0.13725,0.00087,0.4825,0.59892,0.48728,0.13237,7335.0
75%,273.125,0.875,0.20172,0.88889,0.53475,0.21008,0.00606,0.58564,0.83592,0.68286,0.28039,7607.0
max,364.875,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7790.0
