In [1]:

import pandas as pd  
import re  
from datetime import datetime 

In [2]:

# Read the raw BTC-USD history export into the working frame.
source_csv = 'btc_historique_5ans.csv'
df = pd.read_csv(source_csv)

# Preview the first rows exactly as they were parsed from the file.
preview = df.head()
print("Original data:")
print(preview)



Original data:
           Date       Open       High        Low      Close  Adj Close  \
0  Dec 13, 2025  90,281.64  90,614.31  89,988.05  90,033.09  90,033.09   
1  Dec 12, 2025  92,513.66  92,747.93  89,532.60  90,270.41  90,270.41   
2  Dec 11, 2025  92,011.30  93,554.27  89,335.30  92,511.34  92,511.34   
3  Dec 10, 2025  92,695.23  94,477.16  91,640.13  92,020.95  92,020.95   
4   Dec 9, 2025  90,639.70  94,601.57  89,586.98  92,691.71  92,691.71   

           Volume   Ticker  
0  64,809,832,448  BTC-USD  
1  80,275,884,583  BTC-USD  
2  64,532,834,621  BTC-USD  
3  65,420,694,513  BTC-USD  
4  66,861,721,440  BTC-USD  


In [None]:
# Report the DataFrame dimensions as a (rows, columns) tuple.
dimensions = df.shape
print("Shape: {}".format(dimensions))

Shape: (1825, 8)


In [None]:
# Report the dtype pandas assigned to each column.
column_types = df.dtypes
print("Data types:\n{}".format(column_types))

Data types:
Date         object
Open         object
High         object
Low          object
Close        object
Adj Close    object
Volume       object
Ticker       object
dtype: object


In [None]:
class DataCleaner:
    """Helpers for converting raw CSV text cells into numeric values."""

    # Placeholder strings that stand for "no value" in the raw export.
    _MISSING_TOKENS = ('-', '', 'N/A', 'null')

    def clean_numeric(self, value):
        """Convert a formatted numeric string to a ``float``.

        Thousands separators (``,``) are removed before parsing, so
        ``"90,281.64"`` becomes ``90281.64``. Decimal points and a leading
        sign are accepted.

        Args:
            value: The raw cell content. Only ``str`` inputs are parsed.

        Returns:
            The parsed ``float`` for a numeric string; ``0`` for a placeholder
            (``'-'``, ``''``, ``'N/A'``, ``'null'``), for text that cannot be
            parsed, and for non-finite results; the input unchanged when it
            is not a string.
        """
        # Non-string inputs (already numeric, NaN, ...) are passed through.
        if not isinstance(value, str):
            return value

        text = value.strip()
        if text in self._MISSING_TOKENS:
            return 0  # placeholders are mapped to 0, as before

        cleaned = re.sub(r',', '', text)
        try:
            # float() accepts decimals and signs; str.isnumeric() does not,
            # which previously turned every decimal price into 0.
            number = float(cleaned)
        except ValueError:
            return 0

        # Keep the "anything that is not a real number becomes 0" contract
        # for strings such as "nan" or "inf" that float() would accept.
        if number != number or number in (float('inf'), float('-inf')):
            return 0
        return number

In [None]:
# Show the first rows of the frame after cleaning.
# NOTE(review): the step that applies the cleaning to `df` is not visible in
# this part of the notebook — confirm it runs before this cell.
print("\nCleaned data:")
print(df.head())

# Show the per-column dtypes after cleaning, one column per line.
# The text is printed as formatted: normalising its whitespace would collapse
# the dtype table onto a single line.
print("Data types after cleaning:\n{}".format(df.dtypes))


Cleaned data:
        Date      Open      High       Low     Close  Adj Close       Volume  \
0 2025-12-13  90281.64  90614.31  89988.05  90033.09   90033.09  64809832448   
1 2025-12-12  92513.66  92747.93  89532.60  90270.41   90270.41  80275884583   
2 2025-12-11  92011.30  93554.27  89335.30  92511.34   92511.34  64532834621   
3 2025-12-10  92695.23  94477.16  91640.13  92020.95   92020.95  65420694513   
4 2025-12-09  90639.70  94601.57  89586.98  92691.71   92691.71  66861721440   

    Ticker  
0  BTC-USD  
1  BTC-USD  
2  BTC-USD  
3  BTC-USD  
4  BTC-USD  
Data types after cleaning:
Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
Ticker               object
dtype: object


In [7]:
# Order the rows by the 'Date' column (ascending) and renumber the index
# from 0 so positional access follows the sorted order.
chronological = df.sort_values(by=['Date'])
df = chronological.reset_index(drop=True)

first_ten = df.head(10)
print("\nData sorted by Date ")
print(first_ten)


Data sorted by Date 
        Date      Open      High       Low     Close  Adj Close       Volume  \
0 2020-12-15  19246.92  19525.01  19079.84  19417.08   19417.08  26741982541   
1 2020-12-16  19418.82  21458.91  19298.32  21310.60   21310.60  44409011479   
2 2020-12-17  21308.35  23642.66  21234.68  22805.16   22805.16  71378606374   
3 2020-12-18  22806.80  23238.60  22399.81  23137.96   23137.96  40387896275   
4 2020-12-19  23132.87  24085.86  22826.47  23869.83   23869.83  38487546580   
5 2020-12-20  23861.77  24209.66  23147.71  23477.29   23477.29  37844228422   
6 2020-12-21  23474.46  24059.98  22159.37  22803.08   22803.08  45852713981   
7 2020-12-22  22794.04  23789.90  22430.61  23783.03   23783.03  44171632681   
8 2020-12-23  23781.97  24024.49  22802.65  23241.35   23241.35  51146161904   
9 2020-12-24  23240.20  23768.34  22777.60  23735.95   23735.95  41080759713   

    Ticker  
0  BTC-USD  
1  BTC-USD  
2  BTC-USD  
3  BTC-USD  
4  BTC-USD  
5  BTC-USD  
6  BTC

In [None]:
# Persist the cleaned frame without the index column.
# NOTE(review): this is an absolute, machine-specific path; the export fails
# wherever this directory does not exist — consider a configurable data dir.
cleaned_csv_path = 'C:\\Users\\sirin\\Desktop\\Cryptocurrency_tracker\\Cryptocurrency_Tracker\\data\\données_historique_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
print("\nCleaned data saved ")

# Summary statistics of the saved frame.
summary = df.describe()
print(summary)

# Sanity checks: count missing values and exact zeros in every column.
null_counts = df.isnull().sum()
zero_counts = (df == 0).sum()
print("\nVérification des valeurs nulles ou égales à 0:")
print("Nombre de valeurs nulles par colonne:\n{}".format(null_counts))
print("\nNombre de zéros par colonne:\n{}".format(zero_counts))


Cleaned data saved 
                      Date           Open           High            Low  \
count                 1825    1825.000000    1825.000000    1825.000000   
mean   2023-06-15 00:00:00   53784.606203   54842.710981   52662.032099   
min    2020-12-15 00:00:00   15782.300000   16253.050000   15599.050000   
25%    2022-03-16 00:00:00   29174.380000   29517.770000   28722.760000   
50%    2023-06-15 00:00:00   45576.880000   46929.050000   44187.760000   
75%    2024-09-13 00:00:00   68243.100000   69398.510000   66758.730000   
max    2025-12-13 00:00:00  124752.140000  126198.070000  123196.050000   
std                    NaN   29422.589832   29827.731487   28980.747337   

               Close      Adj Close        Volume  
count    1825.000000    1825.000000  1.825000e+03  
mean    53821.164438   53821.164438  3.728333e+10  
min     15787.280000   15787.280000  5.331173e+09  
25%     29178.680000   29178.680000  2.142395e+10  
50%     45593.640000   45593.640000  3.1962

In [None]:
# === FEATURE ENGINEERING ===
# Every feature below is derived from the 'Close' column of `df`.
close = df['Close']

# 1. Prediction target: the 'Close' of the following row (shift by -1).
#    The last row has no following row, so its target is NaN.
df['Target_Next_Close'] = close.shift(-1)

# 2. Features
# Moving averages over 7 and 30 rows. The built-in Rolling.mean() replaces
# rolling(...).apply(lambda w: w.mean()): same windows, no Python callback
# invoked per window.
df['MA_7'] = close.rolling(window=7).mean()
df['MA_30'] = close.rolling(window=30).mean()

# Row-over-row relative change of 'Close'.
df['Daily_Return'] = close.pct_change()

# Volatility: sample standard deviation (ddof=1, the default of both
# Series.std and Rolling.std) of 'Close' over a 7-row window.
df['Volatility_7d'] = close.rolling(window=7).std()

# Lag features: 'Close' of the previous row and of the row before that.
df['Lag_1'] = close.shift(1)
df['Lag_2'] = close.shift(2)

# Keep only fully populated rows: the leading rows lack a complete 30-row
# window and the final row lacks a target. Copy so that later edits to
# df_model do not touch df.
df_model = df.dropna().copy()

print("Aperçu des données préparées pour le modèle :")
print(df_model[['Date', 'Close', 'Target_Next_Close', 'MA_7', 'Daily_Return']].head())
print("\nDimensions finales : {}".format(df_model.shape))

# Save the model-ready table for the machine-learning step.
output_model_path = 'crypto_data_model_ready.csv'
df_model.to_csv(output_model_path, index=False)
print("Données prêtes pour le modèle sauvegardées dans : {}".format(output_model_path))

Aperçu des données préparées pour le modèle :
         Date     Close  Target_Next_Close          MA_7  Daily_Return
29 2021-01-13  37316.36           39187.33  37940.802857      0.100033
30 2021-01-14  39187.33           36825.37  37914.558571      0.050138
31 2021-01-15  36825.37           36178.14  37347.095714     -0.060274
32 2021-01-16  36178.14           35791.28  36764.751429     -0.017576
33 2021-01-17  35791.28           36630.07  36398.300000     -0.010693

Dimensions finales : (1795, 15)
Données prêtes pour le modèle sauvegardées dans : crypto_data_model_ready.csv
