# Web scraping des séries financières sur Yahoo Finance 

Objectif: extraire des séries financières du CAC 40 sur Yahoo Finance afin d'analyser les fluctuations du marché.
Yahoo Finance: https://finance.yahoo.com/quote/%5EFCHI/

1. Extraire la capitalisation boursière des entreprises du CAC 40 selon leurs tickers 

In [26]:
from warnings import filterwarnings
filterwarnings('ignore')

In [27]:
import pandas as pd
import numpy as np

In [32]:
# Extraction de la table CAC40 (4 - numéro d'ordre de la table)
cac40=pd.read_html('https://en.wikipedia.org/wiki/CAC_40')[4] 
cac40.head()

Unnamed: 0,Company,Sector,GICS Sub-Industry,Ticker
0,Air Liquide,Basic Materials,Industrial Gases,AI.PA
1,Airbus,Industrials,Aerospace & Defense,AIR.PA
2,Alstom,Industrials,Rail Transport,ALO.PA
3,ArcelorMittal,Basic Materials,Steel,MT.AS
4,Axa,Financial Services,Life & Health Insurance,CS.PA


In [33]:
tickers=cac40['Ticker'].to_list()
tickers

['AI.PA',
 'AIR.PA',
 'ALO.PA',
 'MT.AS',
 'CS.PA',
 'BNP.PA',
 'EN.PA',
 'CAP.PA',
 'CA.PA',
 'ACA.PA',
 'BN.PA',
 'DSY.PA',
 'EDEN.PA',
 'ENGI.PA',
 'EL.PA',
 'ERF.PA',
 'RMS.PA',
 'KER.PA',
 'OR.PA',
 'LR.PA',
 'MC.PA',
 'ML.PA',
 'ORA.PA',
 'RI.PA',
 'PUB.PA',
 'RNO.PA',
 'SAF.PA',
 'SGO.PA',
 'SAN.PA',
 'SU.PA',
 'GLE.PA',
 'STLAP.PA',
 'STMPA.PA',
 'TEP.PA',
 'HO.PA',
 'TTE.PA',
 'URW.PA',
 'VIE.PA',
 'DG.PA',
 'WLN.PA']

In [34]:
pip install pandas-datareader

Note: you may need to restart the kernel to use updated packages.


In [35]:
pip install yfinance

Note: you may need to restart the kernel to use updated packages.


In [36]:
# Extraction des séries de la capitalisation boursière actiuelle 'marketCap'
from pandas_datareader import DataReader, get_quote_yahoo
import yfinance as yf

market_cap = {}
for ticker in tickers:
    try:
        ticker_info = yf.Ticker(ticker).info
        market_cap[ticker] = ticker_info.get('marketCap', 'N/A')
    except Exception as e:
        market_cap[ticker] = f"Error: {e}"
market_cap_df = pd.DataFrame.from_dict(market_cap, orient='index', columns=['MarketCap'])       
market_cap_df

Unnamed: 0,MarketCap
AI.PA,92968239104
AIR.PA,101443911680
ALO.PA,7240038912
MT.AS,16960429056
CS.PA,69223014400
BNP.PA,67317116928
EN.PA,11296712704
CAP.PA,31547086848
CA.PA,8949190656
ACA.PA,38547161088


In [37]:
# Ajouter une colonne 'Market Capitalization'
cac40['Market Capitalization']=market_cap_df.values
cac40['Market Capitalization']=cac40['Market Capitalization']/1000000000
cac40.set_index('Ticker', inplace=True) # Remplacer les indices par les tickers
cac40.head()

Unnamed: 0_level_0,Company,Sector,GICS Sub-Industry,Market Capitalization
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AI.PA,Air Liquide,Basic Materials,Industrial Gases,92.968239
AIR.PA,Airbus,Industrials,Aerospace & Defense,101.443912
ALO.PA,Alstom,Industrials,Rail Transport,7.240039
MT.AS,ArcelorMittal,Basic Materials,Steel,16.960429
CS.PA,Axa,Financial Services,Life & Health Insurance,69.223014


In [46]:
# Trier par la capitalisation
cac40.sort_values(by='Market Capitalization', ascending=False)

Unnamed: 0_level_0,Company,Sector,GICS Sub-Industry,Market Capitalization
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MC.PA,LVMH,Consumer Cyclical,"Apparel, Accessories & Luxury Goods",356.469572
RMS.PA,Hermès,Consumer Cyclical,"Apparel, Accessories & Luxury Goods",227.009053
OR.PA,L'Oréal,Consumer Defensive,Personal Products,219.263975
TTE.PA,TotalEnergies,Energy,Integrated Oil & Gas,143.730491
SU.PA,Schneider Electric,Industrials,Electrical Components & Equipment,125.778698
SAN.PA,Sanofi,Healthcare,Pharmaceuticals,112.517644
AIR.PA,Airbus,Industrials,Aerospace & Defense,101.443912
AI.PA,Air Liquide,Basic Materials,Industrial Gases,92.968239
EL.PA,EssilorLuxottica,Healthcare,"Apparel, Accessories & Luxury Goods",91.658469
SAF.PA,Safran,Industrials,Aerospace & Defense,81.161994


In [42]:
# Quelle est la plus grande entreprise par secteur (sub-industry)?
composantes=cac40.groupby('Sector')['Market Capitalization'].nlargest(1)
composantes = composantes.reset_index()
composantes.sort_values(by='Market Capitalization', ascending=False)

Unnamed: 0,Sector,Ticker,Market Capitalization
2,Consumer Cyclical,MC.PA,356.469572
3,Consumer Defensive,OR.PA,219.263975
4,Energy,TTE.PA,143.730491
7,Industrials,SU.PA,125.778698
6,Healthcare,SAN.PA,112.517644
0,Basic Materials,AI.PA,92.968239
5,Financial Services,CS.PA,69.223014
9,Technology,DSY.PA,46.219702
10,Utilities,ENGI.PA,32.348443
1,Communication Services,ORA.PA,24.869667


In [47]:
import os
os.chdir(r"/Users/patash/Documents/Github")

In [49]:
# Exporter cac40 en fichier CSV
cac40.to_csv('market_capitalization.csv')

In [50]:
symbols=composantes['Ticker'].to_list()
symbols

['AI.PA',
 'ORA.PA',
 'MC.PA',
 'OR.PA',
 'TTE.PA',
 'CS.PA',
 'SAN.PA',
 'SU.PA',
 'URW.PA',
 'DSY.PA',
 'ENGI.PA']

In [51]:
# Extraction des séries temporelles des cours des actions (Adj Close, Volume, etc)
# !!! Vérifier les données de chaque entreprise quand on supprime les NULLS
stocks_prices = yf.download(tickers, start="2000-01-01")['Adj Close']
stocks_prices
# stocks_prices.dropna(inplace=True)

[*********************100%%**********************]  40 of 40 completed


Ticker,ACA.PA,AI.PA,AIR.PA,ALO.PA,BN.PA,BNP.PA,CA.PA,CAP.PA,CS.PA,DG.PA,...,SAN.PA,SGO.PA,STLAP.PA,STMPA.PA,SU.PA,TEP.PA,TTE.PA,URW.PA,VIE.PA,WLN.PA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,,16.686274,,,14.192828,13.191471,47.607185,155.200439,5.963917,4.683844,...,17.602266,23.122625,,,19.410328,14.736809,10.012562,,,
2000-01-04,,15.945753,,,13.578416,12.820683,44.597694,143.859070,5.744657,4.412099,...,16.507599,22.691317,,,18.722368,15.428928,9.648470,,,
2000-01-05,,15.797651,,,13.516971,12.549725,42.496223,129.667252,5.656950,4.512372,...,15.811394,22.415770,,,17.936129,14.814575,9.405739,,,
2000-01-06,,16.933104,,,14.500033,12.121893,41.769798,130.516312,5.613098,4.662786,...,16.420025,22.319921,,,18.845215,15.553359,9.132668,,,
2000-01-07,,16.725761,,,14.561472,12.278764,43.585880,137.066422,5.788507,4.813196,...,17.295761,21.996439,,,19.189198,15.786659,9.178184,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-24,13.285,164.759995,148.779999,15.675,58.860001,61.270000,13.805000,187.050003,30.900000,102.900002,...,90.360001,73.160004,19.374001,37.759998,227.149994,100.099998,62.509998,74.599998,29.160000,10.235
2024-06-25,13.140,166.520004,134.779999,15.460,58.160000,60.560001,13.800000,189.500000,30.760000,101.500000,...,90.870003,73.320000,19.358000,37.224998,226.449997,98.680000,62.880001,74.400002,28.750000,10.265
2024-06-26,12.980,164.820007,130.979996,15.330,58.080002,60.250000,13.555000,188.550003,30.660000,101.150002,...,90.790001,73.419998,19.332001,36.840000,227.199997,100.699997,62.040001,72.699997,28.650000,10.045
2024-06-27,12.930,163.479996,130.639999,15.505,57.240002,60.020000,13.380000,187.949997,30.370001,98.900002,...,89.860001,73.199997,18.556000,36.314999,226.300003,101.400002,62.040001,73.540001,28.110001,10.245


In [52]:
# Exporter stocks_prices en fichier CSV
stocks_prices.to_csv('stocks_prices.csv')

In [53]:
# Calculer les rendements hebdomadaires des actions - 5 jours
period=5
rendements=stocks_prices.shift(-period).pct_change(period)
rendements
# rendements.dropna(inplace=True)

Ticker,ACA.PA,AI.PA,AIR.PA,ALO.PA,BN.PA,BNP.PA,CA.PA,CAP.PA,CS.PA,DG.PA,...,SAN.PA,SGO.PA,STLAP.PA,STMPA.PA,SU.PA,TEP.PA,TTE.PA,URW.PA,VIE.PA,WLN.PA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,,,,,,,,,,,...,,,,,,,,,,
2000-01-04,,,,,,,,,,,...,,,,,,,,,,
2000-01-05,,,,,,,,,,,...,,,,,,,,,,
2000-01-06,,,,,,,,,,,...,,,,,,,,,,
2000-01-07,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-24,-0.041024,-0.021122,-0.137922,0.001595,-0.030241,-0.028399,-0.043825,-0.007217,-0.010680,-0.043926,...,-0.004648,-0.007381,-0.046351,-0.023702,-0.012547,-0.017582,-0.002879,-0.014477,-0.042524,-0.011236
2024-06-25,-0.030441,-0.031468,-0.048375,0.015524,-0.018569,-0.017008,-0.043478,-0.020053,-0.006177,-0.030739,...,-0.010234,-0.009547,-0.045563,-0.009671,-0.009494,-0.003446,-0.008747,-0.011828,-0.028870,-0.014126
2024-06-26,-0.018490,-0.021478,-0.020767,0.024136,-0.017218,-0.011950,-0.026190,-0.015115,-0.002935,-0.027385,...,-0.009362,-0.010896,-0.044279,0.000679,-0.012764,-0.023436,0.004674,0.011279,-0.025480,0.007466
2024-06-27,-0.014695,-0.013457,-0.018218,0.012577,-0.002795,-0.008164,-0.013453,-0.011971,0.006585,-0.005258,...,0.000890,-0.007923,-0.004311,0.015145,-0.008838,-0.030178,0.004674,-0.000272,-0.006759,-0.012201


In [54]:
# Exporter rendements en fichier CSV
rendements.to_csv('rendements_5d.csv')

In [61]:
# Moyennes mobiles de 5, 14, 30, 50 et 200 jours LVMH
SMA_EMA =pd.DataFrame()
for n in [5, 14, 30, 50, 200]:
    
    # Création des indicateurs de moyenne mobile et division par le prix de clôture
    SMA_EMA['sma'+ str(n)] = (stocks_prices['MC.PA'].rolling(window=n).mean())/stocks_prices['MC.PA']

    # Création des indicateurs de moyenne mobile exponentielle et division par le prix de clôture
    SMA_EMA['ema'+ str(n)] = pd.Series(stocks_prices['MC.PA'].ewm(span=n, min_periods=n).mean())/stocks_prices['MC.PA']
    
SMA_EMA

Unnamed: 0_level_0,sma5,ema5,sma14,ema14,sma30,ema30,sma50,ema50,sma200,ema200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-01-03,,,,,,,,,,
2000-01-04,,,,,,,,,,
2000-01-05,,,,,,,,,,
2000-01-06,,,,,,,,,,
2000-01-07,1.042917,1.019135,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
2024-06-24,0.984033,0.990263,1.005889,1.000642,1.025510,1.021326,1.047997,1.038585,1.022571,1.047085
2024-06-25,0.977524,0.984280,0.990296,0.988433,1.008682,1.006594,1.031975,1.023122,1.008326,1.032125
2024-06-26,0.996204,0.998366,1.000138,1.001523,1.019436,1.018862,1.044581,1.035466,1.021955,1.045580
2024-06-27,1.012746,1.009608,1.011520,1.015271,1.032537,1.032965,1.059108,1.050065,1.038369,1.061764


In [57]:
pip install ta

Note: you may need to restart the kernel to use updated packages.


In [62]:
import ta
# RSI LVMH (indice de force relative: < 30 - survente, > 70 -surachat) de 5, 14, 30, 50 et 200 jours
# 100 - (100/(1+H/B))

RSI = pd.DataFrame()
periods = [5, 14, 30, 50, 200]
for n in periods:
    RSI['RSI' + str(n)] = ta.momentum.RSIIndicator(stocks_prices['MC.PA'], window=n).rsi()
RSI

Unnamed: 0_level_0,RSI5,RSI14,RSI30,RSI50,RSI200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-03,,,,,
2000-01-04,,,,,
2000-01-05,,,,,
2000-01-06,,,,,
2000-01-07,0.000000,,,,
...,...,...,...,...,...
2024-06-24,58.003614,43.270920,42.818846,45.158911,49.543902
2024-06-25,69.229341,48.752588,45.394586,46.612511,49.849726
2024-06-26,52.529724,44.360446,43.467395,45.443094,49.562465
2024-06-27,38.800735,39.826205,41.337023,44.117756,49.227905


In [55]:
# Extraction les volumes des actions négociées 
volume = yf.download(tickers, start="2000-01-01")['Volume']
volume

[*********************100%%**********************]  40 of 40 completed


Ticker,ACA.PA,AI.PA,AIR.PA,ALO.PA,BN.PA,BNP.PA,CA.PA,CAP.PA,CS.PA,DG.PA,...,SAN.PA,SGO.PA,STLAP.PA,STMPA.PA,SU.PA,TEP.PA,TTE.PA,URW.PA,VIE.PA,WLN.PA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,,1094181.0,,,3382058.0,3712665.0,757948.0,492603.0,12220955.0,235369.0,...,710248.0,1163036.0,,,961380.0,334079.0,4044084.0,,,
2000-01-04,,1671548.0,,,6230117.0,6404755.0,1799994.0,357912.0,25162210.0,159096.0,...,1102290.0,1239792.0,,,908412.0,512119.0,6565240.0,,,
2000-01-05,,1033053.0,,,5252492.0,4837347.0,2222860.0,784213.0,20716144.0,535013.0,...,1457955.0,1275800.0,,,960046.0,403416.0,8296864.0,,,
2000-01-06,,1678435.0,,,9245939.0,6594478.0,2263606.0,468768.0,10870492.0,215557.0,...,1054723.0,735768.0,,,1052494.0,320457.0,9581504.0,,,
2000-01-07,,2659862.0,,,8866494.0,5929662.0,1487008.0,550392.0,15711483.0,373859.0,...,1688898.0,933268.0,,,2573564.0,980853.0,16985068.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-24,4914558.0,610195.0,901064.0,1232988.0,1359600.0,3799351.0,1916231.0,289760.0,3613473.0,1025810.0,...,2067170.0,883406.0,2040183.0,1253631.0,480438.0,243350.0,2740180.0,290436.0,2202288.0,1180534.0
2024-06-25,4851276.0,915260.0,8026551.0,1201500.0,1365709.0,3411239.0,1732004.0,333200.0,3363255.0,866266.0,...,1526460.0,877685.0,1828186.0,1775139.0,804610.0,179905.0,2926768.0,398463.0,2212099.0,926229.0
2024-06-26,6770937.0,714299.0,3087889.0,1130474.0,1140544.0,3252961.0,2556248.0,354198.0,3059712.0,969193.0,...,2291841.0,1093329.0,2353982.0,1317316.0,762912.0,217760.0,3708769.0,392584.0,1823532.0,1187657.0
2024-06-27,4361387.0,453781.0,1535124.0,1709881.0,1313573.0,3437993.0,2180942.0,315071.0,5676440.0,1404021.0,...,1522407.0,713231.0,3346965.0,1480567.0,709380.0,377760.0,2275177.0,271918.0,1797673.0,1519856.0


In [59]:
# Exporter volumes en fichier CSV
volume.to_csv('volume.csv')

In [60]:
# Rendements décalés de 5 jours
lags_rendements_5d = pd.concat(
     [stocks_prices[stk].shift(period).pct_change() for stk in tickers],
     axis=1
)
lags_rendements_5d.columns=[i+ '_DT' for i in tickers]
# lags_rendements_5d.dropna(inplace=True)
lags_rendements_5d

Unnamed: 0_level_0,AI.PA_DT,AIR.PA_DT,ALO.PA_DT,MT.AS_DT,CS.PA_DT,BNP.PA_DT,EN.PA_DT,CAP.PA_DT,CA.PA_DT,ACA.PA_DT,...,GLE.PA_DT,STLAP.PA_DT,STMPA.PA_DT,TEP.PA_DT,HO.PA_DT,TTE.PA_DT,URW.PA_DT,VIE.PA_DT,DG.PA_DT,WLN.PA_DT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,,,,,,,,,,,...,,,,,,,,,,
2000-01-04,,,,,,,,,,,...,,,,,,,,,,
2000-01-05,,,,,,,,,,,...,,,,,,,,,,
2000-01-06,,,,,,,,,,,...,,,,,,,,,,
2000-01-07,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-24,0.006906,0.013767,0.032570,0.000458,0.018688,0.012498,0.010390,0.002743,0.009372,-0.003836,...,0.011738,0.008718,0.008375,0.036870,0.011643,0.009443,-0.026759,-0.007880,0.001630,-0.006763
2024-06-25,0.011722,0.015912,0.006246,0.011910,0.018346,0.005411,0.008997,0.005197,-0.042641,0.004621,...,0.000892,0.001897,0.008179,0.034732,0.006577,0.013871,0.025510,0.024549,0.022380,0.026265
2024-06-26,-0.005423,-0.001215,-0.005587,-0.004527,-0.001965,-0.012614,-0.011783,-0.002449,0.000718,-0.006516,...,-0.016050,-0.001473,-0.046181,0.011988,0.008494,-0.004672,0.003040,0.006343,-0.000995,-0.027962
2024-06-27,0.018094,0.005543,0.026841,0.018190,0.015425,0.023846,0.016436,0.030278,-0.003230,0.017361,...,0.015406,0.004952,0.005627,0.013820,0.021380,0.008579,0.020116,0.021359,0.019920,-0.013164


In [None]:
# Variable cible - y
# period=5
# y=stocks_prices['MC.PA'].shift(-period).pct_change(period)
# y.name='LVMH_5d_close_future_pct'
# y

In [None]:
# Créer un dataframe X - concatiner toutes les variables
# X = pd.concat([SMA_EMA, RSI, volume, lags_rendements_5d], axis=1)
# X

In [None]:
# clean_dataset = pd.concat([y, X], axis=1).dropna()
# clean_dataset

In [None]:
# days_of_week=pd.get_dummies(clean_dataset.index.dayofweek, prefix='weekday', drop_first=True)
# days_of_week.index = clean_dataset.index
# clean_dataset = pd.concat([clean_dataset, days_of_week], axis=1)
# clean_dataset