# 📊 Extraction de Données en Ligne
Ce notebook contient les scripts d'extraction de données depuis différentes plateformes :
- Amazon (BeautifulSoup)
- Twitter (API v2)
- Instagram (Graph API)
- YouTube (Data API)
- Google Search (`googlesearch`)
- Reddit (PRAW)
- Wikipedia (bibliothèque `wikipedia`)


In [None]:
# 📦 Installation des bibliothèques (exécutez si nécessaire)
!pip install requests beautifulsoup4 tweepy google-api-python-client googlesearch-python praw wikipedia pandas

In [None]:
# 1. Amazon Web Scraping
# Fetch one Amazon.fr search-results page and collect the title and price of
# every result item that exposes both.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.amazon.fr/s?k=ordinateur+portable"
headers = {"User-Agent": "Mozilla/5.0"}

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an
# error page into an empty table.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

products = []
for item in soup.select(".s-main-slot .s-result-item"):
    title = item.select_one("h2 span")
    price = item.select_one(".a-price .a-offscreen")
    # Skip result items that lack either a title or a price.
    if title and price:
        products.append({
            "Produit": title.get_text(strip=True),
            "Prix": price.get_text(strip=True),
        })

df_amazon = pd.DataFrame(products)
df_amazon.head()

In [None]:
# 2. Twitter API v2
# Search recent tweets for a query and tabulate each tweet's text together
# with its like and retweet counts.
import tweepy
import pandas as pd  # imported here as well so the cell can run on its own

client = tweepy.Client(bearer_token="VOTRE_BEARER_TOKEN")  # Replace with your key
query = "Data Science"
tweets = client.search_recent_tweets(query=query, max_results=10, tweet_fields=["public_metrics"])

data = []
# `tweets.data` is None (not an empty list) when the search matches nothing,
# so fall back to an empty list instead of iterating over None.
for tweet in tweets.data or []:
    metrics = tweet.public_metrics
    data.append({
        "Tweet": tweet.text,
        "Likes": metrics["like_count"],
        "Retweets": metrics["retweet_count"],
    })
df_twitter = pd.DataFrame(data)
df_twitter.head()

In [None]:
# 3. Instagram API (Graph)
# List a user's media and tabulate caption, like count and media URL.
import requests
import pandas as pd  # imported here as well so the cell can run on its own

access_token = "VOTRE_TOKEN"
user_id = "UTILISATEUR_ID"
url = f"https://graph.instagram.com/{user_id}/media"
# Passing the query through `params` lets requests URL-encode the values
# instead of splicing the raw token into the URL string.
params = {
    "fields": "id,caption,media_url,like_count",
    "access_token": access_token,
}
response = requests.get(url, params=params, timeout=10)
# Fail with the HTTP error rather than with a KeyError on the missing "data" key.
response.raise_for_status()
data = response.json().get("data", [])

posts = [
    {
        "Caption": post.get("caption", ""),
        "Likes": post.get("like_count", 0),
        "Image": post.get("media_url", ""),
    }
    for post in data
]
df_insta = pd.DataFrame(posts)
df_insta.head()

In [None]:
# 4. YouTube API
# Run a video search through the YouTube Data API v3 client and tabulate the
# title and channel name found in each result's snippet.
from googleapiclient.discovery import build

api_key = "VOTRE_API_KEY"
youtube = build("youtube", "v3", developerKey=api_key)

request = youtube.search().list(
    q="Python programming",
    part="snippet",
    type="video",
    maxResults=5,
)
response = request.execute()

# One row per returned item, built directly from its "snippet" mapping.
videos = [
    {
        "Titre": entry["snippet"]["title"],
        "Chaîne": entry["snippet"]["channelTitle"],
    }
    for entry in response["items"]
]
df_youtube = pd.DataFrame(videos)
df_youtube.head()

In [None]:
# 5. Google Search
# Collect the results yielded by `googlesearch.search` for the query into a
# single-column DataFrame.
from googlesearch import search

query = "Data Science cours gratuits"
# Materialise the iterable returned by search() before handing it to pandas.
results = [*search(query, num_results=10)]
df_google = pd.DataFrame(data=results, columns=["Résultats"])
df_google.head()

In [None]:
# 6. Reddit API (PRAW)
# Read five submissions from the "hot" listing of r/datascience and tabulate
# their title, score and comment count.
import praw

reddit = praw.Reddit(
    client_id="VOTRE_ID",
    client_secret="VOTRE_SECRET",
    user_agent="testscript",
)

hot_submissions = reddit.subreddit("datascience").hot(limit=5)
posts = [
    {
        "Titre": submission.title,
        "Votes": submission.score,
        "Commentaires": submission.num_comments,
    }
    for submission in hot_submissions
]
df_reddit = pd.DataFrame(posts)
df_reddit.head()

In [None]:
# 7. Wikipedia API
# Fetch the text content of the French-language page "Science des données"
# and store it in a one-row DataFrame.
import wikipedia

# Select the French edition before looking up the page.
wikipedia.set_lang("fr")
content = wikipedia.page("Science des données").content
df_wiki = pd.DataFrame({"Contenu": [content]})
df_wiki.head(1)