In [3]:
# csv_to_sqlite.py
import sqlite3
import csv
import os
from datetime import datetime

# Configuration
CSV_FILE = "../data/netflix_titles.csv"
DB_FILE = "../data/netflix.db"

def create_db():
    """Cr√©e la base de donn√©es et la table avec le sch√©ma adapt√©"""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()

    cursor.execute("""
    CREATE TABLE IF NOT EXISTS shows (
        show_id TEXT PRIMARY KEY,
        type TEXT NOT NULL,
        title TEXT NOT NULL,
        director TEXT,
        cast TEXT,
        country TEXT,
        date_added TEXT,
        release_year INTEGER,
        rating TEXT,
        duration TEXT,
        listed_in TEXT,
        description TEXT,
        added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
    """)

    # Cr√©er un index pour les requ√™tes fr√©quentes
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_type ON shows(type)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_country ON shows(country)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_year ON shows(release_year)")

    conn.commit()
    conn.close()

def clean_data(value):
    """Nettoie les valeurs du CSV (remplace les vides par NULL)"""
    return None if value == "" else value

def import_csv():
    """Import les donn√©es du CSV en g√©rant les valeurs vides"""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()

    with open(CSV_FILE, mode='r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)

        for i, row in enumerate(csv_reader):
            # Nettoyer les donn√©es
            clean_row = {k: clean_data(v) for k, v in row.items()}

            # Convertir la date au format ISO
            try:
                date_obj = datetime.strptime(clean_row['date_added'], "%B %d, %Y")
                clean_row['date_added'] = date_obj.strftime("%Y-%m-%d")
            except:
                pass  # Garder la date originale si le format est invalide

            # Ins√©rer dans la base
            cursor.execute("""
            INSERT OR IGNORE INTO shows (
                show_id, type, title, director, cast, country,
                date_added, release_year, rating, duration,
                listed_in, description
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                clean_row['show_id'],
                clean_row['type'],
                clean_row['title'],
                clean_row['director'],
                clean_row['cast'],
                clean_row['country'],
                clean_row['date_added'],
                int(clean_row['release_year']) if clean_row['release_year'] else None,
                clean_row['rating'],
                clean_row['duration'],
                clean_row['listed_in'],
                clean_row['description']
            ))

            if i % 1000 == 0:
                conn.commit()
                print(f"Import√© {i} lignes...")

    conn.commit()
    conn.close()
    print(f"‚úÖ Import termin√©: {i+1} lignes import√©es dans {DB_FILE}")

def verify_data():
    """V√©rifie les donn√©es import√©es"""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()

    # Compter les entr√©es
    cursor.execute("SELECT COUNT(*) FROM shows")
    count = cursor.fetchone()[0]
    print(f"\nüìä Statistiques:")
    print(f"- Total: {count} films/s√©ries")

    # R√©partition par type
    cursor.execute("SELECT type, COUNT(*) FROM shows GROUP BY type")
    for type_, count in cursor:
        print(f"- {type_}: {count}")

    # Ann√©es de sortie
    cursor.execute("SELECT MIN(release_year), MAX(release_year) FROM shows")
    min_year, max_year = cursor.fetchone()
    print(f"- P√©riode: {min_year} √† {max_year}")

    # Pays les plus repr√©sent√©s
    cursor.execute("SELECT country, COUNT(*) FROM shows GROUP BY country ORDER BY COUNT(*) DESC LIMIT 3")
    print("\nüåç Top 3 pays:")
    for country, count in cursor:
        print(f"- {country}: {count}")

    conn.close()

if __name__ == "__main__":
    if not os.path.exists(CSV_FILE):
        print(f"‚ùå Fichier {CSV_FILE} introuvable!")
    else:
        print("üîß Cr√©ation de la base de donn√©es...")
        create_db()
        print("üì§ Import des donn√©es...")
        import_csv()
        verify_data()


üîß Cr√©ation de la base de donn√©es...
üì§ Import des donn√©es...
Import√© 0 lignes...
Import√© 1000 lignes...
Import√© 2000 lignes...
Import√© 3000 lignes...
Import√© 4000 lignes...
Import√© 5000 lignes...
Import√© 6000 lignes...
Import√© 7000 lignes...
Import√© 8000 lignes...
‚úÖ Import termin√©: 8807 lignes import√©es dans ../data/netflix.db

üìä Statistiques:
- Total: 8807 films/s√©ries
- Movie: 6131
- TV Show: 2676
- P√©riode: 1925 √† 2021

üåç Top 3 pays:
- United States: 2818
- India: 972
- None: 831
