In [22]:
import sqlite3

# Reset the SQLite database: delete any file left over from a previous run,
# then recreate an empty database containing the full schema.
import os

DB_FILE = 'reddit_comments.db'

if os.path.exists(DB_FILE):
    os.remove(DB_FILE)
    print("Base reddit_comments.db supprimée")
else:
    print("Aucune base existante à supprimer")

# DDL statements, executed in this order. COMMENT comes last because it
# references every other table.
# NOTE: SQLite only enforces FOREIGN KEY clauses on connections that have run
# `PRAGMA foreign_keys = ON`; it is off by default, so these constraints are
# declarative unless the loading code enables the pragma.
SCHEMA_STATEMENTS = (
    '''
CREATE TABLE AUTHOR (
    author TEXT PRIMARY KEY
)
''',
    '''
CREATE TABLE SUBREDDIT (
    Subreddit_id TEXT PRIMARY KEY,
    subreddit TEXT NOT NULL
)
''',
    '''
CREATE TABLE SCORE (
    id TEXT PRIMARY KEY,
    score INTEGER,
    ups INTEGER,
    downs INTEGER,
    score_hidden BOOLEAN,
    gilded INTEGER
)
''',
    '''
CREATE TABLE REMOVAL (
    removal_reason TEXT PRIMARY KEY
)
''',
    '''
CREATE TABLE CONTROVERSY (
    controversiality INTEGER PRIMARY KEY
)
''',
    '''
CREATE TABLE DISTINGUISHED (
    distinguished TEXT PRIMARY KEY
)
''',
    '''
CREATE TABLE PARENT (
    parent_id TEXT PRIMARY KEY,
    link_id TEXT NOT NULL
)
''',
    '''
CREATE TABLE COMMENT (
    id TEXT PRIMARY KEY,
    created_utc TEXT,
    name TEXT,
    body TEXT,
    edited BOOLEAN,
    author_flair_css_class TEXT,
    author_flair_text TEXT,
    author TEXT,
    Subreddit_id TEXT,
    score_id TEXT,
    parent_id TEXT,
    removal_reason TEXT,
    controversiality INTEGER,
    distinguished TEXT,
    FOREIGN KEY (author) REFERENCES AUTHOR(author),
    FOREIGN KEY (Subreddit_id) REFERENCES SUBREDDIT(Subreddit_id),
    FOREIGN KEY (score_id) REFERENCES SCORE(id),
    FOREIGN KEY (parent_id) REFERENCES PARENT(parent_id),
    FOREIGN KEY (removal_reason) REFERENCES REMOVAL(removal_reason),
    FOREIGN KEY (controversiality) REFERENCES CONTROVERSY(controversiality),
    FOREIGN KEY (distinguished) REFERENCES DISTINGUISHED(distinguished)
)
''',
)

# Connecting to a missing file creates a fresh, empty database.
conn = sqlite3.connect(DB_FILE)
try:
    cursor = conn.cursor()
    for ddl in SCHEMA_STATEMENTS:
        cursor.execute(ddl)
    conn.commit()
finally:
    # Always release the file handle, even if a statement fails: a connection
    # left open in the kernel can prevent the file from being deleted on the
    # next run of this cell.
    conn.close()
print("Base réinitialisée et tables recréées")

Base reddit_comments.db supprimée
Base réinitialisée et tables recréées


In [None]:
import sqlite3
import pandas as pd

# Load the CSV exports into the SQLite tables created by the schema cell.
# pandas writes through a SQLAlchemy engine; the raw sqlite3 connection that
# used to be opened here was never used, so it is no longer created.
from sqlalchemy import create_engine
engine = create_engine('sqlite:///reddit_comments.db', echo=False)

# Folder containing the CSV files
data_path = 'data/excel/'

# CSV file -> target table and the columns to load.
# `columns: None` marks a file whose content is already covered by other
# tables; it is skipped.
files_info = {
    'askreddit_author.csv': {'table': 'AUTHOR', 'columns': ['author']},
    'askreddit_comment.csv': {'table': 'COMMENT', 'columns': ['id', 'created_utc', 'name', 'body', 'edited', 'author_flair_css_class', 'author_flair_text', 'author', 'Subreddit_id', 'score_id', 'parent_id', 'removal_reason', 'controversiality', 'distinguished']},
    'askreddit_controverse.csv': {'table': 'CONTROVERSY', 'columns': ['controversiality']},
    'askreddit_depends.csv': {'table': 'PARENT', 'columns': ['parent_id', 'link_id']},
    'askreddit_distinguihshed.csv': {'table': 'DISTINGUISHED', 'columns': ['distinguished']},
    'askreddit_is_distinguihshed.csv': {'table': 'COMMENT', 'columns': None},
    'askreddit_parent.csv': {'table': 'PARENT', 'columns': ['parent_id', 'link_id']},
    'askreddit_removal.csv': {'table': 'REMOVAL', 'columns': ['removal_reason']},
    'askreddit_removed.csv': {'table': 'COMMENT', 'columns': None},
    'askreddit_score.csv': {'table': 'SCORE', 'columns': ['id', 'score', 'ups', 'downs', 'score_hidden', 'gilded']},
    'askreddit_subreddit.csv': {'table': 'SUBREDDIT', 'columns': ['Subreddit_id', 'subreddit']}
}


def insert_or_ignore(pd_table, sa_conn, keys, data_iter):
    """Insertion method for ``DataFrame.to_sql`` that emits ``INSERT OR IGNORE``.

    Rows whose primary key already exists in the table are skipped instead of
    raising an IntegrityError. This matters here because two CSV files feed
    the PARENT table, and ``drop_duplicates()`` only removes rows that are
    identical across every column within a single file.

    Parameters follow the callable signature documented for ``to_sql``:
        pd_table:  pandas SQLTable wrapper; ``pd_table.table`` is the
                   SQLAlchemy Table being written to.
        sa_conn:   SQLAlchemy connection provided by pandas.
        keys:      column names, in the order of the values in each row.
        data_iter: iterable of row tuples for the current chunk.

    Returns the row count reported by the driver for the chunk.
    """
    rows = [dict(zip(keys, row)) for row in data_iter]
    if not rows:
        return 0
    statement = pd_table.table.insert().prefix_with('OR IGNORE')
    # Passing a list of dicts runs an executemany: parameters are bound one row
    # at a time, so a wide table cannot exceed SQLite's per-statement
    # bound-parameter limit the way a single multi-row VALUES clause can.
    return sa_conn.execute(statement, rows).rowcount


try:
    for file, info in files_info.items():
        table = info['table']
        expected_columns = info['columns']

        if not expected_columns:
            # Skip before reading: no need to parse a CSV that is discarded.
            print(f"⚠️ {file} ignoré car déjà pris en compte via d'autres tables")
            continue

        df = pd.read_csv(f'{data_path}{file}')

        # Keep only the columns present in both the CSV and the SQL table.
        common_cols = [col for col in expected_columns if col in df.columns]
        if not common_cols:
            # Nothing to insert: an INSERT with zero columns would be invalid.
            print(f"⚠️ {file} ignoré : aucune colonne attendue trouvée dans le CSV")
            continue

        df = df[common_cols].drop_duplicates()
        df.to_sql(table, con=engine, if_exists='append', index=False, method=insert_or_ignore, chunksize=500)
        print(f"✅ {file} inséré dans {table} (colonnes présentes: {common_cols})")
finally:
    # Release pooled connections so the database file is not kept open by the
    # kernel (an open handle can block deleting/recreating the file). The
    # engine object stays usable afterwards; it opens new connections on demand.
    engine.dispose()

✅ askreddit_author.csv inséré dans AUTHOR (colonnes présentes: ['author'])
