# Introduction
This notebook collects posts and comments related to climate change from the r/france subreddit.
It queries Reddit with the keyword list split into small batches (two to seven keywords each) and keeps only French-language text.
Posts or comments whose id has already been seen are skipped, which removes duplicates.


In [5]:
import datetime as dt
import os
import time
import unicodedata
import urllib.request

import fasttext
import pandas as pd
import praw

# 1. Authenticate
# Credentials are read from environment variables so that secrets never have
# to be typed into (or saved with) the notebook. A variable that is not set
# falls back to the empty string, which is what the placeholders used to be.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
    user_agent=os.environ.get("REDDIT_USER_AGENT", ""),
)

In [3]:

# Getting the fastText language-identification model (used to detect French).
# The file is downloaded only when it is not already present locally, so
# re-running the notebook does not hit the network again. The download goes to
# a temporary name first and is renamed once complete, so an interrupted
# transfer never leaves a truncated "lid.176.ftz" behind.
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
MODEL_PATH = "lid.176.ftz"
if not os.path.exists(MODEL_PATH):
    partial_path = MODEL_PATH + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, MODEL_PATH)

model = fasttext.load_model(MODEL_PATH)
# Sanity check: a French sentence should come back labelled `__label__fr`.
print(model.predict("Le camembert est bon"))

(('__label__fr',), array([0.80718303]))


In [6]:
# ── 1. Subreddit & keyword batches ────────────────────────────────────────
SUB = "france"

# Every inner list becomes one search query (its terms are OR-joined later).
# Terms wrapped in double quotes are multi-word phrases.
BATCHES = [
    [
        "climat",
        '"effet de serre"',
        "carbone",
    ],
    [
        "renouvelable",
        "énergie",
        "durable",
        "durabilité",
    ],
    [
        "canicule",
        "canicules",
    ],
    [
        "écologie",
        "écologique",
    ],
    [
        "éco-conscient",
        '"eco-friendly"',
    ],
    [
        "environnement",
        "environnemental",
        "environnementale",
        "CO2",
        '"montée des eaux"',
        '"niveau de la mer"',
        '"événements météorologiques extrêmes"',
    ],
]

# Flattened keyword set used to filter comments: surrounding double quotes are
# removed, text is lower-cased, and accents are dropped (NFKD decomposition
# followed by discarding every non-ASCII code point).
KW_SET = {
    decomposed.encode("ascii", "ignore").decode()
    for decomposed in (
        unicodedata.normalize("NFKD", raw_term.strip('"').lower())
        for raw_term in sum(BATCHES, [])
    )
}

def contains_keyword(text: str) -> bool:
    """Tell whether `text` mentions at least one keyword from `KW_SET`.

    The text is lower-cased and stripped of accents before the comparison,
    and a keyword counts as present when it occurs as a substring.
    """
    decomposed = unicodedata.normalize("NFKD", text.lower())
    folded = decomposed.encode("ascii", "ignore").decode()
    for keyword in KW_SET:
        if keyword in folded:
            return True
    return False

# ── 2. FastText French detector ───────────────────────────────────────────
# Language-identification model, read from the local "lid.176.ftz" file.
ft = fasttext.load_model("lid.176.ftz")

def is_french(text: str, thr: float = 0.60) -> bool:
    """Tell whether fastText labels `text` as French with enough confidence.

    The text is lower-cased, stripped of accents, flattened to a single line
    and trimmed before prediction. Text that is empty after this clean-up is
    never considered French.

    Args:
        text: Raw text to classify.
        thr: Probability the top prediction must strictly exceed.

    Returns:
        True when the top label is ``__label__fr`` and its probability is
        greater than `thr`; False otherwise.
    """
    folded = unicodedata.normalize("NFKD", text.lower()).encode("ascii", "ignore").decode()
    # Newlines are replaced with spaces before the text is handed to predict().
    single_line = folded.replace("\n", " ").strip()
    if single_line == "":
        return False

    labels, probs = ft.predict(single_line, k=1)
    if labels[0] != "__label__fr":
        return False
    return probs[0] > thr

# ── 3. Crawl ──────────────────────────────────────────────────────────────
def _utc_naive(epoch_seconds: float) -> dt.datetime:
    """Convert a Unix timestamp to a naive datetime expressed in UTC.

    Yields the same value as the deprecated `datetime.utcfromtimestamp`,
    without triggering its DeprecationWarning.
    """
    return dt.datetime.fromtimestamp(epoch_seconds, dt.timezone.utc).replace(tzinfo=None)


# `rows` accumulates one dict per kept post/comment; `seen` holds every id
# already visited (kept or not) so that an item returned by several keyword
# batches is processed only once.
rows, seen = [], set()
sr = reddit.subreddit(SUB)

for terms in BATCHES:
    # One search per batch: the batch's keywords are combined with OR.
    query = " OR ".join(terms)
    print(f"🔎  Searching r/{SUB}: {query}")
    for post in sr.search(query, sort="new", time_filter="year", limit=None):
        if post.id in seen:
            continue
        seen.add(post.id)

        # Title and self-text are judged (and stored) together.
        post_text = f"{post.title}\n{post.selftext or ''}"
        if not is_french(post_text):
            continue   # skip English or other languages

        # ─ save submission ─
        rows.append({
            "kind"     : "post",
            "id"       : post.id,
            "parent_id": "",
            "created"  : _utc_naive(post.created_utc),
            "subreddit": SUB,
            "body"     : post_text,
            "score"    : post.score,
            "url"      : post.url
        })

        # ─ walk comment tree ─
        # Expand every "load more comments" placeholder. If that fails, the
        # post itself stays in `rows` but its comments are skipped.
        try:
            post.comments.replace_more(limit=None)
        except Exception as e:
            print(f"⚠️ replace_more failed on {post.id}: {e}")
            continue

        for com in post.comments.list():
            if com.id in seen:
                continue
            seen.add(com.id)

            # A comment is kept only if it is French AND mentions a keyword.
            if not is_french(com.body):
                continue
            if not contains_keyword(com.body):
                continue

            rows.append({
                "kind"     : "comment",
                "id"       : com.id,
                "parent_id": com.parent_id.split("_")[1],  # strip t1_/t3_
                "created"  : _utc_naive(com.created_utc),
                "subreddit": SUB,
                "body"     : com.body,
                "score"    : com.score
            })

    time.sleep(1.1)   # pause between keyword batches

# ── 4. Save ───────────────────────────────────────────────────────────────
# A single path variable feeds both the write and the confirmation message,
# so the message always names the file that was actually written.
OUT_CSV = "r_france_climate_posts_comments.csv"
df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False)
print(f"✅  Saved {len(df):,} on-topic French rows to {OUT_CSV}")

🔎  Searching r/france: climat OR "effet de serre" OR carbone


  "created"  : dt.datetime.utcfromtimestamp(post.created_utc),
  "created"  : dt.datetime.utcfromtimestamp(com.created_utc),


🔎  Searching r/france: renouvelable OR énergie OR durable OR durabilité
🔎  Searching r/france: canicule OR canicules
🔎  Searching r/france: écologie OR écologique
🔎  Searching r/france: éco-conscient OR "eco-friendly"
🔎  Searching r/france: environnement OR environnemental OR environnementale OR CO2 OR "montée des eaux" OR "niveau de la mer" OR "événements météorologiques extrêmes"
✅  Saved 3,629 on-topic French rows to r_france_climate_posts_comments_V2.csv
