# Introduction
This notebook collects posts and comments related to climate change from the France sub reddit. 
It passes the keyword list as small batches of 2 or 3 keywords.
Posts or comments whose id is already in the dataframe are passed, removing duplicates.


In [35]:
import praw
import fasttext
import datetime as dt
import pandas as pd
import unicodedata
import time

# 1. Authenticate
reddit = praw.Reddit(
    client_id="",         
    client_secret="",  
    user_agent=""
)

In [None]:

# Getting the french fasttext model
import urllib.request, fasttext
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
urllib.request.urlretrieve(url, "lid.176.ftz")

model = fasttext.load_model("lid.176.ftz")
print(model.predict("Le camembert est bon"))

In [33]:
# ── 1. Target subreddit ───────────────────────────────────────────────────
SUB = "france"

# ── 2. Climate keyword batches (<512 chars each) ──────────────────────────
BATCHES = [
    ["climat", '"effet de serre"', "carbone"],
    ["renouvelable", "énergie", "durable", "durabilité"],
    ["canicule", "canicules"],
    ["écologie", "écologique"],
    ["éco-conscient", '"eco-friendly"'],
    ["environnement", "environnemental", "environnementale",
     "CO2", '"montée des eaux"', '"niveau de la mer"',
     '"événements météorologiques extrêmes"']
]

# ── 3. FastText language detector ─────────────────────────────────────────
ft = fasttext.load_model("lid.176.ftz")    

def is_french(text: str, threshold: float = 0.60) -> bool:
    """
    Returns True if `text` is French with probability > threshold.
    - strips accents & newlines (fastText needs one-line input)
    """
    cleaned = unicodedata.normalize("NFKD", text.lower())\
                          .encode("ascii", "ignore").decode()\
                          .replace("\n", " ").strip()
    if not cleaned:
        return False
    label, prob = ft.predict(cleaned, k=1)   # (['__label__fr'], [0.95])
    return label[0] == "__label__fr" and prob[0] > threshold

# ── 4. Harvest loop ───────────────────────────────────────────────────────
rows, seen_ids = [], set()
sr = reddit.subreddit(SUB)

for terms in BATCHES:
    query = " OR ".join(terms)
    print(f"Querying r/{SUB}: {query}")
    
    for post in sr.search(query, sort="new", time_filter="year", limit=None):
        if post.id in seen_ids:
            continue
        seen_ids.add(post.id)

        post_text = f"{post.title}\n{post.selftext or ''}"
        if not is_french(post_text):
            continue

        # save submission
        rows.append({
            "kind"     : "post",
            "id"       : post.id,
            "parent_id": "",
            "created"  : dt.datetime.utcfromtimestamp(post.created_utc),
            "subreddit": SUB,
            "body"     : post_text,
            "score"    : post.score,
            "url"      : post.url
        })

        # pull full comment tree
        try:
            post.comments.replace_more(limit=None)
        except Exception as e:
            print(f"replace_more failed on {post.id}: {e}")
            continue

        for com in post.comments.list():
            if com.id in seen_ids:
                continue
            seen_ids.add(com.id)

            if not is_french(com.body):
                continue

            rows.append({
                "kind"     : "comment",
                "id"       : com.id,
                "parent_id": com.parent_id.split("_")[1],  # strip t1_/t3_
                "created"  : dt.datetime.utcfromtimestamp(com.created_utc),
                "subreddit": SUB,
                "body"     : com.body,
                "score"    : com.score
            })
    
    time.sleep(1.1)   # stay under 60 requests/minute

# ── 5. Save to CSV ────────────────────────────────────────────────────────
df = pd.DataFrame(rows)
df.to_csv("r_france_climate_posts_comments.csv", index=False)
print(f"Saved {len(df):,} rows to r_france_climate_posts_comments.csv")

Querying r/france: climat OR "effet de serre" OR carbone


  "created"  : dt.datetime.utcfromtimestamp(post.created_utc),
  "created"  : dt.datetime.utcfromtimestamp(com.created_utc),


Querying r/france: renouvelable OR énergie OR durable OR durabilité
Querying r/france: canicule OR canicules
Querying r/france: écologie OR écologique
Querying r/france: éco-conscient OR "eco-friendly"
Querying r/france: environnement OR environnemental OR environnementale OR CO2 OR "montée des eaux" OR "niveau de la mer" OR "événements météorologiques extrêmes"
Saved 26,733 rows to r_france_climate_posts_comments.csv
