# Data collection

## Install and import packages

In [46]:
# 必要なパッケージをインストール
%pip install praw pandas tqdm matplotlib logging python-dotenv 

# Import packages
import os
import datetime
from logging import getLogger
import praw
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Parameters

In [47]:
# Load python-dotenv's IPython extension, then run its `%dotenv` magic so that
# variables from a .env file are available to the os.getenv calls below.
%load_ext dotenv
%dotenv


# Module-level logger used by the functions defined in the later cells.
logger = getLogger(__name__)

# Parameter declarations
# Reddit API credentials are read from the environment; os.getenv returns None
# for any variable that is not set.
REDDIT_CLIENT_ID=os.getenv('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET=os.getenv('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT=os.getenv('REDDIT_USER_AGENT')
SUBREDDITS = ['Palestine', 'Israel', 'IsraelPalestine']  # Target subreddits as a list (used to build the output file name)
# Alternative input file:
# SUBMISSIONS_CSV_FILE_NAME = 'submissions_Palestine_Israel_IsraelPalestine_20241026_173259.csv'
SUBMISSIONS_CSV_FILE_NAME = 'unmatched_rows.csv'  # CSV of submissions whose comments are fetched (its 'id' column is read in the Main cell)

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


## Fetch comments

In [48]:
# Fetch all comments for a list of submissions (synchronous: one submission at a time)
def fetch_comments(submission_ids, reddit):
    """Download every comment of the given submissions into a DataFrame.

    Each submission's comment tree is fully expanded and flattened, and one
    row is produced per comment. A submission that raises while being fetched
    is logged and skipped, so a single failure no longer truncates the whole
    crawl, and none of its partially collected rows are kept.

    Args:
        submission_ids: Sequence of Reddit submission ids (must support len()).
        reddit: A praw.Reddit client used to look up each submission.

    Returns:
        pandas.DataFrame with the columns comment_id, submission_id, author,
        body, created_utc, score, permalink, parent_id, is_submitter and
        subreddit. The columns are present even when no comment was fetched.
    """
    columns = [
        "comment_id",
        "submission_id",
        "author",
        "body",
        "created_utc",
        "score",
        "permalink",
        "parent_id",
        "is_submitter",
        "subreddit",
    ]
    comments_data = []
    failed_ids = []

    # tqdm shows progress; the bar advances once per submission, failed or not.
    with tqdm(total=len(submission_ids), desc="Fetching comments") as pbar:
        for submission_id in submission_ids:
            try:
                submission = reddit.submission(id=submission_id)
                # Expand every "load more comments" placeholder in the tree.
                submission.comments.replace_more(limit=None)

                # Build the rows in a local list so that a failure part-way
                # through leaves no partial data for this submission.
                submission_comments = [
                    {
                        "comment_id": comment.id,
                        "submission_id": submission_id,
                        # str() keeps the original output format; a missing
                        # author object therefore becomes the string "None".
                        "author": str(comment.author),
                        "body": comment.body,
                        "created_utc": comment.created_utc,
                        "score": comment.score,
                        "permalink": comment.permalink,
                        "parent_id": comment.parent_id,
                        "is_submitter": comment.is_submitter,
                        "subreddit": comment.subreddit.display_name,
                    }
                    for comment in submission.comments.list()
                ]
            except Exception as e:
                logger.error(f"Error fetching comments for submission {submission_id}: {e}")
                failed_ids.append(submission_id)
            else:
                comments_data.extend(submission_comments)
            finally:
                # Update progress
                pbar.update(1)

    if failed_ids:
        logger.warning(
            f"Skipped {len(failed_ids)} of {len(submission_ids)} submissions due to errors: {failed_ids}"
        )

    # Passing explicit columns keeps the schema stable even for an empty result.
    return pd.DataFrame(comments_data, columns=columns)

## Save data to csv

In [49]:
def save_to_csv(df):
    """Write ``df`` to a timestamped CSV file in the current directory.

    The file name has the form ``comments_<subreddits>_<YYYYmmdd_HHMMSS>.csv``,
    where ``<subreddits>`` is the module-level SUBREDDITS list joined with
    underscores. The DataFrame index is not written. The resulting file name
    is reported through the module logger at INFO level.
    """
    # Timestamp taken at call time, so successive runs produce distinct files.
    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = 'comments_{}_{}.csv'.format('_'.join(SUBREDDITS), stamp)

    df.to_csv(filename, index=False)
    logger.info(f'Data saved to {filename}')


## Main

In [50]:
# Entry point of the notebook: build the Reddit client, read the submission ids,
# fetch their comments and save them. Inside a notebook kernel __name__ is
# '__main__', so this guard is true when the cell is executed.
if __name__ == '__main__':
    # PRAW client configured with the credentials read in the Parameters cell.
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )
    # Load the submissions CSV and take the values of its 'id' column as a list.
    df = pd.read_csv(SUBMISSIONS_CSV_FILE_NAME)
    submission_ids = df['id'].tolist()
    # Fetch the comments for those submissions, then write them to a CSV file.
    comments_df = fetch_comments(submission_ids, reddit)
    save_to_csv(comments_df)

Fetching comments: 100%|██████████████████████████████████████████████████████████████| 851/851 [05:11<00:00,  2.74it/s]
