In [3]:
import pandas as pd
import glob

In [4]:
# Input CSVs to merge, in load order (one per forum, judging by the file names).
csv_files = ["anxiety_2015_25.csv", "depression_2015_25.csv",
             "ptsd_2015_2025.csv", "suicide_2015_25.csv"]

# Target schema: every input is selected/reordered to exactly these columns,
# in this order, before being merged.
standard_columns = [
    # thread / original post fields
    "forum_name", "thread_id", "post_content", "author", "post_timestamp",
    # comment (reply) fields
    "no_of_comments", "comments_content", "authors_comment", "comment_authors",
]

In [5]:
# Merge every file in `csv_files` into one DataFrame with the
# `standard_columns` schema and save it as a single CSV.
#
# Loading is best-effort: a file that cannot be read or processed is reported
# and skipped rather than aborting the whole merge.

# Alternative header names that are mapped onto the standard schema.
# Only applied to files that do not already contain every standard column.
column_aliases = {
    'forum': 'forum_name',
    'id': 'thread_id',
    'post': 'post_content',
    'user': 'author',
    'comments': 'comments_content',
    'author_reply': 'authors_comment',
}

frames = []        # one standardized frame per successfully loaded file
failed_files = []  # files skipped because reading/processing raised

for file in csv_files:
    try:
        df = pd.read_csv(file)

        # Files that already carry the full standard schema are used as-is;
        # otherwise try to align their headers via the alias map.
        if not set(standard_columns).issubset(df.columns):
            df = df.rename(columns=column_aliases)

        # Keep exactly the standard columns, in order. Columns that are still
        # missing are created and filled with "" (existing NaNs are untouched).
        df = df.reindex(columns=standard_columns, fill_value="")
        frames.append(df)

    except Exception as e:
        failed_files.append(file)
        print(f"Failed to read or process {file}: {e}")

if frames:
    # Concatenate once instead of growing a frame inside the loop (quadratic,
    # and concat with an empty seed frame is version-dependent in pandas).
    # The recorded info() output shows every column as object dtype; pin that
    # so downstream cells (e.g. describe()) keep seeing the same schema.
    merged_df = pd.concat(frames, ignore_index=True).astype(object)
else:
    # Nothing could be loaded: fall back to an empty frame with the schema,
    # since pd.concat([]) would raise.
    merged_df = pd.DataFrame(columns=standard_columns)

if failed_files:
    print(f"Skipped {len(failed_files)} of {len(csv_files)} files: {failed_files}")

# Save to final combined file
# NOTE(review): the output name says 2013-2025 while the input file names say
# 2015-2025 — confirm which range is correct before renaming; the path is kept
# unchanged here because other cells may read it.
merged_df.to_csv("data_2013_2025.csv", index=False)
print(f"Merged CSV saved as 'data_2013_2025.csv' with {len(merged_df)} rows")

Merged CSV saved as 'data_2013_2025.csv' with 13794 rows


In [6]:
# Structural overview of the merged frame: row count, per-column non-null
# counts, dtypes and memory usage.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13794 entries, 0 to 13793
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   forum_name        13794 non-null  object
 1   thread_id         13794 non-null  object
 2   post_content      13794 non-null  object
 3   author            13794 non-null  object
 4   post_timestamp    13794 non-null  object
 5   no_of_comments    13794 non-null  object
 6   comments_content  13794 non-null  object
 7   authors_comment   7669 non-null   object
 8   comment_authors   13794 non-null  object
dtypes: object(9)
memory usage: 970.0+ KB


In [7]:
# Summary statistics for the merged frame; left as the bare last expression so
# the notebook renders the resulting table.
# NOTE(review): the recorded output shows the most frequent thread_id / post /
# author each occurring 664 times — looks like duplicated rows in the inputs;
# confirm and consider de-duplicating before analysis.
merged_df.describe()

Unnamed: 0,forum_name,thread_id,post_content,author,post_timestamp,no_of_comments,comments_content,authors_comment,comment_authors
count,13794,13794,13794,13794,13794,13794,13794,7669,13794
unique,4,12782,12781,9518,12758,18,12782,7317,10930
top,anxiety,19693,I’ve been struggling with my life for the past...,Jeriava,10-07-2021 11:56 PM,2,Sophie_M: Wishing you a warm welcome to the fo...,"Geoff and John,Wow, I hope you both realise ho...",Sophie_M ||| jtjt_4862 ||| Summer Rose ||| LJp...
freq,5477,664,664,664,664,3983,664,208,664
