In [5]:
import pandas as pd
import os
import json
from tqdm import tqdm

# 1. CED

In [76]:
# Set paths
base_path = r'E:\social media datasets\CED\Chinese_Rumor_Dataset-master\CED_Dataset'  # Replace with actual base path
original_path = os.path.join(base_path, 'original-microblog')
repost_paths = {
    'rumor': os.path.join(base_path, 'rumor-repost'),
    'nonrumor': os.path.join(base_path, 'non-rumor-repost')
}


In [77]:
def load_json(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_ced_original_posts(original_path, repost_paths):
    original_records = []

    for filename in os.listdir(original_path):
        if not filename.endswith('.json'):
            continue

        microblog_id = filename.replace('.json', '')
        original_file = os.path.join(original_path, filename)

        try:
            data = load_json(original_file)

            user = data.get("user", {})
            if not isinstance(user, dict):
                user = {}

            label = None
            for lbl, path in repost_paths.items():
                if os.path.exists(os.path.join(path, filename)):
                    label = lbl
                    break
            if label is None:
                continue

            original_records.append({
            "id": microblog_id,
            "text": data.get("text", ""),
            "time": data.get("time", None),
            "followers": user.get("followers", None),
            "friends": user.get("friends", None),
            "verified": user.get("verified", False),
            "reposts": data.get("reposts", 0),
            "likes": data.get("likes", 0),
            "label": label,
            })

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    return pd.DataFrame(original_records)

In [78]:
ced_original = load_ced_original_posts(original_path, repost_paths)
ced_original.head()

Unnamed: 0,id,text,time,followers,friends,verified,reposts,likes,label
0,0_yBmepBtUB_2279086572,人间惨剧：今天下午约14点，宁波妇儿医院，一妇女携带一婴儿在住院楼跳楼，后抢救无效死亡。具体...,1347334462,227833.0,907.0,False,225,0,rumor
1,1000_yFDQ0ffqy_2082357197,再去武大，已无牌坊！非要拆掉？@章立凡 @袁裕来律师 @老徐时评 @徐昕 @杨锦麟 @左小祖...,1349768694,59624.0,1923.0,False,395,0,rumor
2,1002_zne7v6NMq_2464670392,"中国最美丽的乡村""江西婺源""一""教师打死学生"" 昨晚，在被誉为中国最美丽的乡村江西省婺源县清...",1363146924,66.0,82.0,False,685,0,rumor
3,1003_AcDAZonRf_1603208240,忍者QS：江苏省东海县女镇党委书记徐艳，因不愿陪县委书记关永健上床，竟然被警察毒打致子宫破裂...,1380971862,8358.0,1474.0,False,120,6,rumor
4,1004_zvUcLCeTm_3186420187,《北大猛男，持刀刺官！！！》“可歌可泣”的是王同学投案自首之后冷冷说了一句话是 “我并不后悔...,1368078370,1096.0,1118.0,False,532,11,rumor


In [23]:
ced_original.label.unique() 


array(['rumor', 'nonrumor'], dtype=object)

In [24]:
ced_original.label.value_counts()

label
nonrumor    1849
rumor       1538
Name: count, dtype: int64

In [60]:
print(ced_original['time'].dtype)

object


In [25]:
def load_ced_repost_posts(repost_paths, original_path):
    repost_records = []

    for filename in os.listdir(original_path):
        if not filename.endswith('.json'):
            continue

        microblog_id = filename.replace('.json', '')
        repost_file = None
        label = None

        for lbl, path in repost_paths.items():
            candidate = os.path.join(path, filename)
            if os.path.exists(candidate):
                repost_file = candidate
                label = lbl
                break

        if repost_file is None:
            continue

        try:
            repost_data = load_json(repost_file)
            for repost in repost_data:
                repost_records.append({
                    "id": microblog_id,
                    "text": repost.get("text", ""),

                    "label": label
                })

        except Exception as e:
            print(f"Error processing repost file {filename}: {e}")

    return pd.DataFrame(repost_records)


In [27]:
ced_reposts = load_ced_repost_posts(repost_paths, original_path)
ced_reposts.head()

Unnamed: 0,id,text,label
0,0_yBmepBtUB_2279086572,,rumor
1,0_yBmepBtUB_2279086572,,rumor
2,0_yBmepBtUB_2279086572,现实残酷加产后抑郁。。。以前听教授讲过，很多产科医生对抑郁症认识不足。,rumor
3,0_yBmepBtUB_2279086572,不实报道和不好报道，不能仅用逻辑判断。未经证实的报道，不转发，不评论，不判断，才是所谓公知们...,rumor
4,0_yBmepBtUB_2279086572,难道这样的悲剧，号称我们的父母官没有责任？,rumor


In [45]:
ced_reposts.label.value_counts()

label
nonrumor    791563
rumor       483617
Name: count, dtype: int64

# 2. FbMultiLingMisinfo

In [86]:
fbmulti = pd.read_csv(r'E:\social media datasets\Fbmultilingual.csv')
fbmulti.head()

Unnamed: 0.1,Unnamed: 0,id,counter,dataset,timestamp_first_tweet,timestamp_last_tweet,date_first_tweet,date_last_tweet,days_betweet_first_and_last_post,duplicate_cluster,...,total_tweets_after_1_hours,total_tweets_after_2_hours,total_tweets_after_5_hours,total_tweets_after_10_hours,total_tweets_after_15_hours,total_tweets_after_24_hours,total_tweets_after_50_hours,total_tweets_after_120_hours,total_tweets_after_480_hours,month_year
0,31032,politifact11709,1,poli,1180900192,1538510995,03/06/2007,02/10/2018,4139,26025,...,1,1,2,2,2,3,3,3,4,2007-03
1,31424,politifact773,1,poli,1185915666,1544929901,31/07/2007,16/12/2018,4156,26391,...,1,1,1,1,1,1,1,1,1,2007-07
2,31310,politifact150,1,poli,1186670987,1544235921,09/08/2007,08/12/2018,4139,26283,...,2,2,2,2,2,2,2,2,2,2007-09
3,31628,politifact86,1,poli,1187924695,1510206606,24/08/2007,09/11/2017,3730,26571,...,1,1,1,1,1,1,1,1,1,2007-08
4,31444,politifact215,1,poli,1190842571,1544804809,26/09/2007,14/12/2018,4097,26409,...,2,2,2,2,2,2,3,4,4,2007-09


In [17]:
fbmulti.tpfc_rating_encoding.unique()

array([0, 1], dtype=int64)

In [19]:
fbmulti.tpfc_rating_encoding.value_counts()

tpfc_rating_encoding
0    17182
1     8269
Name: count, dtype: int64

In [64]:
print(fbmulti['timestamp_first_tweet'].dtype)

int64


# 3. MediaEval15

In [56]:
def load_mediaeval15(folder_path):
    # Load tweet data
    tweet_cols = ['tweetId', 'tweetText', 'userId', 'imageId', 'username', 'timestamp', 'label']
    tweets_dev = pd.read_csv(os.path.join(folder_path, 'tweets_dev.txt'), sep='\t', names=tweet_cols, header=0)
    tweets_test = pd.read_csv(os.path.join(folder_path, 'tweets_test.txt'), sep='\t', names=tweet_cols, header=0)
    tweets = pd.concat([tweets_dev, tweets_test], ignore_index=True)

    # Load user features
    user_cols = ['tweetId', 'num_friends', 'num_followers', 'folfriend_ratio', 'times_listed', 'has_url', 'is_verified', 'num_tweets']
    user_dev = pd.read_csv(os.path.join(folder_path, 'user_features_dev.csv'), skipinitialspace=True)
    user_test = pd.read_csv(os.path.join(folder_path, 'user_features_test.txt'), names=user_cols, header=0)
    user_features = pd.concat([user_dev, user_test], ignore_index=True)

    # Load tweet-level features (e.g. retweets)
    tweet_feat_cols = ['tweetId', 'num_words', 'text_length', 'contains_questmark', 'num_questmark',
                       'contains_exclammark', 'num_exclammark', 'contains_happyemo', 'contains_sademo',
                       'contains_firstorderpron', 'contains_secondorderpron', 'contains_thirdorderpron',
                       'num_uppercasechars', 'num_possentiwords', 'num_negsentiwords', 'num_mentions',
                       'num_hashtags', 'num_URLs', 'num_retweets']
    
    tweet_feats_test = pd.read_csv(os.path.join(folder_path, 'tweet_features_test.txt'), names=tweet_feat_cols, header=0)
    tweet_feats_dev = pd.read_csv(os.path.join(folder_path, 'tweet_features_dev.csv'), skipinitialspace=True)
    tweet_features = pd.concat([tweet_feats_test, tweet_feats_dev], ignore_index=True)

    # Merge all
    df = tweets.merge(user_features, on='tweetId', how='left')
    df = df.merge(tweet_features[['tweetId', 'num_retweets']], on='tweetId', how='left')

    # Keep only required columns
    final_df = df[['tweetId', 'tweetText', 'timestamp', 'label', 'username',
                   'num_followers', 'num_friends', 'is_verified', 'num_retweets']]

    return final_df

In [57]:
dataset_folder = r'D:\text+image\text+image\mediaeval2015'
mediaeval15 = load_mediaeval15(dataset_folder)


In [58]:
mediaeval15.tail()

Unnamed: 0,tweetId,tweetText,timestamp,label,username,num_followers,num_friends,is_verified,num_retweets
18027,578433150071775232,Un présentateur de la ZDF confesse avoir truqu...,Thu Mar 19 05:49:44 +0000 2015,fake,Cdt_Sylvestre,774.0,498.0,False,0.0
18028,578433646597656576,"Oh les kleine menteurs ""@CorineBarella: Un pré...",Thu Mar 19 05:51:42 +0000 2015,fake,damomarc,1239.0,1994.0,False,0.0
18029,578486910491996160,Este es el programa de ZDF en el que confirman...,Thu Mar 19 09:23:21 +0000 2015,fake,javierpascual,4545.0,975.0,False,0.0
18030,578505023912591360,11.34 - wir haben FAST Mittag ▶ Riesen Verwirr...,Thu Mar 19 10:35:20 +0000 2015,fake,aotto1968_2,4896.0,3498.0,False,0.0
18031,578305144380612609,"Sorry, @yanisvaroufakis! https://t.co/BSkYrbII...",Wed Mar 18 21:21:05 +0000 2015,fake,janboehm,188561.0,286.0,True,0.0


In [62]:
mediaeval15.columns

Index(['tweetId', 'tweetText', 'timestamp', 'label', 'username',
       'num_followers', 'num_friends', 'is_verified', 'num_retweets'],
      dtype='object')

In [41]:
mediaeval15.label.unique()

array(['fake', 'humor', 'real'], dtype=object)

In [42]:
mediaeval15.label.value_counts()

label
fake     9288
real     6130
humor    2614
Name: count, dtype: int64

# 4. PHEME-5

In [68]:
from glob import glob

In [69]:
def load_pheme5_original(main_folder):
    data = []

    # Traverse event folders
    for event_name in os.listdir(main_folder):
        event_path = os.path.join(main_folder, event_name)
        if not os.path.isdir(event_path):
            continue

        for label_type in ['rumours', 'non-rumours']:
            label_path = os.path.join(event_path, label_type)
            if not os.path.isdir(label_path):
                continue

            for tweet_folder in os.listdir(label_path):
                tweet_dir = os.path.join(label_path, tweet_folder, 'source-tweet')
                if not os.path.exists(tweet_dir):
                    continue

                # Load JSON file in the source-tweet folder
                for json_file in glob(os.path.join(tweet_dir, '*.json')):
                    try:
                        with open(json_file, 'r', encoding='utf-8') as f:
                            tweet = json.load(f)

                        user = tweet.get('user', {})
                        data.append({
                            'id': tweet.get('id'),
                            'text': tweet.get('text'),
                            'created_at': tweet.get('created_at'),
                            'label': label_type,
                            'followers_count': user.get('followers_count'),
                            'friends_count': user.get('friends_count'),
                            'verified': user.get('verified'),
                            'retweet_count': tweet.get('retweet_count'),
                            'favorite_count': tweet.get('favorite_count')
                        })
                    except Exception as e:
                        print(f"Error reading {json_file}: {e}")

    df = pd.DataFrame(data)
    return df

In [70]:
pheme_df = load_pheme5_original(r'D:\text datasets\text datasets\phemernrdataset\pheme-rnr-dataset')
pheme_df.head()

Unnamed: 0,id,text,created_at,label,followers_count,friends_count,verified,retweet_count,favorite_count
0,552783238415265792,"Breaking: At least 10 dead, 5 injured after tO...",Wed Jan 07 11:06:08 +0000 2015,rumours,1628,246,False,159,14
1,552783667052167168,France: 10 people dead after shooting at HQ of...,Wed Jan 07 11:07:51 +0000 2015,rumours,129573,337,True,486,38
2,552783745565347840,Ten killed in shooting at headquarters of Fren...,Wed Jan 07 11:08:09 +0000 2015,rumours,529882,3051,True,127,15
3,552784168849907712,BREAKING: 10 dead in shooting at headquarters ...,Wed Jan 07 11:09:50 +0000 2015,rumours,499741,31,True,105,15
4,552784526955806720,Reuters: 10 people shot dead at headquarters o...,Wed Jan 07 11:11:16 +0000 2015,rumours,1377384,6,True,412,32


In [31]:
pheme_df.label.unique()

array(['rumours', 'non-rumours'], dtype=object)

In [32]:
pheme_df.label.value_counts()

label
non-rumours    3830
rumours        1972
Name: count, dtype: int64

In [33]:
def load_pheme_reposts(main_folder):
    repost_data = []

    for event_name in os.listdir(main_folder):
        event_path = os.path.join(main_folder, event_name)
        if not os.path.isdir(event_path):
            continue

        for label_type in ['rumours', 'non-rumours']:
            label_path = os.path.join(event_path, label_type)
            if not os.path.isdir(label_path):
                continue

            for tweet_folder in os.listdir(label_path):
                reactions_dir = os.path.join(label_path, tweet_folder, 'reactions')
                if not os.path.exists(reactions_dir):
                    continue

                for json_file in glob(os.path.join(reactions_dir, '*.json')):
                    try:
                        with open(json_file, 'r', encoding='utf-8') as f:
                            tweet = json.load(f)

                        original_id = tweet.get('in_reply_to_status_id')
                        if original_id is None:
                            continue  # skip if it's not a reply

                        repost_data.append({
                            'id': str(original_id),          # original tweet ID
                            'text': tweet.get('text', ''),
                            'label': label_type
                        })
                    except Exception as e:
                        print(f"Error reading {json_file}: {e}")

    return pd.DataFrame(repost_data)

In [34]:
pheme_repost = load_pheme_reposts(r'E:\social media datasets\pheme5\pheme-rnr-dataset')
pheme_repost.head()

Unnamed: 0,id,text,label
0,552783238415265792,@H_E_Samuel @George_Berridge @michael_taggart ...,rumours
1,552783238415265792,@H_E_Samuel Hi Henry would you be willing to g...,rumours
2,552783238415265792,@H_E_Samuel @H_E_Samuel please call them terro...,rumours
3,552783238415265792,@H_E_Samuel French govt needs to take strict a...,rumours
4,552787794503143424,@EdwardBowden @H_E_Samuel @George_Berridge @mi...,rumours


# 5. PHEME-9

In [73]:
def load_pheme9(pheme_root):

    data = []
    threads_root = os.path.join(pheme_root, "threads")

    for lang in os.listdir(threads_root):  # 'en', 'de'
        lang_dir = os.path.join(threads_root, lang)
        if not os.path.isdir(lang_dir):
            continue

        for event in os.listdir(lang_dir):
            event_path = os.path.join(lang_dir, event)
            if not os.path.isdir(event_path):
                continue

            for tweet_folder in os.listdir(event_path):
                tweet_path = os.path.join(event_path, tweet_folder)
                if not os.path.isdir(tweet_path):
                    continue

                try:
                    source_tweet_dir = os.path.join(tweet_path, "source-tweets")
                    annotation_file = os.path.join(tweet_path, "annotation.json")

                    tweet_files = os.listdir(source_tweet_dir)
                    if not tweet_files:
                        continue

                    tweet_file = os.path.join(source_tweet_dir, tweet_files[0])

                    with open(tweet_file, 'r', encoding='utf-8') as f:
                        tweet = json.load(f)

                    with open(annotation_file, 'r', encoding='utf-8') as f:
                        annotation = json.load(f)

                    data.append({
                        "id": tweet.get("id_str", tweet.get("id")),
                        "text": tweet.get("text"),
                        "created_at": tweet.get("created_at"),
                        "label": annotation.get("is_rumour", "unknown"),
                        "followers_count": tweet.get("user", {}).get("followers_count"),
                        "friends_count": tweet.get("user", {}).get("friends_count"),
                        "verified": tweet.get("user", {}).get("verified"),
                        "retweet_count": tweet.get("retweet_count"),
                        "favorite_count": tweet.get("favorite_count"),
                        "language": lang
                    })

                except Exception as e:
                    print(f"Error in {tweet_path}: {e}")

    return pd.DataFrame(data)

In [74]:
pheme9_df = load_pheme9(r'D:\text+image\text+image\pheme9\pheme-rumour-scheme-dataset')

pheme9_df.head()

Unnamed: 0,id,text,created_at,label,followers_count,friends_count,verified,retweet_count,favorite_count,language
0,580319406301020160,Flugzeug des Typs A320 ist laut Medienberichte...,Tue Mar 24 10:45:02 +0000 2015,rumour,600503,240,True,87,16,de
1,580321495987146752,Unglück: Germanwings-Airbus stürzt in Südfrank...,Tue Mar 24 10:53:21 +0000 2015,rumour,434364,202,True,234,72,de
2,580321586391175168,+++EIL+++ In Frankreich ist eine Germanwings-M...,Tue Mar 24 10:53:42 +0000 2015,rumour,102662,4174,True,26,5,de
3,580323290486611968,Nach dem Absturz der #Germanwings-Maschine kön...,Tue Mar 24 11:00:28 +0000 2015,rumour,670061,263,True,54,15,de
4,580323743202934785,FULL STORY Germanwings-operated Airbus A320 fl...,Tue Mar 24 11:02:16 +0000 2015,rumour,13605,533,True,83,12,de


In [37]:
pheme9_df.label.unique()

array(['rumour'], dtype=object)

In [38]:
pheme9_df.label.value_counts()

label
rumour    330
Name: count, dtype: int64

In [39]:
def load_pheme9_reposts(pheme_root):
    data = []
    threads_root = os.path.join(pheme_root, "threads")

    for lang in os.listdir(threads_root):  # 'en', 'de', etc.
        lang_dir = os.path.join(threads_root, lang)
        if not os.path.isdir(lang_dir):
            continue

        for event in os.listdir(lang_dir):
            event_path = os.path.join(lang_dir, event)
            if not os.path.isdir(event_path):
                continue

            for tweet_folder in os.listdir(event_path):
                tweet_path = os.path.join(event_path, tweet_folder)
                if not os.path.isdir(tweet_path):
                    continue

                reactions_dir = os.path.join(tweet_path, "reactions")
                annotation_file = os.path.join(tweet_path, "annotation.json")

                try:
                    with open(annotation_file, 'r', encoding='utf-8') as f:
                        annotation = json.load(f)
                    label = annotation.get("is_rumour", "unknown")
                except Exception as e:
                    print(f"Error reading annotation in {tweet_path}: {e}")
                    label = "unknown"

                if not os.path.isdir(reactions_dir):
                    continue

                for file in os.listdir(reactions_dir):
                    if not file.endswith('.json'):
                        continue

                    file_path = os.path.join(reactions_dir, file)
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            tweet = json.load(f)

                        parent_id = tweet.get("in_reply_to_status_id")
                        if parent_id is None:
                            continue

                        data.append({
                            "id": str(parent_id),  # original tweet ID
                            "text": tweet.get("text", ""),
                            "label": label,
                            "language": lang
                        })

                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")

    return pd.DataFrame(data)


In [40]:
pheme9_repost = load_pheme9_reposts(r'E:\social media datasets\pheme9\pheme-rumour-scheme-dataset')

pheme9_repost.head()

Unnamed: 0,id,text,label,language
0,580319406301020160,@tagesschau Mein Gott wie grausam!!! Ich hoffe...,rumour,de
1,580319406301020160,Unterschiede:\n\n@tagesschau und @SZ: Offenbar...,rumour,de
2,580319406301020160,@tagesschau ich bin geschockt und kann es nich...,rumour,de
3,580319406301020160,@tagesschau schrecklich ich wünsche allen Ang...,rumour,de
4,580321495987146752,An Bord der abgestürzten Maschine befanden sic...,rumour,de


# 6. PHEME-Veracity

In [77]:
def load_phemeveracity(main_folder):
    data = []

    # Traverse event folders
    for event_name in os.listdir(main_folder):
        event_path = os.path.join(main_folder, event_name)
        if not os.path.isdir(event_path):
            continue

        for label_type in ['rumours', 'non-rumours']:
            label_path = os.path.join(event_path, label_type)
            if not os.path.isdir(label_path):
                continue

            for tweet_folder in os.listdir(label_path):
                tweet_dir = os.path.join(label_path, tweet_folder, 'source-tweets')
                if not os.path.exists(tweet_dir):
                    continue

                # Load JSON file in the source-tweet folder
                for json_file in glob(os.path.join(tweet_dir, '*.json')):
                    try:
                        with open(json_file, 'r', encoding='utf-8') as f:
                            tweet = json.load(f)

                        user = tweet.get('user', {})
                        data.append({
                            'id': tweet.get('id'),
                            'text': tweet.get('text'),
                            'created_at': tweet.get('created_at'),
                            'label': label_type,
                            'followers_count': user.get('followers_count'),
                            'friends_count': user.get('friends_count'),
                            'verified': user.get('verified'),
                            'retweet_count': tweet.get('retweet_count'),
                            'favorite_count': tweet.get('favorite_count')
                        })
                    except Exception as e:
                        print(f"Error reading {json_file}: {e}")

    df = pd.DataFrame(data)
    return df

In [78]:
pheme_ver_df = load_phemeveracity(r'D:\text datasets\text datasets\PHEME_veracity\all-rnr-annotated-threads')
pheme_ver_df.head()

Unnamed: 0,id,text,created_at,label,followers_count,friends_count,verified,retweet_count,favorite_count
0,552783238415265792,"Breaking: At least 10 dead, 5 injured after tO...",Wed Jan 07 11:06:08 +0000 2015,rumours,1628,246,False,159,14
1,552783667052167168,France: 10 people dead after shooting at HQ of...,Wed Jan 07 11:07:51 +0000 2015,rumours,129573,337,True,486,38
2,552783745565347840,Ten killed in shooting at headquarters of Fren...,Wed Jan 07 11:08:09 +0000 2015,rumours,529882,3051,True,127,15
3,552784168849907712,BREAKING: 10 dead in shooting at headquarters ...,Wed Jan 07 11:09:50 +0000 2015,rumours,499741,31,True,105,15
4,552784526955806720,Reuters: 10 people shot dead at headquarters o...,Wed Jan 07 11:11:16 +0000 2015,rumours,1377384,6,True,412,32


In [43]:
pheme_ver_df.label.unique()

array(['rumours', 'non-rumours'], dtype=object)

In [44]:
pheme_ver_df.label.value_counts()

label
non-rumours    4023
rumours        2402
Name: count, dtype: int64

In [97]:
def load_phemeveracity_reposts(main_folder):
    data = []

    # Traverse event folders
    for event_name in os.listdir(main_folder):
        event_path = os.path.join(main_folder, event_name)
        if not os.path.isdir(event_path):
            continue

        for label_type in ['rumours', 'non-rumours']:
            label_path = os.path.join(event_path, label_type)
            if not os.path.isdir(label_path):
                continue

            for tweet_folder in os.listdir(label_path):
                tweet_dir = os.path.join(label_path, tweet_folder)
                reactions_dir = os.path.join(tweet_dir, 'reactions')

                if not os.path.exists(reactions_dir):
                    continue

                for json_file in glob(os.path.join(reactions_dir, '*.json')):
                    try:
                        with open(json_file, 'r', encoding='utf-8') as f:
                            tweet = json.load(f)

                        parent_id = tweet.get("in_reply_to_status_id")
                        if parent_id is None:
                            continue

                        data.append({
                            'id': str(parent_id),  # original tweet ID
                            'text': tweet.get('text', ''),
                            'label': label_type
                        })

                    except Exception as e:
                        print(f"Error reading {json_file}: {e}")

    return pd.DataFrame(data)

In [98]:
pheme_ver_repost = load_phemeveracity_reposts(r'E:\social media datasets\pheme_veracity\all-rnr-annotated-threads')
pheme_ver_repost.head()

Unnamed: 0,id,text,label
0,552783238415265792,@H_E_Samuel @George_Berridge @michael_taggart ...,rumours
1,552783238415265792,@H_E_Samuel Hi Henry would you be willing to g...,rumours
2,552783238415265792,@H_E_Samuel @H_E_Samuel please call them terro...,rumours
3,552783238415265792,@H_E_Samuel French govt needs to take strict a...,rumours
4,552787794503143424,@EdwardBowden @H_E_Samuel @George_Berridge @mi...,rumours


In [99]:
pheme_ver_repost.label.value_counts()

label
non-rumours    67462
rumours        30450
Name: count, dtype: int64

# 7. RumorEval17


In [79]:
def load_rumoureval17_dataset(root_path):
    data = []

    # Load label mappings from train and dev
    label_path = os.path.join(root_path, "traindev")
    with open(os.path.join(label_path, "rumoureval-subtaskB-train.json"), "r", encoding="utf-8") as f:
        train_labels = json.load(f)
    with open(os.path.join(label_path, "rumoureval-subtaskB-dev.json"), "r", encoding="utf-8") as f:
        dev_labels = json.load(f)

    all_labels = {**train_labels, **dev_labels}  # merge dictionaries

    threads_root = os.path.join(root_path, "rumoureval-data")

    for event in os.listdir(threads_root):
        event_path = os.path.join(threads_root, event)
        if not os.path.isdir(event_path):
            continue

        for tweet_folder in os.listdir(event_path):
            tweet_path = os.path.join(event_path, tweet_folder)
            if not os.path.isdir(tweet_path):
                continue

            try:
                source_tweet_dir = os.path.join(tweet_path, "source-tweet")
                tweet_files = os.listdir(source_tweet_dir)
                if not tweet_files:
                    continue

                tweet_file = os.path.join(source_tweet_dir, tweet_files[0])

                with open(tweet_file, 'r', encoding='utf-8') as f:
                    tweet = json.load(f)

                tweet_id = tweet.get("id_str", tweet.get("id"))
                label = all_labels.get(tweet_id, "unknown")

                data.append({
                    "id": tweet_id,
                    "text": tweet.get("text"),
                    "created_at": tweet.get("created_at"),
                    "label": label,
                    "name": tweet.get("user", {}).get("name"),
                    "user_followers_count": tweet.get("user", {}).get("followers_count"),
                    "user_friends_count": tweet.get("user", {}).get("friends_count"),
                    "user_verified": tweet.get("user", {}).get("verified"),
                    "retweet_count": tweet.get("retweet_count"),
                    "favorite_count": tweet.get("favorite_count")
                })

            except Exception as e:
                print(f"Error in {tweet_path}: {e}")

    return pd.DataFrame(data)

In [80]:
rumoreval17_df = load_rumoureval17_dataset(r"D:\text datasets\text datasets\RumorEval17\semeval2017-task8-dataset")
rumoreval17_df.head()

Unnamed: 0,id,text,created_at,label,name,user_followers_count,user_friends_count,user_verified,retweet_count,favorite_count
0,552783667052167168,France: 10 people dead after shooting at HQ of...,Wed Jan 07 11:07:51 +0000 2015,True,euronews,129573,337,True,486,38
1,552785375161499649,BREAKING: 10 reportedly shot dead at Paris HQ ...,Wed Jan 07 11:14:38 +0000 2015,True,The Independent,972167,1763,True,128,5
2,552791196247269378,BREAKING: At least 10 killed in shooting at Fr...,Wed Jan 07 11:37:46 +0000 2015,True,CNN International,3029912,389,True,295,78
3,552791578893619200,Eleven dead in shooting at Paris offices of sa...,Wed Jan 07 11:39:17 +0000 2015,True,The Guardian,3091451,1083,True,338,28
4,552792544132997121,BREAKING Charlie Hebdo latest: 11 dead 10 woun...,Wed Jan 07 11:43:07 +0000 2015,True,The Independent,973212,1763,True,203,32


In [15]:
rumoreval17_df.label.unique()

array(['true', 'false', 'unverified'], dtype=object)

In [16]:
rumoreval17_df.label.value_counts()

label
true          137
unverified     98
false          62
Name: count, dtype: int64

In [21]:
def load_rumoureval17_reposts(root_path):
    data = []

    # Load label mappings
    label_path = os.path.join(root_path, "traindev")
    with open(os.path.join(label_path, "rumoureval-subtaskB-train.json"), "r", encoding="utf-8") as f:
        train_labels = json.load(f)
    with open(os.path.join(label_path, "rumoureval-subtaskB-dev.json"), "r", encoding="utf-8") as f:
        dev_labels = json.load(f)

    all_labels = {**train_labels, **dev_labels}

    threads_root = os.path.join(root_path, "rumoureval-data")

    for event in os.listdir(threads_root):
        event_path = os.path.join(threads_root, event)
        if not os.path.isdir(event_path):
            continue

        for tweet_folder in os.listdir(event_path):
            tweet_path = os.path.join(event_path, tweet_folder)
            if not os.path.isdir(tweet_path):
                continue

            reactions_dir = os.path.join(tweet_path, "replies")
            if not os.path.isdir(reactions_dir):
                continue

            for file in os.listdir(reactions_dir):
                if not file.endswith('.json'):
                    continue

                file_path = os.path.join(reactions_dir, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        tweet = json.load(f)

                    parent_id = tweet.get("in_reply_to_status_id")
                    if parent_id is None:
                        continue

                    label = all_labels.get(str(parent_id), "unknown")

                    data.append({
                        "id": str(parent_id),
                        "text": tweet.get("text", ""),
                        "label": label
                    })

                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

    return pd.DataFrame(data)

In [22]:
rumoreval19_reposts = load_rumoureval17_reposts(r"D:\text datasets\text datasets\RumorEval17\semeval2017-task8-dataset")
rumoreval19_reposts.head()

Unnamed: 0,id,text,label
0,552783667052167168,MT @euronews France: 10 dead after shooting at...,true
1,552785374507175936,@j0nathandavis They who? Stupid and partial op...,unknown
2,552786226546495488,"@nanoSpawn Socialists, Antisemites, anti zioni...",unknown
3,552783667052167168,@euronews @TradeDesk_Steve A French crime of p...,true
4,552783667052167168,"@euronews LOL. 5 million Muslims in France, wh...",true


In [23]:
rumoreval19_reposts.label.value_counts()

label
unknown       1620
true          1173
unverified     981
false          448
Name: count, dtype: int64

# 8. RumorEval19

In [81]:
def load_rumoureval2019_dataset(root_path):
    data = []

    # Load label mappings
    with open(os.path.join(root_path, "train-key.json"), "r", encoding="utf-8") as f:
        train_labels = json.load(f).get("subtaskbenglish", {})

    with open(os.path.join(root_path, "dev-key.json"), "r", encoding="utf-8") as f:
        dev_labels = json.load(f).get("subtaskbenglish", {})

    all_labels = {**train_labels, **dev_labels}

    for dataset_folder in ["reddit-dev-data", "reddit-training-data", "twitter-english"]:
        dataset_path = os.path.join(root_path, dataset_folder)
        if not os.path.isdir(dataset_path):
            continue

        for event in os.listdir(dataset_path):
            event_path = os.path.join(dataset_path, event)
            if not os.path.isdir(event_path):
                continue

            for thread_folder in os.listdir(event_path):
                thread_path = os.path.join(event_path, thread_folder)
                if not os.path.isdir(thread_path):
                    continue

                # Only look for the source tweet (not replies or nested paths)
                source_tweet_path = os.path.join(thread_path, "source-tweet")
                if not os.path.isdir(source_tweet_path):
                    continue

                try:
                    tweet_files = os.listdir(source_tweet_path)
                    if not tweet_files:
                        continue

                    # There should only be one file in source-tweet folder
                    tweet_file = os.path.join(source_tweet_path, tweet_files[0])
                    with open(tweet_file, "r", encoding="utf-8") as f:
                        tweet = json.load(f)

                    tweet_id = tweet.get("id_str", tweet.get("id"))
                    label = all_labels.get(tweet_id, "unknown")

                    data.append({
                        "post_id": tweet_id,
                        "post_text": tweet.get("text"),
                        "timestamp": tweet.get("created_at"),
                        "label": label,
                        "username": tweet.get("user", {}).get("name"),
                        "num_followers": tweet.get("user", {}).get("followers_count"),
                        "num_friends": tweet.get("user", {}).get("friends_count"),
                        "is_verified": tweet.get("user", {}).get("verified"),
                        "num_retweets": tweet.get("retweet_count")
                    })

                except Exception as e:
                    print(f"Error reading {source_tweet_path}: {e}")

    return pd.DataFrame(data)

In [25]:
rumoreval19_df = load_rumoureval2019_dataset(r"D:\text datasets\text datasets\rumoureval2019\rumoureval2019\rumoureval-2019-training-data\rumoureval-2019-training-data")
rumoreval19_df.head()

Unnamed: 0,post_id,post_text,timestamp,label,username,num_followers,num_friends,is_verified,num_retweets
0,552783667052167168,France: 10 people dead after shooting at HQ of...,Wed Jan 07 11:07:51 +0000 2015,True,euronews,129573,337,True,486
1,552785375161499649,BREAKING: 10 reportedly shot dead at Paris HQ ...,Wed Jan 07 11:14:38 +0000 2015,True,The Independent,972167,1763,True,128
2,552788945017516032,Appalled by the attack on Charlie Hebdo in Par...,Wed Jan 07 11:28:49 +0000 2015,True,Tom Newton Dunn,19332,899,False,166
3,552791196247269378,BREAKING: At least 10 killed in shooting at Fr...,Wed Jan 07 11:37:46 +0000 2015,True,CNN International,3029912,389,True,295
4,552791578893619200,Eleven dead in shooting at Paris offices of sa...,Wed Jan 07 11:39:17 +0000 2015,True,The Guardian,3091451,1083,True,338


In [26]:
rumoreval19_df.label.unique()

array(['true', 'false', 'unverified'], dtype=object)

In [27]:
rumoreval19_df.label.value_counts()

label
true          145
unverified    106
false          74
Name: count, dtype: int64

In [28]:
def load_rumoureval2019_reposts(root_path):
    data = []

    # Load labels from train and dev keys
    with open(os.path.join(root_path, "train-key.json"), "r", encoding="utf-8") as f:
        train_labels = json.load(f).get("subtaskbenglish", {})

    with open(os.path.join(root_path, "dev-key.json"), "r", encoding="utf-8") as f:
        dev_labels = json.load(f).get("subtaskbenglish", {})

    all_labels = {**train_labels, **dev_labels}

    for dataset_folder in ["reddit-dev-data", "reddit-training-data", "twitter-english"]:
        dataset_path = os.path.join(root_path, dataset_folder)
        if not os.path.isdir(dataset_path):
            continue

        for event in os.listdir(dataset_path):
            event_path = os.path.join(dataset_path, event)
            if not os.path.isdir(event_path):
                continue

            for thread_folder in os.listdir(event_path):
                thread_path = os.path.join(event_path, thread_folder)
                if not os.path.isdir(thread_path):
                    continue

                replies_path = os.path.join(thread_path, "replies")
                if not os.path.isdir(replies_path):
                    continue

                for reply_file in os.listdir(replies_path):
                    if not reply_file.endswith('.json'):
                        continue

                    reply_path = os.path.join(replies_path, reply_file)
                    try:
                        with open(reply_path, "r", encoding="utf-8") as f:
                            tweet = json.load(f)

                        parent_id = tweet.get("in_reply_to_status_id_str") or tweet.get("in_reply_to_status_id")
                        if parent_id is None:
                            continue

                        label = all_labels.get(str(parent_id), "unknown")

                        data.append({
                            "id": str(parent_id),
                            "text": tweet.get("text", ""),
                            "label": label
                        })

                    except Exception as e:
                        print(f"Error reading {reply_path}: {e}")

    return pd.DataFrame(data)

In [30]:
rumoreval19_repost_df = load_rumoureval2019_reposts(r"D:\text datasets\text datasets\rumoureval2019\rumoureval2019\rumoureval-2019-training-data\rumoureval-2019-training-data")
rumoreval19_repost_df.head()

Unnamed: 0,id,text,label
0,552783667052167168,MT @euronews France: 10 dead after shooting at...,true
1,552785374507175936,@j0nathandavis They who? Stupid and partial op...,unknown
2,552786226546495488,"@nanoSpawn Socialists, Antisemites, anti zioni...",unknown
3,552783667052167168,@euronews @TradeDesk_Steve A French crime of p...,true
4,552783667052167168,"@euronews LOL. 5 million Muslims in France, wh...",true


In [31]:
rumoreval19_repost_df.label.unique()

array(['true', 'unknown', 'false', 'unverified'], dtype=object)

# 9- Social-Honeypot

In [40]:
def load_tweet_dataset(main_folder):
    # File mappings: filename -> label
    file_map = {
        'content_polluters_tweets.txt': 'polluter',
        'legitimate_users_tweets.txt': 'legitimate',
        'content_polluters.txt': 'polluter',
        'legitimate_users.txt': 'legitimate'
    }

    # Read user profiles (no headers)
    user_profiles = []
    for profile_file in ['content_polluters.txt', 'legitimate_users.txt']:
        path = os.path.join(main_folder, profile_file)
        df = pd.read_csv(path, sep='\t', header=None,
                         names=['UserID', 'ProfileCreatedAt', 'ProfileCollectedAt',
                                'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets',
                                'ScreenNameLength', 'DescriptionLength'])
        df['Label'] = file_map[profile_file]
        user_profiles.append(df)
    users_df = pd.concat(user_profiles, ignore_index=True)

    # Read tweets (no headers)
    tweet_data = []
    for tweet_file in ['content_polluters_tweets.txt', 'legitimate_users_tweets.txt']:
        path = os.path.join(main_folder, tweet_file)
        df = pd.read_csv(path, sep='\t', header=None,
                         names=['UserID', 'TweetID', 'TweetText', 'CreatedAt'])
        df['Label'] = file_map[tweet_file]
        tweet_data.append(df)
    tweets_df = pd.concat(tweet_data, ignore_index=True)

    # Merge tweets with user profiles on UserID and Label
    merged_df = tweets_df.merge(users_df, on=['UserID', 'Label'])

    # Select final columns
    final_df = merged_df[['TweetID', 'TweetText', 'CreatedAt', 'NumberOfFollowers', 'NumberOfFollowings', 'Label']]

    return final_df

In [41]:
main_folder = r"D:\text datasets\text datasets\Social-Honeypot\social_honeypot_icwsm_2011"
socialhoneypot = load_tweet_dataset(main_folder)
socialhoneypot.head()

Unnamed: 0,TweetID,TweetText,CreatedAt,NumberOfFollowers,NumberOfFollowings,Label
0,5599519501,MELBOURNE ENQUIRY: Seeking a variety of acts f...,2009-11-10 15:14:31,3071,3269,polluter
1,5600313663,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:46:05,3071,3269,polluter
2,5600328557,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:46:40,3071,3269,polluter
3,5600338093,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:47:03,3071,3269,polluter
4,5600564863,"Come to ""The Burlesque Bootcamp - Sydney"" Satu...",2009-11-10 15:56:03,3071,3269,polluter


In [43]:
socialhoneypot.Label.value_counts()

Label
legitimate    3246377
polluter      2333691
Name: count, dtype: int64

# 10. Twitter

In [6]:
def load_twitter(folder_path):
    # Define columns for each file
    post_cols_dev = ['post_id', 'post_text', 'user_id', 'image_id', 'username', 'timestamp', 'label']
    post_cols_test = ['post_id', 'post_text', 'user_id', 'username', 'image_id', 'timestamp']

    user_cols = ['post_id', 'num_friends', 'num_followers', 'folfriend_ratio', 'times_listed', 'has_url', 'is_verified', 'num_posts']
    post_feat_cols = ['post_id', 'num_words', 'text_length', 'contains_questmark', 'num_questmark',
                      'contains_exclammark', 'num_exclammark', 'contains_happyemo', 'contains_sademo',
                      'contains_firstorderpron', 'contains_secondorderpron', 'contains_thirdorderpron',
                      'num_uppercasechars', 'num_possentiwords', 'num_negsentiwords', 'num_mentions',
                      'num_hashtags', 'num_URLs', 'num_retweets']

    # Load dev set
    dev_path = os.path.join(folder_path, 'devset')
    posts_dev = pd.read_csv(os.path.join(dev_path, 'posts.txt'), sep='\t', names=post_cols_dev, header=0)
    users_dev = pd.read_csv(os.path.join(dev_path, 'user_features.txt'), sep=',', names=user_cols, header=0)
    feats_dev = pd.read_csv(os.path.join(dev_path, 'post_features.txt'), sep=',', names=post_feat_cols, header=0)

    # Merge dev
    dev = posts_dev.merge(users_dev, on='post_id', how='left')
    dev = dev.merge(feats_dev[['post_id', 'num_retweets']], on='post_id', how='left')

    # Load test set
    test_path = os.path.join(folder_path, 'testset')
    posts_test = pd.read_csv(os.path.join(test_path, 'posts.txt'), sep='\t', names=post_cols_test, header=0)
    posts_test["label"] = None  # No label in test set

    users_test = pd.read_csv(os.path.join(test_path, 'user_features.txt'), sep=',', names=user_cols, header=0)
    feats_test = pd.read_csv(os.path.join(test_path, 'post_features.txt'), sep=',', names=post_feat_cols, header=0)

    # Merge test
    test = posts_test.merge(users_test, on='post_id', how='left')
    test = test.merge(feats_test[['post_id', 'num_retweets']], on='post_id', how='left')

    # Concatenate dev and test
    df = pd.concat([dev, test], ignore_index=True)

    # Select only needed columns
    final_df = df[['post_id', 'post_text', 'timestamp', 'label', 'username',
                   'num_followers', 'num_friends', 'is_verified', 'num_retweets']]

    return final_df

In [7]:
twitter_df = load_twitter(r'D:\text+image\text+image\twitter')
twitter_df.head()

Unnamed: 0,post_id,post_text,timestamp,label,username,num_followers,num_friends,is_verified,num_retweets
0,324597532548276224,Don't need feds to solve the #bostonbombing wh...,Wed Apr 17 18:57:37 +0000 2013,fake,SantaCruzShred,634,1445,False,0.0
1,325145334739267584,PIC: Comparison of #Boston suspect Sunil Tripa...,Fri Apr 19 07:14:23 +0000 2013,fake,Oscar_Wang,271,565,False,0.0
2,325152091423248385,I'm not completely convinced that it's this Su...,Fri Apr 19 07:41:14 +0000 2013,fake,jamwil,649,576,False,0.0
3,324554646976868352,Brutal lo que se puede conseguir en colaboraci...,Wed Apr 17 16:07:12 +0000 2013,fake,rubenson80,297,546,False,0.0
4,324315545572896768,4chan and the bombing. just throwing it out th...,Wed Apr 17 00:17:06 +0000 2013,fake,Slimlenny,60,61,False,0.0


In [8]:
twitter_df.label.value_counts()

label
fake    9404
real    6226
Name: count, dtype: int64

In [9]:
twitter_df.label.unique()

array(['fake', 'real', None], dtype=object)

# 11. Weibo

In [5]:
def load_weibo_dataset(folder_path):
    # Files that include rumor/nonrumor data
    label_files = {
        'train_rumor.txt': 'rumor',
        'train_nonrumor.txt': 'nonrumor',
        'test_rumor.txt': 'rumor',
        'test_nonrumor.txt': 'nonrumor'
    }

    all_data = []

    for file_name, label in label_files.items():
        file_path = os.path.join(folder_path, file_name)

        if not os.path.exists(file_path):
            print(f"Warning: {file_name} not found in {folder_path}")
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for i in range(0, len(lines), 3):
            if i + 2 >= len(lines):
                continue  # Skip incomplete entries

            meta = lines[i].strip().split('|')
            tweet_text = lines[i + 2].strip()

            if len(meta) < 15:  # Basic sanity check
                continue

            all_data.append({
                'tweet_id': meta[0],
                'user_name': meta[1],
                'publish_time': meta[4],
                'user_auth_type': meta[10],
                'user_fans_count': meta[11],
                'user_follow_count': meta[12],
                'retweet_count': meta[6],
                'praise_count': meta[8],
                'tweet_content': tweet_text,
                'label': label
            })

    # Create base DataFrame
    df = pd.DataFrame(all_data)

    # Convert appropriate columns to numeric
    numeric_cols = ['user_fans_count', 'user_follow_count', 'retweet_count', 'praise_count']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    return df

In [6]:
weibo_df = load_weibo_dataset(r'D:\text+image\text+image\Weibo-dataset-main\Weibo-dataset-main')
weibo_df.head()

Unnamed: 0,tweet_id,user_name,publish_time,user_auth_type,user_fans_count,user_follow_count,retweet_count,praise_count,tweet_content,label
0,3511947309647762,地球超级爆料,2012-11-13 16:55,0,5047.0,1770.0,79,0,震惊，转发求证：【想都不敢想 ，在美国一桶金龙鱼食用油只要8元人民币】 一桶食用油相当于中国...,rumor
1,3576100079039606,Noodles_Liu,2013-05-09 17:36,0,9049.0,490.0,1,0,【法院无底线】湖南长沙一位小朋友上学路上捡到3万元，原地不动等失主，结果被人冒领。不知情的孩...,rumor
2,3899073935617462,淡然一夏02,2015-10-17 23:18,0,8.0,82.0,0,0,"转发:我校需要小孩的衣服,新旧不限!西藏阿里地区是世界海拔最高的地区请问周围有没有四到十岁孩...",rumor
3,3584521306131914,凤凰吴氏制茶,2013-06-01 23:19,0,3362.0,1908.0,5,0,"立刻检查一下你家里的牙膏，如果是黑色条马上扔掉！ 大家买膏请留心,买牙膏时注意牙膏管反面 底...",rumor
4,3553661986467439,咩咩百分百,2013-03-08 19:35,0,32.0,20.0,0,0,快快转发，急急急！中央电视台《焦点访谈》已经播出，可口可乐承认旗下(果粒橙)含有美国禁用农药...,rumor


In [7]:
print(weibo_df['user_auth_type'].dtype)

object


# 12. Weibo-Rumor

In [46]:
def load_weibo_rumor_dataset(root_path):
    data = []

    txt_path = os.path.join(root_path, "Weibo.txt")
    json_dir = os.path.join(root_path, "Weibo")

    with open(txt_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue

            event_id_part = parts[0]  # e.g., "eid:10031080900"
            label_part = parts[1]     # e.g., "label:0"

            event_id = event_id_part.replace("eid:", "")
            label = int(label_part.replace("label:", ""))

            json_path = os.path.join(json_dir, f"{event_id}.json")
            if not os.path.isfile(json_path):
                continue

            try:
                with open(json_path, 'r', encoding='utf-8') as jf:
                    posts = json.load(jf)

                # Find the post where id == event_id
                root_post = next((post for post in posts if str(post.get("id")) == event_id), None)
                if not root_post:
                    continue

                data.append({
                    "id": root_post.get("id"),
                    "original_text": root_post.get("original_text", ""),
                    "username": root_post.get("username", ""),
                    "followers_count": root_post.get("followers_count", 0),
                    "friends_count": root_post.get("friends_count", 0),
                    "verified": root_post.get("verified", False),
                    "reposts_count": root_post.get("reposts_count", 0),
                    "favourites_count": root_post.get("favourites_count", 0),
                    "label": label
                })

            except Exception as e:
                print(f"Error reading {json_path}: {e}")

    return pd.DataFrame(data)

In [47]:
weibo_rumor_df = load_weibo_rumor_dataset(r"D:\text datasets\text datasets\weibo rumor")
weibo_rumor_df.head()

Unnamed: 0,id,original_text,username,followers_count,friends_count,verified,reposts_count,favourites_count,label
0,10031080900,毛新宇 深情演唱新作《献给爷爷奶奶的歌》 http://t.cn/h1R0Rj,历史震惊你,1011418,1273,False,267,6,0
1,10031994215,如果没有人相信你，那就自己相信自己；如果没人欣赏你，那就自己欣赏自己；如果没人祝福你，那就自...,微博经典语录,3591890,250,False,1624,1,0
2,10276391917,看了新闻感慨：大运会火炬，怎么跟烤土豆片儿似的，中间还有彩椒和洋葱 ~,全球热门收集,10936954,925,False,546,48,0
3,10313557537,浮躁风气较盛的今天，并不是没有认真做事的人。但是，“认真”的利益选择性和目的导向性，使很多表...,李纲,131460,199,True,402,162,0
4,10402071863,[心] 女孩身上某个器官，爸爸碰两次，男朋友碰一次，老公一次都不能碰。大家猜猜是答案是什么~~,重口味腐女营,1212152,0,False,1453,13,0


In [48]:
weibo_rumor_df.label.unique()

array([0, 1], dtype=int64)

In [49]:
weibo_rumor_df.label.value_counts()

label
0    2351
1    2313
Name: count, dtype: int64

In [50]:
def load_weibo_rumor_reposts(root_path):
    data = []

    txt_path = os.path.join(root_path, "Weibo.txt")
    json_dir = os.path.join(root_path, "Weibo")

    with open(txt_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue

            event_id_part = parts[0]  # e.g., "eid:4010312877"
            label_part = parts[1]     # e.g., "label:0"
            event_id = event_id_part.replace("eid:", "")
            label = int(label_part.replace("label:", ""))

            json_path = os.path.join(json_dir, f"{event_id}.json")
            if not os.path.isfile(json_path):
                continue

            try:
                with open(json_path, 'r', encoding='utf-8') as jf:
                    posts = json.load(jf)

                for post in posts:
                    # Skip the root post (id == event_id)
                    if str(post.get("id")) == event_id:
                        continue

                    data.append({
                        "id": post.get("id"),
                        "text": post.get("text", ""),
                        "label": label
                    })

            except Exception as e:
                print(f"Error reading {json_path}: {e}")

    return pd.DataFrame(data)

In [51]:
weibo_reposts_df = load_weibo_rumor_reposts(r"D:\text datasets\text datasets\weibo rumor")
weibo_reposts_df.head()

Unnamed: 0,id,text,label
0,10031139424,我工作，大家先看个乐儿吧，看我有感觉的请自觉汇报感受。,0
1,10031149936,[汗],0
2,10031160544,笑而不语。。。。,0
3,10031160846,在我看来，力推毛新宇，就是打压太子党的一种温柔方式。,0
4,10031161682,转发微博。,0


In [52]:
weibo_reposts_df.label.value_counts()

label
1    2088430
0    1712562
Name: count, dtype: int64

# Read Parquet Files

In [82]:
import pandas as pd

file_path = r'D:\Social-media-dataset-merger\keeup-social-media-datasets-merger\processed_data\all_originals.parquet'

df = pd.read_parquet(file_path)

df.head()

Unnamed: 0,post_id,text,timestamp,label,username,follower_count,friends_count,is_verified,repost_count,likes,language,domain,platform
0,552783667052167168,france 10 people dead after shooting at hq of ...,2015-01-07 11:07:51+00:00,real,euronews,129573.0,337.0,True,486,38,en,others,Twitter
1,552785375161499649,breaking 10 reportedly shot dead at paris hq o...,2015-01-07 11:14:38+00:00,real,The Independent,972167.0,1763.0,True,128,5,en,others,Twitter
2,552791196247269378,breaking at least 10 killed in shooting at fre...,2015-01-07 11:37:46+00:00,real,CNN International,3029912.0,389.0,True,295,78,en,others,Twitter
3,552791578893619200,eleven dead in shooting at paris offices of sa...,2015-01-07 11:39:17+00:00,real,The Guardian,3091451.0,1083.0,True,338,28,en,others,Twitter
4,552792544132997121,breaking charlie hebdo latest 11 dead 10 wound...,2015-01-07 11:43:07+00:00,real,The Independent,973212.0,1763.0,True,203,32,en,others,Twitter
