# Put the data in Google Drive mount

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Checking working directory

In [None]:
import os

# print current working directory
print(os.getcwd())

/content/drive/MyDrive


In [None]:
import os
os.path.exists('/content/drive/MyDrive')

True

In [None]:
# Switch to a subdirectory under the Google Drive mount point
%cd /content/drive/MyDrive

/content/drive/MyDrive


# Checking Runtime Type

In [None]:
import torch

# check if the GPU is using
print("CUDA available:", torch.cuda.is_available())
print("GPU name：", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

# list GPU
!nvidia-smi

# check the current working path
!ls -lh

# Version 1


## Start working

In [None]:
import os
from google.colab import drive
import pandas as pd

# Define the expected mount point
MOUNT_POINT = '/content/drive/MyDrive'

# Check if Google Drive is already mounted
if not os.path.exists(MOUNT_POINT):
    print("Google Drive not mounted. Mounting now...\n" )
    drive.mount('/content/drive')
else:
    print(f"Google Drive already mounted at {MOUNT_POINT}\n")

# Check current working directory and change it to the mount point if necessary
current_path = os.getcwd()

if current_path != MOUNT_POINT:
    print(f"Current working directory：{current_path}，will be changed to {MOUNT_POINT}\n")
    os.chdir(MOUNT_POINT)
else:
    print(f"Current working directory is already at：{MOUNT_POINT}\n")

# Confirm the final working directory
print("The final working directory：", os.getcwd())

# Loading cache data
df_train = pd.read_pickle('df_train.pkl')
df_false = pd.read_pickle('df_false.pkl')

Google Drive not mounted. Mounting now...

Mounted at /content/drive
Current working directory：/content，will be changed to /content/drive/MyDrive

The final working directory： /content/drive/MyDrive


## Process

In [None]:
# 检查列名
print(df_train.columns)

# 检查缺失值
print("🔍 缺失值：\n", df_train.isnull().sum())

# 检查重复文本
print("🔁 重复推文数：", df_train.duplicated(subset='text_clean').sum())

# 文本长度统计
df_train['text_length'] = df_train['text_clean'].apply(lambda x: len(str(x).split()))
print("📏 文本长度分布：\n", df_train['text_length'].describe())

# 如果有 label 列，检查类别分布
if 'label' in df_train.columns:
    print("📊 类别分布：\n", df_train['label'].value_counts())
else:
    print("⚠️ 没有 label 列，请确认是否另存在标签文件。")


Index(['username', 'created_at', 'text_clean'], dtype='object')
🔍 缺失值：
 username      0
created_at    0
text_clean    2
dtype: int64
🔁 重复推文数： 69133
📏 文本长度分布：
 count    1.048000e+06
mean     1.509275e+01
std      8.514645e+00
min      1.000000e+00
25%      9.000000e+00
50%      1.400000e+01
75%      1.900000e+01
max      5.800000e+01
Name: text_length, dtype: float64
⚠️ 没有 label 列，请确认是否另存在标签文件。


In [None]:
from google.colab import files

# upload the file
uploaded = files.upload()

Saving Fakenews.csv to Fakenews.csv


**Read data and cache as pickle**

In [None]:
import pandas as pd
df_train = pd.read_excel('Bereinigter_Datensatz.xlsx', engine='openpyxl')
df_false = pd.read_json('/content/drive/MyDrive/DefaktS_Twitter_DS.jsonl', lines=True)

In [None]:
df_train.to_pickle('/content/drive/MyDrive/df_train.pkl')
df_false.to_pickle('/content/drive/MyDrive/df_false.pkl')

Read Data

In [None]:
import pandas as pd

# load exel dataset
df = pd.read_json('DefaktS_Twitter_DS.jsonl', lines=True)
df_train = pd.read_excel('Bereinigter_Datensatz.xlsx')

Preprocessing

In [None]:
df_train

In [None]:
# filter the first 109 rows
df_train = df_train.head(109)

# convert into csv file
df_train.to_csv('train.csv', index=False, encoding="utf-8")

# show result
df_train

In [None]:
# load the fakenews jsonl file
df_false = pd.read_json("DefaktS_Twitter_DS.jsonl", lines=True)

df_false.head()

In [None]:
# filter the columns
cols = ['id','DateTime', 'text']
df_false = df_false[cols]

# check the number of rows
num_rows = df_false.shape[0]
print(f"\n Total rows：{num_rows}")

# check the result
df_false.head()

In [None]:
import re
# pre-processing the data

# convert DataTime into pandas datetime data type
df_false.loc[:, 'DateTime'] = pd.to_datetime(
    df_false['DateTime'],
    errors='coerce'
)

In [None]:
# define the cleaning function
def clean_tweet(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # lowercase all characters
    text = text.lower()
    # remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # remove @mentions
    text = re.sub(r'@\w+', '', text)
    # remove #, keep the text
    text = re.sub(r'#(\w+)', r'\1', text)
    # remove Emojis
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # Smileys
        u"\U0001F300-\U0001F5FF"  # Symbole
        u"\U0001F680-\U0001F6FF"  # Transport
        u"\U0001F1E0-\U0001F1FF"  # Flaggen
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub("", text)
    # remove character that is not a letter, number or common German letter
    text = re.sub(r'[^0-9a-z\u4e00-\u9fffäöüß ]+', '', text)
    # strip leading/trailing whitespace
    text = text.strip()
    return text

# apply the cleaning function
df_false['text'] = df_false['text'].apply(clean_tweet)

# drop rows where cleaning failed or datetime conversion failed
df_false = df_false.dropna(subset=['text', 'DateTime'])

# remove duplicate tweets based on the cleaned text
df_false = df_false.drop_duplicates(subset=['text'])

# show result
df_false.head()

Unnamed: 0,id,DateTime,text
0,378394,2023-02-06 18:58:06,abtreibung ist nach der 13ten wo gleichbedeute...
1,378395,2023-02-06 16:30:08,in england wales schottland frankreich norwege...
2,378396,2023-02-06 15:01:22,wie wahr die eu fördert statt kinder und famil...
3,378397,2023-02-06 14:56:55,gegen abtreibung ehe nur zwischen mann und fra...
4,378398,2023-02-06 12:14:02,news spionageballons china robert habeck olaf ...


In [None]:
# convert to csv file
df_false.to_csv('Fakenews.csv', index=False, encoding="utf-8")

## Data training

In [None]:
# install package
!pip install -q sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load a multilingual sentence embedding model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine the corpora for fitting the TF-IDF vectorizer
combined_texts = df_false['text'].tolist() + df_train['text_clean'].tolist()

In [None]:
# Load German stopwords via nltk
!pip install -q nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
german_stopwords = stopwords.words('german')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Fit the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words=german_stopwords, max_features=5000)
tfidf_matrix = vectorizer.fit_transform(combined_texts)

In [None]:
# Split into TF-IDF matrices for fake news and tweets to classify
n_fake     = len(df_false)
fake_tfidf = tfidf_matrix[:n_fake]
train_tfidf= tfidf_matrix[n_fake:]

In [None]:
# Compute cosine similarity between each tweet and all fake-news entries
similarity_matrix = cosine_similarity(train_tfidf, fake_tfidf)

In [None]:
# For each tweet, take the maximum similarity score as its 'fake news match score'
max_scores = similarity_matrix.max(axis=1)

In [None]:
# Define labels based on thresholds:
#    similarity >= 0.7 → label as 'false'    (matches fake-news)
#    similarity <= 0.3 → label as 'true'     (likely true news)
#    otherwise         → label as 'uncertain'
def label_by_score(score, low=0.3, high=0.7):
    if score >= high:
        return 'false'
    elif score <= low:
        return 'true'
    else:
        return 'uncertain'

In [None]:
# Apply the labeling function and add to df_train
df_train.loc[:, 'prediction'] = [label_by_score(s) for s in max_scores]


In [None]:
print(df_train['prediction'].value_counts())
df_train

# Version 2

In [None]:
import pandas as pd

df_counts = pd.read_pickle('user_tweet_counts.pkl')
df_tweets = pd.read_pickle('more_than_30_tweets.pkl')
df = pd.read_pickle('df_cleaned.pkl')
df_news = pd.read_pickle('df_news.pkl')

Count the number of tweets per publisher

In [None]:
import os

folder = 'twitter-bundestag-2022'
user_tweet_counts = {}

for fname in os.listdir(folder):
    if fname.endswith(".jl"):
        path = os.path.join(folder, fname)
        with open(path, 'r', encoding='utf-8') as f:
            count = sum(1 for _ in f)  # Quickly count rows
            user = fname.replace('.jl', '')  # Username
            user_tweet_counts[user] = count

# Convert to pandas DataFrame
import pandas as pd

df_counts = pd.DataFrame(list(user_tweet_counts.items()), columns=['user', 'tweet_count'])
df_counts = df_counts.sort_values('tweet_count', ascending=False)

df_counts.head()

In [None]:
df_counts.to_pickle("user_tweet_counts.pkl")

Impoert Fake&True news data

In [None]:
import pandas as pd

# Load files
df_fake = pd.read_csv('real&fake/Fake.csv')
df_true = pd.read_csv('real&fake/True.csv')

# Add labels
df_fake['label'] = 0
df_true['label'] = 1

# Combine into one dataset
df_news = pd.concat([df_fake, df_true], ignore_index=True)

print(f"✅ Loaded {len(df_news)} total news articles:")
print(df_news['label'].value_counts().rename({0: 'Fake', 1: 'True'}))
df_news.head()


In [None]:
df_news.to_pickle("df_news.pkl")

Filter users with less than 30 tweets

In [None]:
import os
import json
import pandas as pd

# --- Configuration ---
folder_path = 'twitter-bundestag-2022'
X = 30  # Max tweets per user

# Step 1: Filter users with ≥ X tweets
valid_users = df_counts[df_counts['tweet_count'] >= X]['user'].tolist()

# Step 2: Load up to X qualified tweets per user
all_tweets = []

for user in valid_users:
    filepath = os.path.join(folder_path, user + ".jl")
    tweet_list = []

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                if len(tweet_list) >= X:
                    break

                try:
                    item = json.loads(line)
                    if isinstance(item, dict) and 'response' in item:
                        data = item['response'].get('data', [])
                        for tweet in data:
                            # Combined filtering conditions
                            if (
                                'text' in tweet
                                and not tweet['text'].startswith('RT @')   # remove retweets
                                and '@' not in tweet['text']              # remove replies/mentions
                                and tweet.get('lang') in ['de']     # only German
                            ):
                                tweet['user'] = user
                                tweet_list.append(tweet)
                                if len(tweet_list) >= X:
                                    break
                except json.JSONDecodeError:
                    continue  # Skip bad lines

        all_tweets.extend(tweet_list)

    except FileNotFoundError:
        print(f"File not found: {filepath}")
        continue

# Step 3: Combine into DataFrame
df_tweets = pd.DataFrame(all_tweets)

print(f"✅ Loaded {len(df_tweets)} tweets from {len(valid_users)} users (lang: de or en only).")
df_tweets.head()


In [None]:

# Save as pickle file
df_tweets.to_pickle("more_than_30_tweets.pkl")

print(f"Finished loading. Users loaded: {len(valid_users)}, Total tweets: {len(df_tweets)}")
df_tweets.head()


Simply Clean the data

In [None]:
import re

def clean_text_for_bert(text):
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove redundant whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

df_tweets['text_clean'] = df_tweets['text'].apply(clean_text_for_bert)


In [None]:
# filter unnecessary columns
columns_to_keep = ['text', 'text_clean', 'created_at', 'user', 'possibly_sensitive', 'public_metrics']
df = df_tweets[columns_to_keep]

In [None]:
# convert to pandas date type without timezone information
df['created_at'] = pd.to_datetime(df['created_at']).dt.tz_localize(None)


In [None]:
df.to_pickle("df_cleaned.pkl")

Clean news data

In [None]:
df_news.to_pickle("df_news.pkl")

In [None]:
df_news['full_text'] = df_news['title'] + ". " + df_news['text']

In [None]:
import re

def clean_for_bert(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_news['full_text'] = df_news['full_text'].apply(clean_for_bert)


Data training

In [None]:
!pip install -U sentence-transformers

In [None]:
df.head()

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased-v1')


In [None]:
# Extract text and labels
news_texts = df_news['full_text'].tolist()
news_labels = df_news['label'].tolist()  # 0 = Fake, 1 = True

# Encode in batches
news_embeddings = model.encode(news_texts, batch_size=64, show_progress_bar=True)


In [None]:
tweet_texts = df['text_clean'].tolist()
tweet_embeddings = model.encode(tweet_texts, batch_size=64, show_progress_bar=True)


In [None]:
import pickle

with open('news_embeddings.pkl', 'wb') as f:
    pickle.dump({'embeddings': news_embeddings, 'labels': news_labels}, f)

with open('tweet_embeddings.pkl', 'wb') as f:
    pickle.dump({'embeddings': tweet_embeddings, 'texts': tweet_texts}, f)


Assessment of mean similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Convert to numpy arrays if needed
tweet_embeddings = np.array(tweet_embeddings)
news_embeddings = np.array(news_embeddings)
news_labels = np.array(news_labels)

# Separate fake and true news vectors
fake_vectors = news_embeddings[news_labels == 0]
true_vectors = news_embeddings[news_labels == 1]

# ✅ Batch compute cosine similarity (vectorized)
similarity_fake = cosine_similarity(tweet_embeddings, fake_vectors)  # shape: (n_tweets, n_fake)
similarity_true = cosine_similarity(tweet_embeddings, true_vectors)  # shape: (n_tweets, n_true)

# ✅ Compute average similarity per tweet
avg_sim_fake = similarity_fake.mean(axis=1)  # shape: (n_tweets,)
avg_sim_true = similarity_true.mean(axis=1)

# ✅ Define threshold
threshold = 0.01  # You can experiment with 0.005, 0.02, etc.

# ✅ Vectorized label assignment
diff = np.abs(avg_sim_fake - avg_sim_true)
labels = np.where(
    diff < threshold, 'Unclear',
    np.where(avg_sim_fake > avg_sim_true, 'Fake', 'True')
)

# ✅ Save to your DataFrame
df['predicted_label'] = labels
df['sim_to_fake'] = avg_sim_fake
df['sim_to_true'] = avg_sim_true

# ✅ Preview results
print("✅ Label distribution:")
print(df['predicted_label'].value_counts())

df[['text', 'predicted_label', 'sim_to_fake', 'sim_to_true']].head()


In [None]:
df.to_pickle("df_predictions_cosine.pkl")

KNN Voting based on Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: 计算所有 tweet 与新闻之间的余弦相似度矩阵
similarity_matrix = cosine_similarity(tweet_embeddings, news_embeddings)  # shape: (num_tweets, num_news)

# Step 2: 获取每条推文最相似的 Top-K 新闻
K = 5  # 可调整为 3、10 等
top_k_indices = np.argsort(similarity_matrix, axis=1)[:, -K:]  # 每行取最后 K 个（最高相似度）

# Step 3: 投票 + 保存最相似新闻
predicted_labels_knn = []
similar_news_refs = []

for i, indices in enumerate(top_k_indices):
    top_labels = news_labels[indices]  # 获取这 K 个的标签
    top_texts = df_news.iloc[indices]['full_text'].tolist()  # 可选：保留原文以解释

    # 统计 fake 和 true 数量
    votes_fake = (top_labels == 0).sum()
    votes_true = (top_labels == 1).sum()

    # 判定逻辑
    if abs(votes_fake - votes_true) < 2:  # 差距太小，判为 Unclear
        label = "Unclear"
    elif votes_fake > votes_true:
        label = "Fake"
    else:
        label = "True"

    predicted_labels_knn.append(label)
    similar_news_refs.append(top_texts)

# Step 4: 保存进你的推文 DataFrame（df）
df['predicted_label_knn'] = predicted_labels_knn
df['top_similar_news'] = similar_news_refs

# ✅ 预览结果
print("✅ Prediction counts (Top-K Voting):")
print(df['predicted_label_knn'].value_counts())

df[['text', 'predicted_label_knn']].head()


In [None]:
df.to_pickle("df_predictions_knn.pkl")

In [None]:
df['predicted_label'].value_counts()


In [None]:
df['predicted_label_knn'].value_counts()


In [None]:
df.head()

In [None]:
# 你已有的变量
news_texts = df_news['full_text'].tolist()
news_labels = df_news['label'].tolist()
news_embeddings = model.encode(news_texts, batch_size=64, show_progress_bar=True)

# ✅ 修改后的保存
with open('news_embeddings.pkl', 'wb') as f:
    pickle.dump({
        'embeddings': news_embeddings,
        'labels': news_labels,
        'texts': news_texts  # ✅ 加入这一行
    }, f)


Batches:   0%|          | 0/702 [00:00<?, ?it/s]

In [77]:
df.to_csv('tweets.csv', index=False)


In [79]:
from google.colab import files
files.download('df_5.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [78]:
df_5 = df.head(5)

df_5.to_csv('df_5.csv', index=False)


In [82]:

df_top10 = df.groupby('user').head(10).reset_index(drop=True)
df_top10.to_csv('df_top10.csv', index=False)
files.download('df_top10.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>