<a href="https://colab.research.google.com/github/Pumpkin02/ML4B-project/blob/main/ML4B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Start working

In [8]:
import os
from google.colab import drive
import pandas as pd

# Root of the user's Drive once Google Drive is mounted in Colab
MOUNT_POINT = '/content/drive/MyDrive'

# Mount Google Drive only if the Drive root is not visible yet
if not os.path.exists(MOUNT_POINT):
    print("Google Drive not mounted. Mounting now...\n" )
    drive.mount('/content/drive')
else:
    print(f"Google Drive already mounted at {MOUNT_POINT}\n")

# Work from the Drive root so the relative file names below resolve there
current_path = os.getcwd()

if current_path != MOUNT_POINT:
    print(f"Current working directory：{current_path}，will be changed to {MOUNT_POINT}\n")
    os.chdir(MOUNT_POINT)
else:
    print(f"Current working directory is already at：{MOUNT_POINT}\n")

# Confirm the final working directory
print("The final working directory：", os.getcwd())

# Load the cached frames only when both pickles exist. On a fresh run the
# caches have not been written yet (they are created in the
# "Data Preprocessing" section), so report that instead of aborting the
# notebook with FileNotFoundError.
# NOTE: unpickling can execute arbitrary code; only load pickles you created.
TRAIN_CACHE = 'df_train.pkl'
FALSE_CACHE = 'df_false.pkl'

if os.path.exists(TRAIN_CACHE) and os.path.exists(FALSE_CACHE):
    df_train = pd.read_pickle(TRAIN_CACHE)
    df_false = pd.read_pickle(FALSE_CACHE)
else:
    print("Cached pickles not found; run the 'Data Preprocessing' cells to create them.")

Google Drive already mounted at /content/drive/MyDrive

Current working directory is already at：/content/drive/MyDrive

The final working directory： /content/drive/MyDrive


# Mount Google Drive (data storage)

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Checking working directory

In [4]:
import os

# Print the current working directory, i.e. where relative file paths resolve
print(os.getcwd())

/content/drive/MyDrive


In [None]:
# Switch to the MyDrive folder under the Google Drive mount point (/content/drive)
%cd /content/drive/MyDrive

# Checking Runtime Type

In [None]:
import torch

# Check whether a CUDA GPU is available to this runtime and print its name
# (device index 0), or "None" when CUDA is unavailable
print("CUDA available:", torch.cuda.is_available())
print("GPU name：", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

# Show GPU status via the nvidia-smi command-line tool
!nvidia-smi

# List the files in the current working directory (human-readable sizes)
!ls -lh

# Data Preprocessing

In [1]:
from google.colab import files

# Upload file(s) through Colab's upload helper; the call's return value is
# kept in `uploaded`
uploaded = files.upload()

**Read data and cache as pickle**

In [5]:
import pandas as pd
# Read the Excel sheet (relative path, resolved against the current working directory)
df_train = pd.read_excel('Bereinigter_Datensatz.xlsx', engine='openpyxl')
# Read the JSON Lines file (lines=True: one JSON object per line) from its absolute Drive path
df_false = pd.read_json('/content/drive/MyDrive/DefaktS_Twitter_DS.jsonl', lines=True)

In [6]:
# Cache both frames as pickles on Drive; the first cell of this notebook
# reloads them with pd.read_pickle
df_train.to_pickle('/content/drive/MyDrive/df_train.pkl')
df_false.to_pickle('/content/drive/MyDrive/df_false.pkl')

Read Data

In [6]:
import pandas as pd

# Load the JSON Lines dataset and the Excel dataset (paths relative to the
# current working directory)
# NOTE(review): `df` is not referenced again in the cells shown here; the same
# file is loaded into `df_false` further down — confirm `df` is still needed
df = pd.read_json('DefaktS_Twitter_DS.jsonl', lines=True)
df_train = pd.read_excel('Bereinigter_Datensatz.xlsx')

Preprocessing

In [None]:
# Keep only the first N_TRAIN_ROWS rows. `.copy()` detaches the result from
# the original frame so that adding columns later does not raise
# SettingWithCopyWarning.
N_TRAIN_ROWS = 109
df_train = df_train.head(N_TRAIN_ROWS).copy()

# Write the selection to train.csv (UTF-8, without the index column)
df_train.to_csv('train.csv', index=False, encoding="utf-8")

# show result
df_train

In [None]:
# Load the fake-news JSON Lines file (lines=True: one JSON object per line),
# path relative to the current working directory
df_false = pd.read_json("DefaktS_Twitter_DS.jsonl", lines=True)

# Preview the first rows
df_false.head()

In [10]:
# Keep only the columns used downstream. `.copy()` makes the selection an
# independent frame, so the column assignments in the following cells do not
# raise SettingWithCopyWarning.
cols = ['id','DateTime', 'text']
df_false = df_false[cols].copy()

# check the number of rows
num_rows = df_false.shape[0]
print(f"\n Total rows：{num_rows}")

# check the result
df_false.head()


 Total rows：105855


Unnamed: 0,id,DateTime,text
0,378394,2023-02-06 18:58:06,Abtreibung ist nach der 13ten Wo. gleichbedeut...
1,378395,2023-02-06 16:30:08,"In #England, #Wales, #Schottland, #Frankreich,..."
2,378396,2023-02-06 15:01:22,Wie wahr; Die EU fördert statt Kinder und Fami...
3,378397,2023-02-06 14:56:55,"Gegen Abtreibung, Ehe nur zwischen Mann und Fr..."
4,378398,2023-02-06 12:14:02,"News: Spionage-Ballons, China, Robert Habeck, ..."


In [11]:
import re
# pre-processing the data

# Convert DateTime to pandas datetime; unparseable values become NaT
# (errors='coerce').
# The column is replaced as a whole via assign(): `df.loc[:, col] = values`
# writes into the existing column in place (pandas >= 2.0) and can leave it
# with its previous dtype.
df_false = df_false.assign(
    DateTime=pd.to_datetime(df_false['DateTime'], errors='coerce')
)

In [12]:
# define the cleaning function
import re

# Patterns are compiled once here instead of on every call.
_URL_RE = re.compile(r'http\S+|www\.\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#(\w+)')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # smileys
    "\U0001F300-\U0001F5FF"  # symbols
    "\U0001F680-\U0001F6FF"  # transport
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+", flags=re.UNICODE)
# Everything except digits, a-z, U+4E00-U+9FFF, German umlauts/ß and
# whitespace. Whitespace is kept on purpose: deleting newlines or tabs would
# glue the neighbouring words together ("a\nb" -> "ab").
_DISALLOWED_RE = re.compile(r'[^0-9a-z\u4e00-\u9fffäöüß\s]+')


def clean_tweet(text: str) -> str:
    """Normalize a tweet for text matching.

    Steps: lowercase, remove URLs and @mentions, keep hashtag words without
    the '#', remove emojis and every remaining character that is not a digit,
    a-z, ä/ö/ü/ß or in U+4E00-U+9FFF, then collapse every run of whitespace
    (spaces, tabs, newlines) into a single space.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text, or "" when `text` is not a string or nothing is
        left after cleaning.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    # drop the '#', keep the tag text
    text = _HASHTAG_RE.sub(r'\1', text)
    text = _EMOJI_RE.sub('', text)
    # punctuation and other symbols are deleted (not replaced by a space)
    text = _DISALLOWED_RE.sub('', text)
    # split()/join collapses whitespace runs and strips both ends
    return ' '.join(text.split())

# apply the cleaning function; assign() builds a new frame instead of writing
# into a frame that may be a slice of the raw data
df_false = df_false.assign(text=df_false['text'].apply(clean_tweet))

# drop rows where the datetime conversion failed (NaT) or the text is missing
df_false = df_false.dropna(subset=['text', 'DateTime'])

# clean_tweet returns "" (not NaN) for non-string input and for tweets with
# nothing left after cleaning, so dropna never removes them; filter them out
df_false = df_false[df_false['text'] != ""]

# remove duplicate tweets based on the cleaned text
df_false = df_false.drop_duplicates(subset=['text'])

# show result
df_false.head()

Unnamed: 0,id,DateTime,text
0,378394,2023-02-06 18:58:06,abtreibung ist nach der 13ten wo gleichbedeute...
1,378395,2023-02-06 16:30:08,in england wales schottland frankreich norwege...
2,378396,2023-02-06 15:01:22,wie wahr die eu fördert statt kinder und famil...
3,378397,2023-02-06 14:56:55,gegen abtreibung ehe nur zwischen mann und fra...
4,378398,2023-02-06 12:14:02,news spionageballons china robert habeck olaf ...


In [13]:
# Write the cleaned frame to Fakenews.csv (UTF-8, without the index column)
df_false.to_csv('Fakenews.csv', index=False, encoding="utf-8")

# Similarity-based labeling (TF-IDF match against the fake-news corpus)

In [15]:
# Install the sentence-transformers package (-q: quiet output)
# NOTE(review): no version is pinned, so the installed release may differ between runs
!pip install -q sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m124.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load a multilingual sentence embedding model
# NOTE(review): `model` and `util` are not used in the cells shown here (the
# labeling below is TF-IDF based) — confirm whether this cell is still needed
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine the corpora for fitting the TF-IDF vectorizer: fake-news texts first,
# then the texts to classify. The split of the TF-IDF matrix further down
# relies on this order.
# NOTE(review): assumes df_train has a 'text_clean' column without missing
# values — confirm against the Excel sheet
combined_texts = df_false['text'].tolist() + df_train['text_clean'].tolist()

In [25]:
# Install nltk (quiet), download its stopword corpus and load the German
# stopword list into `german_stopwords`
!pip install -q nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
german_stopwords = stopwords.words('german')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [26]:
# Fit the TF-IDF vectorizer on the combined corpus (German stop words removed,
# max_features=5000) and transform the same texts; one matrix row per entry
# of combined_texts
vectorizer = TfidfVectorizer(stop_words=german_stopwords, max_features=5000)
tfidf_matrix = vectorizer.fit_transform(combined_texts)

In [28]:
# Rows of tfidf_matrix follow the order of combined_texts: the fake-news texts
# come first, followed by the tweets to classify. Cut the matrix at that
# boundary.
n_fake = df_false.shape[0]
fake_tfidf, train_tfidf = tfidf_matrix[:n_fake], tfidf_matrix[n_fake:]

In [29]:
# Compute cosine similarity between each tweet and all fake-news entries
# (one row per tweet to classify, one column per fake-news entry)
similarity_matrix = cosine_similarity(train_tfidf, fake_tfidf)

In [30]:
# For each tweet (row), take the maximum similarity over all fake-news entries
# as its 'fake news match score'
max_scores = similarity_matrix.max(axis=1)

In [31]:
# Map a similarity score to a label using two thresholds:
#    score >= high (default 0.7) → 'false'      (matches fake-news)
#    score <= low  (default 0.3) → 'true'       (likely true news)
#    anything in between         → 'uncertain'
def label_by_score(score, low=0.3, high=0.7):
    """Return 'false', 'true' or 'uncertain' for a similarity score.

    The upper threshold is tested first, so it takes precedence when a score
    satisfies both conditions.
    """
    if score >= high:
        return 'false'
    if score <= low:
        return 'true'
    return 'uncertain'

In [34]:
# Label every score and attach the labels as the 'prediction' column.
# assign() returns a new frame, which avoids writing through `.loc` into a
# frame that may be a slice (df_train can come from `head()`), and so avoids
# SettingWithCopyWarning.
df_train = df_train.assign(prediction=[label_by_score(s) for s in max_scores])


In [None]:
# Show how many tweets received each label, then display the labeled frame
print(df_train['prediction'].value_counts())
df_train