In [1]:
from datasketch import MinHash, MinHashLSH
import pandas as pd

def get_minhash(text, num_perm=128):
    """
    Build the MinHash signature of a text.

    The text is split on whitespace and every distinct token is fed into the
    signature once, encoded as UTF-8 bytes.

    Args:
        text (str): the document to hash.
        num_perm (int): number of permutations used by the MinHash.

    Returns:
        MinHash: signature updated with each unique token of `text`.
    """
    signature = MinHash(num_perm=num_perm)
    # set() collapses repeated words so each token is hashed only once
    unique_tokens = set(text.split())
    for token in unique_tokens:
        signature.update(token.encode('utf8'))
    return signature

def find_near_duplicates_lsh(texts, threshold=0.7, num_perm=128):
    """
    Find near-duplicate texts in a list using LSH over MinHash signatures.

    Every text is hashed with `get_minhash` and inserted into a `MinHashLSH`
    index under its list position. Each signature is then used to query the
    index, and every candidate is re-scored with `MinHash.jaccard`.

    Args:
        texts: list of texts to compare.
        threshold: Jaccard similarity threshold; it configures the LSH index and
            is also the minimum score a candidate pair must reach to be kept.
        num_perm: number of permutations for the MinHash signatures.

    Returns:
        list of (i, j, score) tuples with i < j, where text i is a near
        duplicate of text j and score is the Jaccard similarity computed
        from the two signatures.
    """
    index = MinHashLSH(threshold=threshold, num_perm=num_perm)

    # signatures[k] is the MinHash of texts[k]; the list position doubles as the LSH key
    signatures = []
    for position, text in enumerate(texts):
        signature = get_minhash(text, num_perm)
        index.insert(position, signature)
        signatures.append(signature)

    pairs = []
    for left, signature in enumerate(signatures):
        for right in index.query(signature):
            # Only keep candidates with a larger position: this skips the text
            # itself and avoids reporting each pair twice (once per direction).
            if right <= left:
                continue
            score = signature.jaccard(signatures[right])
            # LSH only proposes candidates; the score is checked explicitly
            if score >= threshold:
                pairs.append((left, right, score))
    return pairs


In [2]:

# NOTE(review): the outputs recorded further down (e.g. a 28,133-row frame) suggest this
# cell was also run against a different input file at some point - confirm `path` before
# doing a Restart & Run All.
path = "../datasets/CC_MC_WELF_merged.csv"
df = pd.read_csv(path)

# Keep only the rows whose "text" is an actual string (drops NaN and any other non-string value).
df = df.dropna(subset=['text'])
df = df[df['text'].apply(lambda x: isinstance(x, str))]
# Renumber the rows 0..n-1: find_near_duplicates_lsh reports positions in `texts`, and the
# cells below use those positions to address rows of `df`, so labels and positions must agree.
df = df.reset_index(drop=True)

# Build the text list only after cleaning, so texts[k] is the text of row k of df.
texts = df['text'].tolist()


# Look for near duplicates with the Jaccard similarity threshold set to 0.9
near_dup_all = find_near_duplicates_lsh(texts, threshold=0.9, num_perm=128)

print("Numărul de near-duplicates găsite:", len(near_dup_all))
for i, j, sim in near_dup_all[:20]:  # show only the first 20 examples
    print(f"Textul cu index {i} este near-duplicate cu textul cu index {j} (similaritate: {sim:.2f})")


Numărul de near-duplicates găsite: 321
Textul cu index 10883 este near-duplicate cu textul cu index 173064 (similaritate: 0.96)
Textul cu index 19393 este near-duplicate cu textul cu index 150768 (similaritate: 0.96)
Textul cu index 19578 este near-duplicate cu textul cu index 164366 (similaritate: 0.94)
Textul cu index 19730 este near-duplicate cu textul cu index 156020 (similaritate: 0.98)
Textul cu index 19796 este near-duplicate cu textul cu index 128947 (similaritate: 0.97)
Textul cu index 21040 este near-duplicate cu textul cu index 164315 (similaritate: 0.92)
Textul cu index 23280 este near-duplicate cu textul cu index 153830 (similaritate: 0.93)
Textul cu index 23363 este near-duplicate cu textul cu index 135092 (similaritate: 0.93)
Textul cu index 23380 este near-duplicate cu textul cu index 143442 (similaritate: 0.92)
Textul cu index 23397 este near-duplicate cu textul cu index 150794 (similaritate: 0.92)
Textul cu index 23520 este near-duplicate cu textul cu index 161084 (si

In [None]:
import csv

# Save the near_dup_all list to a CSV file
output_file = "near_duplicates_CC_MC_WF_merged.csv"
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row first, then one row per (index1, index2, score) tuple
    writer.writerow(["index1", "index2", "similarity"])
    writer.writerows([first, second, score] for first, second, score in near_dup_all)

print(f"Rezultatele au fost salvate în fișierul '{output_file}'.")


Rezultatele au fost salvate în fișierul 'near_duplicates_merged.csv'.


In [3]:
import csv

# Save the near_dup_all list to a CSV file, including the "source" column of both articles
output_file = "near_duplicates_CC_MC_WF_merged.csv"
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header: the two positions, their similarity, and the source of each article
    writer.writerow(["index1", "index2", "similarity", "source1", "source2"])
    # The positions in near_dup_all are looked up positionally (iloc) in the DataFrame
    writer.writerows(
        [first, second, score, df.iloc[first]["source"], df.iloc[second]["source"]]
        for first, second, score in near_dup_all
    )

print(f"Rezultatele au fost salvate în fișierul '{output_file}'.")

Rezultatele au fost salvate în fișierul 'near_duplicates_CC_MC_WF_merged.csv'.


In [5]:
# Number of rows in df whose label equals 1
len(df[df['label'] == 1])


4393

In [4]:
import textwrap
index = 117768  # change this to any other row position you want to inspect
text = df["text"].iloc[index]
label, category, language = (
    df[column].iloc[index] for column in ("label", "category", "language")
)
wrapped = textwrap.fill(text, width=100)
word_count = len(text.split())
# First article: word count, wrapped text, then its metadata
print(f"Numărul de cuvinte: {word_count}")
print(wrapped)
print(
    f"Textul cu index {index} (eticheta: {label}):",
    f"Category: {category}",
    f"language: {language}",
    "-------------------------------------------------",
    sep="\n",
)
index2 = 112520
label2, category2, language2 = (
    df[column].iloc[index2] for column in ("label", "category", "language")
)
# Second article: metadata first, then the wrapped text
print(
    f"language: {language2}",
    f"Category: {category2}",
    f"Textul cu index {index2} (eticheta: {label2}):",
    sep="\n",
)
text2 = df["text"].iloc[index2]

wrapped2 = textwrap.fill(text2, width=100)
print(wrapped2)

Numărul de cuvinte: 1564
An Islamic State disciple kills 39 New Year’s revelers at an Istanbul nightclub. A gunman with a
police badge assassinates Russia’s ambassador at an Ankara reception. Kurdish separatist bombers
kill 14 soldiers on a bus in central Turkey and dozens of police officers at an Istanbul soccer
match. Those assaults were just in the last few weeks, which made a car bombing on Thursday in the
city of Izmir, where at least two civilians were killed, seem relatively minor. The 75 million
people of Turkey, the NATO member and European Union aspirant that straddles Europe and Asia and was
once seen as a stable democracy, are facing a ferocious onslaught of terrorist attacks unlike
anything that has been seen in the West. Add to that the tumult from roughly three million Syrian
war refugees, a resurgent Kurdish insurgency and a failed military coup  —   all tied, in the eyes
of many Turks, to American negligence, or malice, or both. President Recep Tayyip Erdogan has
respo

In [5]:
# Check whether near_dup_all contains pairs whose two texts carry different labels
different_labels = [
    (first, second, score)
    for first, second, score in near_dup_all
    if df.iloc[first]["label"] != df.iloc[second]["label"]
]
print(f"Numărul de perechi cu etichete diferite: {len(different_labels)}")
for i, j, sim in different_labels[:20]:  # show only the first 20 examples
    label_i = df.iloc[i]['label']
    label_j = df.iloc[j]['label']
    print(f"Textul cu index {i} (eticheta: {label_i}) este near-duplicate cu textul cu index {j} (eticheta: {label_j}) (similaritate: {sim:.2f})")

    

Numărul de perechi cu etichete diferite: 0


In [8]:
# Preview five randomly chosen rows of df
df.sample(n=5)

Unnamed: 0,news_id,title,url,publish_date,source,text,label,n_tweets,n_retweets,n_replies,n_users,tweet_ids,retweet_ids,reply_ids,user_ids,retweet_relations,reply_relations,data_name,language,category
20691,RealHealth-212946,Judges Are Forcing People to Get Mental Health...,https://www.vice.com/en_us/article/kzv8ba/assi...,2018-11-12 23:24:00,,When mental illness hijacks Margaret Rodgers’ ...,0,5,0,0,5,"1062135248065044480,1062139894993223680,106215...",,,5885139743241420165462640424033719288586365,,,RealHealth,en,WELLNESS & HEALTH
24402,SyriaHealth-130597,Star soccer player turned rebel icon dies in S...,https://www.reuters.com/article/us-syria-secur...,2019-06-08 00:00:00,,"AZAZ, Syria (Reuters) - A Syrian star soccer p...",0,42,4,0,46,"1137375165728284674,1137375500635009025,113737...","1137388906356334600,1137411243596165120,113755...",,"749646098273144832,728200388890001408,76427902...",1137388906356334600-1137388053415235586-271734...,,RealSyria,en,WORLDPOST
15015,RealCovid_000115,Coronavirus a global health emergency? WHO chi...,,,,The head of the World Health Organization on W...,0,6,0,0,5,"1222576978160758785,1222581586522230784,122258...",,,"1196340921303891968,1216824204013654016,122203...",,,RealCovid,en,WORLD NEWS
4768,gossipcop-919177,why hoda kotb isn’t making matt lauer’s salary...,https://variety.com/2018/tv/news/hoda-kotb-sal...,1515009580.0,https://variety.com,"In the world of TV news, there’s an inordinate...",0,14,0,0,11,"948645557601480705,948646467992014848,94864695...",,,"748194188060139520,754545110818426881,71859482...",,,gossipcop,en,MEDIA
20713,RealHealth-220945,How Mainstream Emo Made Us Talk About Mental H...,https://www.vice.com/en_us/article/rgp7wb/how-...,2016-05-18 10:38:00,,This feature is part of 'The Noisey Guide to M...,0,167,165,16,330,"732909833771421696,732915363109801984,73294194...","732910832191967232,732911432451411970,73291385...","733046296248995840,733094873490960384,73309487...","1229168245921456128,751308498047111169,9765635...",732910832191967232-732909833771421696-15995155...,733046296248995840-733044528999866369-28868871...,RealHealth,en,WELLNESS & HEALTH


In [9]:
# (rows, columns) of df before near-duplicate removal
df.shape

(28133, 20)

In [10]:
def remove_near_duplicates(df, near_dup_all, label_col="label"):
    """
    Remove near-duplicate rows from a DataFrame.

    Rules, applied to the pairs in the order they are given:
      - If either member of a pair is already marked for removal, the pair is skipped.
      - If (i, j) have the same label, only j is removed.
      - If (i, j) have different labels, both are removed (label conflict).

    The i / j values are treated as row POSITIONS (as produced by enumerating the
    text list), both when reading labels and when dropping rows. The function
    therefore works regardless of the DataFrame's index labels; the previous
    version read labels by position but dropped by index label, which removed the
    wrong rows (or raised KeyError) on any frame whose index was not 0..n-1.

    Args:
      df (pd.DataFrame): frame whose row order matches the positions in near_dup_all.
      near_dup_all (list of tuples): list of (i, j, sim).
      label_col (str): name of the label column.

    Returns:
      df_curatat (pd.DataFrame): new frame without the removed rows, reindexed 0..m-1
        (the input frame is not modified)
      conflict_count (int): number of processed pairs whose labels differ
      removed_count (int): total number of rows removed

    Raises:
      IndexError: if a position in near_dup_all is outside the frame.
      KeyError: if label_col is not a column of df.
    """
    # Pull the label column out once instead of materialising a full row per lookup
    labels = df[label_col]

    to_remove = set()       # positions of the rows that will be dropped
    conflict_count = 0      # how many processed pairs have different labels

    for i, j, _sim in near_dup_all:
        # A pair involving an already-removed row is skipped
        if i in to_remove or j in to_remove:
            continue

        if labels.iloc[i] != labels.iloc[j]:
            # Conflict: the two near-duplicates disagree on the label, drop both
            conflict_count += 1
            to_remove.update((i, j))
        else:
            # Same label: keep i, drop j
            to_remove.add(j)

    # Select the surviving rows by position so the index labels never matter
    keep_positions = [pos for pos in range(len(df)) if pos not in to_remove]
    df_curatat = df.iloc[keep_positions].reset_index(drop=True)

    removed_count = len(to_remove)

    return df_curatat, conflict_count, removed_count

In [11]:
df_clean, conflict_count, removed_count = remove_near_duplicates(df, near_dup_all, label_col="label")

# Report the clean-up statistics
print(
    f"Număr de perechi cu label diferit (conflicte): {conflict_count}",
    f"Număr total de articole eliminate: {removed_count}",
    f"Dimensiunea inițială a datasetului: {len(df)}",
    f"Dimensiunea finală a datasetului: {len(df_clean)}",
    sep="\n",
)

Număr de perechi cu label diferit (conflicte): 34
Număr total de articole eliminate: 785
Dimensiunea inițială a datasetului: 28133
Dimensiunea finală a datasetului: 27348


In [14]:
# Number of rows with label 1 that survive the near-duplicate removal
len(df_clean[df_clean['label'] == 1])

4218

In [12]:
# Shapes before and after the clean-up, one per line
print(df.shape, df_clean.shape, sep="\n")

(28133, 20)
(27348, 20)


In [15]:
# Preview five randomly chosen rows of the cleaned frame
df_clean.sample(n=5)

Unnamed: 0,news_id,title,url,publish_date,source,text,label,n_tweets,n_retweets,n_replies,n_users,tweet_ids,retweet_ids,reply_ids,user_ids,retweet_relations,reply_relations,data_name,language,category
2509,gossipcop-2628357669,angelina jolie & brad pitt kicked off valentin...,perezhilton.com/2013-02-15-angelina-jolie-brad...,1360886400.0,http://perezhilton.com,Angelina Jolie & Brad Pitt Kicked Off Valentin...,1,100,0,0,98,"302538297807339520,302538659960328192,30253951...",,,"241058308,101366796,42277406,75464738,36522089...",,,gossipcop,en,PARENTING
18362,RealCovid_006148,Coronavirus could live in the body for 5 weeks...,,,,"From coronavirus reports on new symptoms, self...",0,27,13,0,36,"1238541123800281088,1238545522400657410,123854...","1238660065864945664,1238660767999803393,123866...",,"3053739264,843691263878729728,1090879834740256...",1238660065864945664-1238655870659166209-109134...,,RealCovid,en,WELLNESS & HEALTH
21417,RealHealth-624240,"Tropical Depression Three forms over Bahamas, ...",https://www.reuters.com/article/us-storm-tropi...,2019-07-22 00:00:00,,(Reuters) - Tropical Depression Three has form...,0,8,0,0,7,"1153414031606239232,1153430798659440641,115345...",,,"1141007227395936256,855845233610768385,8020685...",,,RealHealth,en,ENVIRONMENT
3816,gossipcop-3885971599,malibu crash involving bruce jenner leaves 1 d...,www.latimes.com/local/lanow/la-me-ln-bruce-jen...,1423238400.0,http://www.latimes.com,Former Olympian Bruce Jenner was a driver in a...,1,348,256,28,498,"564194031152795648,564194284513927169,56419509...","564194266218369025,564194663288950784,56419476...","564202181826318336,564212153549611009,56421409...","268899328,1148765700,15750149,380873739,275701...",564194266218369025-564194031152795648-58422278...,564202181826318336-564194031152795648-28899160...,gossipcop,en,CRIME
22864,RealHealth-910087,Industry funded studies don't find sweet drink...,http://www.reuters.com/article/us-health-resea...,2016-10-31,,By Andrew M. Seaman (Reuters Health) - Do suga...,0,348,41,4,358,"793219861568630784,793219863892209664,79321987...","793229643742523393,793230363594067968,79323325...","793240101123948544,793456228567572484,79348030...","749978398886330368,890041994008961025,76010817...",793229643742523393-793226485226147840-91826981...,793240101123948544-793226485226147840-15389667...,RealHealth,en,WELLNESS & HEALTH


In [17]:
# Keep only the title, text, label, category and language columns; drop everything else
df_clean = df_clean.loc[:, ['title', 'text', 'label', 'category', 'language']]


In [18]:
# Persist the de-duplicated frame without the index column.
# NOTE(review): df_clean derives from whatever file the load cell read - confirm it was
# the MC dataset before overwriting MC_Fake_cleaned.csv.
df_clean.to_csv("../datasets/MC_Fake_cleaned.csv", index=False)

## Combine CC + WELFake with MC_Fake

In [20]:
import pandas as pd
# Load the already-cleaned CC + WELFake merge and preview five random rows
df_ccwf = pd.read_csv(filepath_or_buffer="../datasets/CC_WELF_merged_cleaned.csv")
df_ccwf.sample(n=5)

Unnamed: 0,text,label,language,category,title
7663,Chris Wallace Refuses To Help Liberals Smear T...,0,en,POLITICS,
25702,The second trailer for Beauty and the Beast ha...,1,en,ENTERTAINMENT,
132984,The White House Domestic Policy Council (DPC) ...,1,en,POLITICS,White House domestic advisers take lead on ref...
120052,President Barack Obama on Sunday campaigned in...,1,en,POLITICS,Obama turns focus to U.S. Congress as he campa...
130926,Contact Us WikiLeaks: Hillary Admits Clinton F...,0,en,POLITICS,WikiLeaks: Hillary Admits Clinton Foundation D...


In [21]:
# Load the cleaned MC_Fake data and preview five random rows
df_mc = pd.read_csv(filepath_or_buffer="../datasets/MC_Fake_cleaned.csv")
df_mc.sample(n=5)

Unnamed: 0,title,text,label,category,language
14616,British Airways suspends all flights to China ...,British Airways has suspended all direct fligh...,0,TRAVEL,en
12617,‘handmaid’s tale’ fans aren’t here for offred’...,(Spoiler alert: Please do not read ahead if yo...,0,ENTERTAINMENT,en
19536,Sleep Deprivation Is a Surprisingly Effective ...,"Angelina is coming to life, precisely as my bo...",0,WELLNESS & HEALTH,en
9699,'what really hurt': jay-z opens up about alleg...,Jay-Z said Kanye West's rant against him last ...,0,ENTERTAINMENT,en
9243,golden globes 2019: the complete list of nominees,The nominees are in for next month’s 76th annu...,0,ENTERTAINMENT,en


In [None]:
# df_wf = df_wf.drop(columns=['Unnamed: 0'])
# df_wf.head(5)

Unnamed: 0,title,text,label,language,category
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,en,MINORITY VOICES
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,en,POLITICS
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,en,RELIGION
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,en,WORLDPOST
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,en,RELIGION


In [24]:
# MC_Fake uses the OPPOSITE label convention => keep the CC labels because there are more
# of them, and invert the MC labels so both sources agree.
# The inversion used to be a commented-out line, so a fresh Restart & Run All skipped it,
# even though the count recorded below (23130 = 27348 - 4218) shows it had been applied
# when the saved results were produced.
# The attrs flag makes the cell idempotent: re-running it does not flip the labels back,
# while reloading df_mc from disk yields a fresh frame whose labels are flipped once.
if not df_mc.attrs.get("label_flipped", False):
    df_mc['label'] = 1 - df_mc['label']
    df_mc.attrs["label_flipped"] = True
df_mc[df_mc['label'] == 1].shape[0]

23130

In [25]:
# Strip every non-ASCII character from the MC texts (characters that cannot be encoded are ignored)
df_mc['text'] = df_mc['text'].map(lambda raw: raw.encode('ascii', 'ignore').decode('ascii'))


In [26]:
# Stack the two sources row-wise with a fresh 0..n-1 index, keeping the column order unsorted
df_merged = pd.concat((df_ccwf, df_mc), axis=0, ignore_index=True, sort=False)
df_merged.sample(n=5)

Unnamed: 0,text,label,language,category,title
96792,Officers with U. S. Customs and Border Protect...,1,en,CRIME,Feds Seize $7.2 Million in Drugs at Border in Day
32993,Bats rely on the position of the sun at sunset...,1,en,SCIENCE,
21050,All the Creepy Ways Big Brother Is Watching Yo...,0,en,TECH,
22286,"Sheku Kanneh-Mason, 17, is a cellist who in Ma...",1,en,ENTERTAINMENT,
113268,(Want to get this briefing by email? Heres the...,1,en,POLITICS,"Republican Convention, Turkey, Garry Marshall:..."


In [27]:
# Number of rows per category in the merged dataset
df_merged['category'].value_counts()

category
POLITICS              61969
ENTERTAINMENT         18495
WORLDPOST             17535
WELLNESS & HEALTH     13486
SPORTS                 7837
WORLD NEWS             7626
BUSINESS               6727
MINORITY VOICES        3640
TRAVEL                 3640
CRIME                  3363
ENVIRONMENT            2927
ARTS & CULTURE         2822
PARENTING              2616
FOOD & DRINK           2606
MEDIA                  2384
STYLE & BEAUTY         2136
TECH                   1898
COMEDY                 1892
SCIENCE                1309
IMPACT & GOOD NEWS     1307
RELIGION               1178
WOMEN & FIFTY           883
EDUCATION               875
MONEY                   621
WEIRD NEWS              461
HOME & LIVING           428
U.S. NEWS               418
WEDDINGS                325
DIVORCE                 242
Name: count, dtype: int64

In [28]:
# Class balance of the merged dataset
df_merged['label'].value_counts()

label
1    100198
0     71448
Name: count, dtype: int64

In [29]:
# Write the merged dataset to disk without the index column.
# NOTE(review): this is the same path the first load cell reads from - confirm the intended cell order.
df_merged.to_csv("../datasets/CC_MC_WELF_merged.csv", index=False)