In [1]:
import pandas as pd
from langdetect import detect, DetectorFactory
from concurrent.futures import ProcessPoolExecutor

# Fix langdetect's seed so that language detection results are reproducible
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Args:
        text: The value to classify. Anything that is not a non-blank string
            (e.g. the float NaN pandas uses for missing cells, or None) is
            treated as undetectable.

    Returns:
        The code returned by ``langdetect.detect`` (for example ``"en"``), or
        the string ``"unknown"`` when the input is missing/blank or detection
        fails.
    """
    # Handle missing and blank input explicitly instead of relying on the
    # detector raising for it; there is nothing to detect in these cases.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Deliberate best-effort: one undetectable row (e.g. text with no
        # usable features) must not abort the whole batch.
        return "unknown"

# Load the dataset
path = "../datasets/MC_Fake_dataset.csv"
df = pd.read_csv(path)

# Detect the language of each row's "text" value in parallel worker processes.
# executor.map returns results in input order, so `languages` lines up with
# the rows of df. chunksize batches the texts sent to each worker instead of
# submitting one task per row (the default chunksize is 1), which cuts
# inter-process overhead on a long iterable without changing the results.
# NOTE(review): worker processes must be able to import detect_language; with
# a non-fork start method a function defined in a notebook cell may not be
# resolvable in the workers -- confirm on the target platform.
with ProcessPoolExecutor() as executor:
    languages = list(executor.map(detect_language, df["text"], chunksize=256))

df["language"] = languages

# Save the result, overwriting the input file
# NOTE(review): this rewrites the source CSV in place, so the original file
# is replaced on every run -- keep a backup if the raw data matters.
df.to_csv(path, index=False)


In [3]:
# Tally how many rows were assigned to each detected language
# (value_counts orders the result from most to least frequent).
df.loc[:, "language"].value_counts()

language
en         27982
unknown      203
es            46
fr            17
pt            10
it             9
hr             7
ca             7
hi             6
da             5
id             5
ur             4
uk             4
de             4
te             3
lt             3
ta             2
nl             2
ru             2
sv             2
ja             2
so             1
ro             1
pl             1
gu             1
th             1
zh-tw          1
bn             1
fa             1
ko             1
sk             1
Name: count, dtype: int64

In [2]:
# Spot-check 10 randomly chosen rows; the fixed random_state makes the same
# rows come back on every re-run of the notebook.
df.sample(10, random_state=0)

Unnamed: 0,news_id,title,url,publish_date,source,text,labels,n_tweets,n_retweets,n_replies,n_users,tweet_ids,retweet_ids,reply_ids,user_ids,retweet_relations,reply_relations,data_name,language
20431,RealHealth-76331,Britney Spears Hits Up Tanning Salon After Sta...,https://www.tmz.com/2019/04/26/britney-spears-...,2019-04-26 00:00:00,,Britney Spears is back home and getting back i...,0,52,85,82,193,"1121830685553987584,1121831641201115136,112183...","1121830772984426502,1121831065016889344,112183...","1121830843192754177,1121831505385480193,112183...","733373111815557120,790019230389248000,10624833...",1121830772984426502-1121830685553987584-301268...,1121830843192754177-1121830685553987584-105276...,RealHealth,en
20269,RealHealth-30631,Trump’s EPA is suppressing a report about form...,https://www.vox.com/science-and-health/2018/7/...,2018-07-06 19:10:01,,Trump administration officials at the Environm...,0,122,167,15,288,"1015266744863281152,1015311934428794881,101531...","1015311938832740352,1015312114070761474,101531...","1015312049746989058,1015317640460623873,101533...","857996320786853889,887496131399213057,10906560...",1015311938832740352-1015311934428794881-291894...,1015312049746989058-1015311934428794881-141761...,RealHealth,en
4758,gossipcop-899393,nicole kidman and more react to golden globe n...,https://www.cnn.com/2018/12/06/entertainment/g...,1544025600.0,https://www.cnn.com,The lucky nominees are celebrating and here's ...,0,112,30,4,132,"808319559992283136,808346939670425600,80834796...","808378884014096384,808387718195671040,94025960...","940400966486188033,940525951707951104,10710491...","799484072439513088,725386630593437697,70312739...",808378884014096384-808373378411327489-20580622...,940400966486188033-940400279605972993-37133511...,gossipcop,en
5487,gossipcop-846346,the world’s 9 most powerful women entrepreneur...,https://blog.invoiceberry.com/2016/05/worlds-p...,,https://blog.invoiceberry.com,Unlike in the past where entrepreneurship was ...,0,40,13,0,31,"733659337772703745,735742481342435328,76081146...","760921707184029697,852567093673316352,85257903...",,"826349135812898816,902042330575683584,78432407...",760921707184029697-760811460226408449-73500411...,,gossipcop,en
3840,gossipcop-2940775250,kylie jenner admits she made her lips 'too big...,www.dailymail.co.uk/femail/article-3688685/Tha...,1468427617.0,http://www.dailymail.co.uk,The star also said she plans to step away from...,1,59,2,1,60,"753275081057599488,753280608986476544,75328751...",754273494591016960764921412150755329,754076039559643136,"744402309166292992,721982469017350144,71542178...",754273494591016960-753929690155331584-42834037...,754076039559643136-753599958293491713-16315179...,gossipcop,en
19197,RealCovid_006091,Interactive coronavirus heat map shows spread ...,,,,"Dr. George Diaz, Section Chief of Infectious D...",0,49,13,2,59,"1220880349732855809,1220880351330856963,122088...","1220882521296117762,1220893365555777537,122119...",12211355560772198411236850663113424896,"934901520935645184,1122083692325818369,1157952...",1220882521296117762-1220880351330856963-180081...,1221135556077219841-1221133676530085888-850534...,RealCovid,en
25614,SyriaHealth-871183,"Top US commander warns Russia, Syria",https://www.cnn.com/2016/08/21/politics/us-war...,2016-08-21 00:00:00,,(CNN)In the most direct public warning to Mos...,0,972,876,330,1876,"767270888790253572,767270957933219844,76727097...","767271025255915521,767271105476329472,76727128...","767271350503297026,767272028004941824,76727204...","785211579595751424,718219178470154240,70437755...",767271025255915521-767270888790253572-20466212...,767271350503297026-767270888790253572-74585028...,RealSyria,en
6800,gossipcop-940416,mixin’ business with pleasure: celebs who date...,https://madamenoire.com/500775/celebs-who-date...,1422367849.0,https://madamenoire.com,"With certain celebrities, sometimes the line b...",0,4,1,0,5,"560083549106495488,560086939362394112,56035032...",560351722020036608,,"1539021318,2374722919,2470015026,138516405,145...",560351722020036608-560350324805107712-15390213...,,gossipcop,en
19632,RealCovid_006594,Ex-NBA star Jeremy Lin jabs Trump for calling ...,,,,\nGet all the latest news on coronavirus and m...,0,5,0,0,5,"1240968322956644352,1240969668225118208,124097...",,,"982981684256620544,1090879834740256768,1025186...",,,RealCovid,en
7479,gossipcop-880277,beyonce makes first appearance since giving birth,https://www.cheatsheet.com/entertainment/beyon...,1505517027.0,https://www.cheatsheet.com,Beyonce attended Rihanna’s charity Diamond Bal...,0,124,147,6,260,"159802316566962176,166751068976447488,16675229...","166756700588752896,166779353605226496,16684123...","167195157291212800,885651320983752704,88565599...","846601544753647616,808812687874490369,86339348...",166756700588752896-166753537718628352-32096066...,167195157291212800-167187168853372929-27111608...,gossipcop,en
