# Data clearner

## Dependencies

In [1]:
import pandas as pd
import cleantext
from tqdm import tqdm

## Cleaning

### Loading the raw data

In [2]:
# Set a chunk size (e.g., 1000 rows per chunk, adjust based on file size)
chunk_size = 1000
rows = 10000
chunks = []

# Use tqdm to show progress while iterating over chunks
with tqdm(desc="Loading csv file", total=rows) as pbar:
    for chunk in pd.read_csv("fake_news_dataset/10,000_data_raw.csv",nrows=rows, chunksize=chunk_size):
        chunks.append(chunk)
        pbar.update(chunk_size)

# Combine all chunks into a single DataFrame
df = pd.concat(chunks, ignore_index=True)

Loading csv file: 100%|██████████| 10000/10000 [00:00<00:00, 32913.46it/s]


In [3]:
if len(df.columns) == 18:
    print("Too many columns, removing first")
    df = df.drop(df.columns[0], axis=1)
else:
    print("Columns are as expected.")
    # ? Failsafe in case an unnamed column too many appears

Columns are as expected.


### cleaning med 'clean'

In [4]:
print(len(df))
df['content'] = df['content'].apply(lambda x: cleantext.clean(text=x) if isinstance(x, str) else None)
df = df.dropna(subset=['content'])
print(df['content'].head(5))

10000
0    plu one articl googl plu thank ali alfoneh ass...
1    cost best senat bank committe jp morgan buy br...
2    man awoken year coma commit suicid learn donal...
3    julia geist ask draw pictur comput scientist l...
4    – compil studi vaccin danger activist post sep...
Name: content, dtype: object


### cleaning med 'clean_words'

In [5]:
df['content'] = df['content'].apply(lambda x: cleantext.clean_words(
    text=x,
    clean_all=True,
    extra_spaces=True,
    stemming=True,
    stopwords=True,
    stp_lang='english',
) if isinstance(x, str) else None)
df = df.dropna(subset=['content'])

print(df['content'].head(5))
print("\nArticles Left: ", len(df))

0    [plu, one, articl, googl, plu, thank, ali, alf...
1    [cost, best, senat, bank, committ, jp, morgan,...
2    [man, awoken, year, coma, commit, suicid, lear...
3    [julia, geist, ask, draw, pictur, comput, scie...
4    [–, compil, studi, vaccin, danger, activist, p...
Name: content, dtype: object

Articles Left:  10000


### Data export

In [6]:
df.to_csv('fake_news_dataset/10,000_data_cleaned.csv', index=False)

## Categorization

In [7]:
# Define the conversion dictionary
convert = {
    '': "skip", 'conspiracy': '0', 'satire': '0', 'reliable': '1',
    'unreliable': 'skip', 'junksci': '0', 'unknown': 'skip',
    'political': '1', 'fake': '0', 'hate': '0',
    'clickbait': '1', 'bias': '1', 'rumor': '0'
}

# Create a new column with converted categories
df['processed_category'] = df.iloc[:, 3].map(convert)
df["type"] = df["processed_category"]
# df.drop(columns=['processed_category'], inplace=True)
# Filter out 'skip' categories
df = df[df['type'] != 'skip']

# Count fake and reliable articles
fake_count = (df['type'] == '0').sum()
real_count = (df['type'] == '1').sum()
total_count = len(df)

# Print statistics
print(f"Number of articles left: {total_count}")
print(f"Reliable articles: {real_count}")
print(f"Fake articles: {fake_count}")
print(f"Ratio of fakes: {fake_count/total_count:.4f}")
print(f"Ratio of reliable: {real_count/total_count:.4f}")

# Optional: If you want to reset the index after filtering
df_filtered = df.reset_index(drop=True)

Number of articles left: 9339
Reliable articles: 5395
Fake articles: 3496
Ratio of fakes: 0.3743
Ratio of reliable: 0.5777


In [8]:
fake_count, real_count

(np.int64(3496), np.int64(5395))

### Export

In [9]:
df.to_csv('fake_news_dataset/10,000_data_cleaned_fr.csv', index=False)

## Cleaning LIAR

In [10]:
df = pd.read_csv('liar_dataset/train.tsv', sep='\t', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report"""
10236,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview
10237,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate
10238,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...


In [11]:
print(len(df))
df[2] = df[2].apply(lambda x: cleantext.clean(x))
df = df.dropna(subset=[2])
print(df[2].head(5))
print(len(df))

10240
0    say anni list polit group support thirdtrimest...
1    declin coal start start natur ga took start be...
2    hillari clinton agre john mccain vote give geo...
3    health care reform legisl like mandat free sex...
4                     econom turnaround start end term
Name: 2, dtype: object
10240


In [12]:
df[2] = df[2].apply(lambda x: cleantext.clean_words(
    text=x,
    clean_all=True,
    extra_spaces=True,
    stemming=True,
    stopwords=True,
    stp_lang='english',
) if isinstance(x, str) else None)
df = df.dropna(subset=[2])

print(df[2].head(5))
print("\nArticles Left: ", len(df))

0    [say, anni, list, polit, group, support, third...
1    [declin, coal, start, start, natur, ga, took, ...
2    [hillari, clinton, agr, john, mccain, vote, gi...
3    [health, care, reform, legisl, like, mandat, f...
4               [econom, turnaround, start, end, term]
Name: 2, dtype: object

Articles Left:  10240


In [13]:
df.to_csv('liar_dataset/liar_dataset_cleaned.csv')

In [17]:

# Define headers
headers = ["number", "id", "type", "content", "speaker", "job_title", "state",
           "party", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire",
           "context", "yap"]

# Dictionary for label conversion
convert = {
    '': "skip", 'true': 1, 'half-true': 'skip', 'pants-fire': 0,
    'mostly-true': 1, 'barely-true': 0, 'false': 0
}
df = pd.read_csv("liar_dataset/liar_dataset_cleaned.csv") # ? Sometimes df fails when kept in memory so we load it just to be sure

# Ensure the dataframe has the correct number of columns
df = df.iloc[:, :len(headers)]

# Rename columns to match headers if needed
df.columns = headers[:len(df.columns)]

# Convert categories using the dictionary
# Use apply instead of map to handle potential list or complex data types
df['type'] = df['type'].apply(lambda x: convert.get(str(x).lower().strip(), 'skip'))

# Filter out 'skip' categories
df_filtered = df[df['type'] != 'skip']

# Count fake and reliable articles
fake_count = (df_filtered['type'] == 0).sum()
real_count = (df_filtered['type'] == 1).sum()

# Calculate and print ratios
total_count = len(df_filtered)
print(f"Fake ratio: {fake_count/total_count:.4f}")
print(f"Real ratio: {real_count/total_count:.4f}")

# Save filtered dataframe to CSV
df_filtered.to_csv('liar_dataset/liar_fr.csv', index=False)

# Print additional statistics
print(f"Total articles after filtering: {total_count}")
print(f"Fake articles: {fake_count}")
print(f"Reliable articles: {real_count}")

Fake ratio: 0.5523
Real ratio: 0.4477
Total articles after filtering: 8126
Fake articles: 4488
Reliable articles: 3638
