# Data clearner

## Dependencies

In [10]:
import pandas as pd
import cleantext
import csv
import sys
from tqdm import tqdm

## Cleaning

### Loading the raw data

In [9]:
# Set a chunk size (e.g., 1000 rows per chunk, adjust based on file size)
chunk_size = 1000
rows = 995000
chunks = []

# Use tqdm to show progress while iterating over chunks
with tqdm(desc="Loading csv file", total=rows) as pbar:
    for chunk in pd.read_csv("995,000_rows.csv",nrows=rows, chunksize=chunk_size):
        chunks.append(chunk)
        pbar.update(chunk_size)

# Combine all chunks into a single DataFrame
df = pd.concat(chunks)

Loading csv file: 100%|██████████| 995000/995000 [00:22<00:00, 43346.19it/s]


### cleaning med clean

In [4]:
print(len(df))
df['content'] = df['content'].apply(lambda x: cleantext.clean(text=x) if isinstance(x, str) else None)
df = df.dropna(subset=['content'])
print(df['content'].head(5))
print(len(df))

995000


KeyboardInterrupt: 

### cleaning med clean_words

In [None]:
df['content'] = df['content'].apply(lambda x: cleantext.clean_words(
    text=x,
    clean_all=True,
    extra_spaces=True,
    stemming=True,
    stopwords=True,
    stp_lang='english',
) if isinstance(x, str) else None)
df = df.dropna(subset=['content'])

print(df['content'].head(5))
print("\nArticles Left: ", len(df))

### Data export

In [None]:
df.to_csv('data_cleaned.csv')

## Categorization

In [22]:
temp = [] 
convert = {
    '': "skip", 'conspiracy': 'fake', 'satire': 'fake', 'reliable': 'reliable',
    'unreliable': 'skip', 'junksci': 'fake', 'unknown': 'skip',
    'political': 'reliable', 'fake': 'fake', 'hate': 'fake',
    'clickbait': 'reliable', 'bias': 'reliable', 'rumor': 'fake'
}

fakeCount = 0 
realCount = 0 

csv.field_size_limit(sys.maxsize)

with open("data_cleaned.csv", "r") as src: 
    reader = csv.reader(src) 
    header = next(reader)
        
    for row in reader:  
        try:
            row[4] = convert[row[4]]  # Convert category using the dictionary
                
            if row[4] == "fake":   # 0 for fake
                fakeCount += 1  # Increment fake count
            elif row[4] == "reliable": # 1 for reliable
                realCount += 1  # Increment real count
                
            if row[4] != "skip":  # Skip unwanted categories
                temp.append(row)  # Append valid rows to the list
                # print(row[4])
        except Exception as e:
            # Skip the bad rows
            print("Error on row: ", row)
            continue
            

# Create a DataFrame from the filtered list, keeping the original column names
df = pd.DataFrame(temp, columns=header)

# Print the ratio of fake vs. real news articles and number of articles
print(f"Number of articles left: {len(temp)}")
print(f"Reliable articles: {realCount}")
print(f"Fake articles: {fakeCount}")
print(f"Ratio of fakes: {fakeCount/(fakeCount+realCount)}")
print(f"Ratio of reliable: {realCount/(fakeCount+realCount)}")

Error on row:  ['908192', '', 'Financials   7:50am EST BRIEF-Al Tawfeek Co for Financial Leasing Q3 profit rises \nNov 13 Al Tawfeek Company for Financial Leasing \n* Q3 consol total operating revenue EGP 117.2 million versus EGP 102.2 million year ago \n* Q3 consol net profit EGP 8.7 million versus EGP 8.4 million year ago Source: ( bit.ly/2etVQAu ) Further company coverage: Next In Financials', '2016-11-13T15:38:41.407+02:00', '2018-02-10 13:43:39.521661', '2018-02-10 13:43:39.521686', "['briefal', 'tawfeek', 'co', 'financ', 'lea', 'q', 'profit', 'rise']", '', '', '', '', '', '', 'webhose', '', '', '', '']
Number of articles left: 868346
Reliable articles: 573725
Fake articles: 294621
Ratio of fakes: 0.33928986832437763
Ratio of reliable: 0.6607101316756224


In [6]:
import pandas as pd

# Define the conversion dictionary
convert = {
    '': "skip", 'conspiracy': 'fake', 'satire': 'fake', 'reliable': 'reliable',
    'unreliable': 'skip', 'junksci': 'fake', 'unknown': 'skip',
    'political': 'reliable', 'fake': 'fake', 'hate': 'fake',
    'clickbait': 'reliable', 'bias': 'reliable', 'rumor': 'fake'
}

# Read the CSV file
df = pd.read_csv("data_cleaned.csv")

# Create a new column with converted categories
df['processed_category'] = df.iloc[:, 4].map(convert)

# Filter out 'skip' categories
df_filtered = df[df['processed_category'] != 'skip']

# Count fake and reliable articles
fake_count = (df_filtered['processed_category'] == 'fake').sum()
real_count = (df_filtered['processed_category'] == 'reliable').sum()
total_count = len(df_filtered)

# Print statistics
print(f"Number of articles left: {total_count}")
print(f"Reliable articles: {real_count}")
print(f"Fake articles: {fake_count}")
print(f"Ratio of fakes: {fake_count/total_count:.4f}")
print(f"Ratio of reliable: {real_count/total_count:.4f}")

# Optional: If you want to reset the index after filtering
df_filtered = df_filtered.reset_index(drop=True)

  df = pd.read_csv("data_cleaned.csv")


Number of articles left: 916122
Reliable articles: 573725
Fake articles: 294621
Ratio of fakes: 0.3216
Ratio of reliable: 0.6263


In [12]:
fakeCount, realCount

(294621, 573725)

### Export

In [18]:
df.to_csv('data_cleaned_fr.csv', index=False)

## Cleaning LIAR

In [15]:
df = pd.read_csv('liar_dataset/train.tsv', sep='\t', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report"""
10236,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview
10237,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate
10238,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...


In [17]:
print(len(df))
df[2] = df[2].apply(lambda x: cleantext.clean(text=x) if isinstance(x, str) else None)
df = df.dropna(subset=[2])
print(df[2].head(5))
print(len(df))

10240
0    say anni list polit group support thirdtrimest...
1    declin coal start start natur ga took start be...
2    hillari clinton agre john mccain vote give geo...
3    health care reform legisl like mandat free sex...
4                     econom turnaround start end term
Name: 2, dtype: object
10240


In [18]:
df[2] = df[2].apply(lambda x: cleantext.clean_words(
    text=x,
    clean_all=True,
    extra_spaces=True,
    stemming=True,
    stopwords=True,
    stp_lang='english',
) if isinstance(x, str) else None)
df = df.dropna(subset=[2])

print(df[2].head(5))
print("\nArticles Left: ", len(df))

0    [say, anni, list, polit, group, support, third...
1    [declin, coal, start, start, natur, ga, took, ...
2    [hillari, clinton, agr, john, mccain, vote, gi...
3    [health, care, reform, legisl, like, mandat, f...
4               [econom, turnaround, start, end, term]
Name: 2, dtype: object

Articles Left:  10240


In [20]:
df.to_csv('liar_dataset_cleaned.csv')