# Data cleaner

## Dependencies

In [17]:
import pandas as pd
import cleantext
import csv
import sys

## Cleaning

### Loading the raw data

In [18]:
# Read the raw article dump into a DataFrame and preview the first five
# entries of the 'content' column.
df = pd.read_csv(filepath_or_buffer='data_raw.csv')
print(df['content'].head())

0    Sometimes the power of Christmas will make you...
1    AWAKENING OF 12 STRANDS of DNA – “Reconnecting...
2    Never Hike Alone: A Friday the 13th Fan Film U...
3    When a rare shark was caught, scientists were ...
4    Donald Trump has the unnerving ability to abil...
Name: content, dtype: object


### Cleaning with clean

In [19]:
def _clean_text(raw):
    """Run one article body through cleantext.clean with its default options."""
    return cleantext.clean(text=raw)


# Overwrite the 'content' column with the cleaned text, then preview the
# first five entries.
df['content'] = df['content'].apply(_clean_text)
print(df['content'].head())

0    sometim power christma make wild wonder thing ...
1    awaken strand dna – “reconnect you” movi reade...
2    never hike alon friday th fan film usa min fan...
3    rare shark caught scientist left blunder answe...
4    donald trump unnerv abil abil creat realiti co...
Name: content, dtype: object


### Cleaning with clean_words

In [20]:
def _to_clean_words(article_text):
    """Run one article body through cleantext.clean_words.

    The printed preview shows that each entry becomes a list of tokens.
    """
    return cleantext.clean_words(
        text=article_text,
        clean_all=True,
        extra_spaces=True,
        stemming=True,
        stopwords=True,
        stp_lang='english',
    )


# NOTE(review): the preview printed after the cleantext.clean step already
# looks stemmed and stop-word free - confirm that this second pass is intended.
df['content'] = df['content'].apply(_to_clean_words)

print(df['content'].head())

0    [sometim, power, christma, make, wild, wonder,...
1    [awaken, strand, dna, –, “reconnect, you”, mov...
2    [never, hike, alon, friday, th, fan, film, usa...
3    [rare, shark, caught, scientist, left, blunder...
4    [donald, trump, unnerv, abil, abil, creat, rea...
Name: content, dtype: object


### Data export

In [None]:
# Persist the cleaned frame. The row index is written as the first CSV column
# (index=True is the pandas default, spelled out here on purpose): the
# categorization step reads this file by column position, so dropping the
# index would shift every column it looks at.
df.to_csv('data_cleaned.csv', index=True)

## Categorization

In [None]:
# Collapse the fine-grained source labels into "fake" / "reliable" and drop
# every article whose label maps to "skip".
temp = []           # rows kept after filtering, label column already converted
categories = set()  # every distinct raw label encountered in the file
convert = {
    '': "skip", 'conspiracy': 'fake', 'satire': 'fake', 'reliable': 'reliable',
    'unreliable': 'skip', 'junksci': 'fake', 'unknown': 'skip',
    'political': 'reliable', 'fake': 'fake', 'hate': 'fake',
    'clickbait': 'reliable', 'bias': 'reliable', 'rumor': 'fake'
}

fakeCount = 0
realCount = 0

# Raise the csv module's per-field size limit so long article bodies can be
# read. sys.maxsize does not fit in a C long on every platform (the call then
# raises OverflowError), so back off until the value is accepted.
field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_limit)
        break
    except OverflowError:
        field_limit //= 10

# newline="" is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly; utf-8 is the encoding DataFrame.to_csv
# uses by default, whereas open() would otherwise fall back to the locale.
with open("data_cleaned.csv", "r", newline="", encoding="utf-8") as src:
    reader = csv.reader(src)
    header = next(reader)

    for row in reader:
        label = row[4]         # raw category label: column index 4 (fifth CSV column)
        categories.add(label)  # remember every raw label, including skipped ones

        # Labels missing from the mapping are treated like the other
        # unusable ones ('' / 'unknown') instead of aborting with KeyError.
        row[4] = convert.get(label, "skip")

        if row[4] == "fake":
            fakeCount += 1
        elif row[4] == "reliable":
            realCount += 1

        if row[4] != "skip":   # keep only rows that ended up fake or reliable
            temp.append(row)

# Create a DataFrame from the filtered list, keeping the original column names
df = pd.DataFrame(temp, columns=header)

# Print the number of articles and the ratio of fake vs. real news articles
print(f"Number of articles left: {len(temp)}")
print(f"Reliable articles: {realCount}")
print(f"Fake articles: {fakeCount}")

labelled = fakeCount + realCount
if labelled:
    print(f"Ratio of fakes: {fakeCount/labelled}")
    print(f"Ratio of reliable: {realCount/labelled}")
else:
    # Nothing survived the filter: the ratios would be a division by zero.
    print("No fake or reliable articles found; ratios are undefined.")

Number of articles left: 232
Reliable articles: 33
Fake articles: 199
Ratio of fakes: 0.8577586206896551
Ratio of reliable: 0.14224137931034483


### Export

In [None]:
# Write the filtered, relabelled articles to disk without adding another
# index column.
df.to_csv(path_or_buf='data_cleaned_fr.csv', index=False)