# Data clearner

## Dependencies

In [2]:
import pandas as pd
import cleantext
import csv
import sys
from tqdm import tqdm

## Cleaning

### Loading the raw data

In [None]:
# Set a chunk size (e.g., 1000 rows per chunk, adjust based on file size)
chunk_size = 1000
rows = 995000
chunks = []

# Use tqdm to show progress while iterating over chunks
with tqdm(desc="Loading csv file", total=rows) as pbar:
    for chunk in pd.read_csv("995,000_rows.csv",nrows=rows, chunksize=chunk_size):
        chunks.append(chunk)
        pbar.update(chunk_size)

# Combine all chunks into a single DataFrame
df = pd.concat(chunks)

### cleaning med clean

In [None]:
print(len(df))
df['content'] = df['content'].apply(lambda x: cleantext.clean(text=x) if isinstance(x, str) else None)
df = df.dropna(subset=['content'])
print(df['content'].head(5))
print(len(df))

### cleaning med clean_words

In [None]:
df['content'] = df['content'].apply(lambda x: cleantext.clean_words(
    text=x,
    clean_all=True,
    extra_spaces=True,
    stemming=True,
    stopwords=True,
    stp_lang='english',
) if isinstance(x, str) else None)
df = df.dropna(subset=['content'])

print(df['content'].head(5))
print("\nArticles Left: ", len(df))

### Data export

In [None]:
df.to_csv('data_cleaned.csv')

## Categorization

In [16]:
temp = [] 
convert = {
    '': "skip", 'conspiracy': 'fake', 'satire': 'fake', 'reliable': 'reliable',
    'unreliable': 'skip', 'junksci': 'fake', 'unknown': 'skip',
    'political': 'reliable', 'fake': 'fake', 'hate': 'fake',
    'clickbait': 'reliable', 'bias': 'reliable', 'rumor': 'fake'
}

fakeCount = 0 
realCount = 0 

csv.field_size_limit(sys.maxsize)

with open("data_cleaned.csv", "r") as src: 
    reader = csv.reader(src) 
    header = next(reader)
        
    for row in reader:  
        try:
            typeTest = typeTest
            # categories.add(type)  # Store the unique category
            
            typeTest = convert[typeTest]  # Convert category using the dictionary
                
            if typeTest == "fake":   # 0 for fake
                fakeCount += 1  # Increment fake count
            elif typeTest == "reliable": # 1 for reliable
                realCount += 1  # Increment real count
                
            if typeTest != "skip":  # Skip unwanted categories
                temp.append(row)  # Append valid rows to the list
                # print(typeTest)
        except Exception as e:
            # Skip the bad rows
            print("Error on row: ", row)
            continue
            

# Create a DataFrame from the filtered list, keeping the original column names
df = pd.DataFrame(temp, columns=header)

# Print the ratio of fake vs. real news articles and number of articles
print(f"Number of articles left: {len(temp)}")
print(f"Reliable articles: {realCount}")
print(f"Fake articles: {fakeCount}")
print(f"Ratio of fakes: {fakeCount/(fakeCount+realCount)}")
print(f"Ratio of reliable: {realCount/(fakeCount+realCount)}")

Error on row:  ['908192', '', 'Financials   7:50am EST BRIEF-Al Tawfeek Co for Financial Leasing Q3 profit rises \nNov 13 Al Tawfeek Company for Financial Leasing \n* Q3 consol total operating revenue EGP 117.2 million versus EGP 102.2 million year ago \n* Q3 consol net profit EGP 8.7 million versus EGP 8.4 million year ago Source: ( bit.ly/2etVQAu ) Further company coverage: Next In Financials', '2016-11-13T15:38:41.407+02:00', '2018-02-10 13:43:39.521661', '2018-02-10 13:43:39.521686', "['briefal', 'tawfeek', 'co', 'financ', 'lea', 'q', 'profit', 'rise']", '', '', '', '', '', '', 'webhose', '', '', '', '']
Number of articles left: 868346
Reliable articles: 573725
Fake articles: 294621
Ratio of fakes: 0.33928986832437763
Ratio of reliable: 0.6607101316756224


In [12]:
fakeCount, realCount

(294621, 573725)

### Export

In [18]:
df.to_csv('data_cleaned_fr.csv', index=False)

In [None]:
test = pd.read_csv('data_cleaned_fr.csv')

In [17]:
df.head()

Unnamed: 0.1,Unnamed: 1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0,732,7444726.0,nationalreview.com,reliable,http://www.nationalreview.com/node/152734/%E2%...,"['plu', 'one', 'articl', 'googl', 'plu', 'than...",2017-11-27T01:14:42.983556,2018-02-08 19:18:34.468038,2018-02-08 19:18:34.468066,Iran News Round Up,,,"['National Review', 'National Review Online', ...",,,,
1,1,1348,6213642.0,beforeitsnews.com,fake,http://beforeitsnews.com/economy/2012/06/the-c...,"['cost', 'best', 'senat', 'bank', 'committ', '...",2017-11-27T01:14:08.7454,2018-02-08 19:18:34.468038,2018-02-08 19:18:34.468066,The Cost Of The Best Senate Banking Committee ...,,,[''],,,,
2,2,7119,3867639.0,dailycurrant.com,fake,http://dailycurrant.com/2016/01/18/man-awoken-...,"['man', 'awoken', 'year', 'coma', 'commit', 's...",2017-11-27T01:14:21.395055,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,Man Awoken From 27-Year Coma Commits Suicide A...,,,[''],,,,
3,3,1518,9560791.0,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,"['julia', 'geist', 'ask', 'draw', 'pictur', 'c...",2018-02-11 00:46:42.632962,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Opening a Gateway for Girls to Enter the Compu...,,,"['Computers and the Internet', 'Women and Girl...",WHEN Julia Geist was asked to draw a picture o...,,,nytimes
4,4,9345,2059625.0,infiniteunknown.net,fake,http://www.infiniteunknown.net/2011/09/14/100-...,"['–', 'compil', 'studi', 'vaccin', 'danger', '...",2017-11-10T11:18:44.524042,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,100 Compiled Studies on Vaccine Dangers – Infi...,,,[''],,"Lymphoma, Hepatitis B, Immune System, Health, ...",,
