In [3]:
# import src.train
# import src.evaluate
# import src.dataset

import csv
import glob
import json
import os

import pandas as pd
from tqdm import tqdm


In [4]:
# We need to get the dataset first.
# Create a `data` folder and put the `jsons` folder from this repository inside it (so the files end up in data/jsons/):
# https://github.com/ramybaly/Article-Bias-Prediction/tree/main/data/jsons

# First, we convert the JSON files into a single CSV file => data/raw_data.csv
def convert_jsons_to_csv(json_dir='data/jsons', output_path='data/raw_data.csv'):
    """Flatten a folder of per-article JSON files into a single CSV file.

    Every ``*.json`` file directly inside ``json_dir`` is parsed as one JSON
    object and written as one CSV row. Only the columns listed in ``fields``
    are kept: a key that is missing from a file is written as an empty
    string, and keys not listed in ``fields`` are dropped.

    Files are processed in sorted path order. ``glob.glob`` returns its
    matches in arbitrary, platform-dependent order, and the row order of the
    CSV determines which rows a seeded random sample of it selects, so an
    unsorted listing would make that sampling differ between machines.

    Args:
        json_dir: Directory that contains the ``*.json`` files.
        output_path: Destination CSV path. Its parent directory is created
            when missing; an existing file is overwritten.

    Returns:
        None. When no JSON files are found the CSV holds only the header row.
    """
    # Fields we want to extract (also the CSV column order)
    fields = ['topic', 'source', 'url', 'title', 'date', 'authors', 'content', 'source_url', 'bias_text', 'bias']

    # Sort so the row order is deterministic regardless of filesystem listing order
    json_files = sorted(glob.glob(os.path.join(json_dir, '*.json')))

    # Make sure the destination folder exists before opening the file for writing
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # newline='' lets the csv module control line endings itself
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()

        for json_file in tqdm(json_files, desc="Converting JSON files to CSV"):
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Keep only the wanted fields; absent keys become empty cells
            writer.writerow({field: data.get(field, '') for field in fields})

In [5]:
# Convert every JSON file into data/raw_data.csv.
# Took about 10 seconds for me.
convert_jsons_to_csv()

Converting JSON files to CSV: 100%|██████████| 37554/37554 [00:11<00:00, 3316.82it/s]


In [14]:
# Load the converted CSV into a pandas DataFrame to see what we have.
# pandas is imported as `pd` in the imports cell at the top of the notebook,
# so every dependency is visible in one place and a fresh "Run All" works.
df = pd.read_csv('data/raw_data.csv')

# Preview the first five rows
df.head()


Unnamed: 0,topic,source,url,title,date,authors,content,source_url,bias_text,bias
0,terrorism,New York Times - News,http://www.nytimes.com/2016/09/20/nyregion/ahm...,"Bomb Suspect Changed After Trip Abroad, Friend...",2016-09-20,N. R. Kleinfield,"Besides his most recent trip to Quetta , Mr. R...",www.nytimes.com,left,0
1,supreme_court,Vox,https://www.vox.com/policy-and-politics/2018/9...,Why Susan Collins claims she’s being bribed ov...,2018-09-12,"Emily Stewart, Terry Nguyen, Rebecca Jennings,...",Is Maine Republican Sen. Susan Collins being b...,www.vox.com,left,0
2,education,Ezra Klein,http://www.npr.org/blogs/thetwo-way/2014/05/06...,Poll: Prestigious Colleges Won't Make You Happ...,2014-05-06,Anya Kamenetz,Poll : Prestigious Colleges Wo n't Make You Ha...,www.npr.org,left,0
3,us_house,Breitbart News,http://www.breitbart.com/big-government/2017/0...,Paul Ryan Reportedly Says No Chance for Border...,2017-09-12,Ian Mason,"House Speaker Paul Ryan , at a private dinner ...",www.breitbart.com,right,2
4,white_house,Guest Writer - Left,https://www.cnn.com/2019/07/11/politics/donald...,OPINION: Trump seeking change of legal fortune...,2019-07-11,Analysis Stephen Collinson,( CNN ) President Donald Trump has reason to h...,www.cnn.com,left,0


In [15]:
# Keep just the article text and the two bias columns (bias_text and bias)
keep_columns = ['content', 'bias_text', 'bias']
df = df[keep_columns]

# Count the missing values in each remaining column
df.isna().sum()


content      0
bias_text    0
bias         0
dtype: int64

In [16]:
# We will make a smaller dataset for testing
SAMPLE_SIZE = 1000  # number of articles to keep in the small dataset
RANDOM_SEED = 42    # fixed seed so the same rows are drawn on every run

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling is without replacement), so cap the request at the frame length.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)

# Let's save this smaller dataset (index=False: do not write the row index as a column)
df.to_csv('data/small_data.csv', index=False)




In [None]:
# We will apply preprocessing steps to the content column (e.g., removing stopwords, lemmatizing, etc.)


# WIP — this step might not be needed.

In [None]:
# If you do not have a GPU, get an already-trained model from me instead of training one.

# If you have a decent GPU, open your terminal in the folder that contains
# src/ and data/ and run the commands below. They are shell commands, not
# Python, so they are kept as comments: left as bare lines in a code cell
# they raise a SyntaxError and break "Run All".

# Train:
#   python src/main.py --mode train --data_path data/small_data.csv --batch_size 10 --learning_rate 2e-5 --epochs 10

# Then, for inference, run (replace the <...> placeholder with your own folder name):
#   python src/main.py --mode eval --model_path .\outputs\<your latest folder for the model>\best_model.pt