# Standardize and filter article dates

### Import required libraries

In [2]:
import pandas as pd

### Skynews
Convert the date format and keep only the articles published within our time frame of interest.

In [3]:
def convert_date(date_str):
    """Convert a raw article date string to the form 'YYYY-MM-DD'.

    Everything after the last comma (e.g. a trailing timezone label) is
    discarded, and the remainder is parsed with the format
    '%A %d %B %Y %H:%M' (weekday name, day, month name, year, hour:minute).

    Parameters
    ----------
    date_str : str
        Raw date string, for example 'Friday 06 January 2023 14:30, UK'.

    Returns
    -------
    str or None
        The date formatted as 'YYYY-MM-DD', or None when the input is not a
        string or does not match the expected format.
    """
    # Missing values (NaN/None) have no .rsplit; treat them as unparseable
    if not isinstance(date_str, str):
        return None

    # Remove the timezone from the date string
    date_part = date_str.rsplit(',', 1)[0].strip()

    # Convert the date string to a datetime object (NaT if it does not match)
    date = pd.to_datetime(date_part, format='%A %d %B %Y %H:%M', errors='coerce')

    # NaT does not support strftime, so bail out before formatting
    if pd.isna(date):
        return None

    # Format the datetime object as a string in the format 'YYYY-MM-DD'
    return date.strftime('%Y-%m-%d')

In [9]:
# Configuration: source file, destination file and the publication period to keep
input_path = "../../../../data/raw/articles/skynews/2023_articles_skynews_raw.csv"  # can be modified
output_path = "2023_articles_skynews_raw_date_filtered.csv"  # can be modified

year_of_interest = 2023  # can be modified
month_of_interest = 1  # can be modified

# Read the semicolon-separated raw export and report its size
df = pd.read_csv(input_path, sep=';')
print("All articles:", df.shape[0])

# Discard rows without a date, then turn the remaining raw date strings into
# datetimes (convert_date normalises each one to 'YYYY-MM-DD' first)
df = df.dropna(subset=['Date']).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'].apply(convert_date))
)

# Keep only the articles published in the requested year and month
published_in_period = (
    (df['Date'].dt.year == year_of_interest)
    & (df['Date'].dt.month == month_of_interest)
)
df = df[published_in_period]
print("Articles with relevant publishing date:", df.shape[0])

# Write the filtered articles, again semicolon-separated
df.to_csv(output_path, sep=';')

All articles 3112
Articles with relevant publishing date 2505


### Guardian
All Guardian articles we have were published within our time frame of interest, so we only need to convert the date format.

In [21]:
# Locations of the raw Guardian export and of the converted output
input_path = "../../../../data/raw/articles/guardian/2023_articles_guardian_raw.csv"  # can be modified
output_path = "2023_articles_guardian_raw_date_filtered.csv"  # can be modified

# Read the semicolon-separated raw articles
df = pd.read_csv(input_path, sep=';')

# Parse 'Date' as datetimes and keep only the calendar date (time of day is dropped)
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Write the result, again semicolon-separated
df.to_csv(output_path, sep=';')