# Imports

In [None]:
import os
import re
import pandas as pd
from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Load dataset

In [None]:
# load the environment variables from the .env file
load_dotenv()
# get the dataset path from the environment variables
dataset_path = os.environ.get("DATASET_PATH")
# fail early with a clear message: a missing or empty variable would otherwise
# be formatted into the file path below ("None/...") and only surface as a
# confusing file-not-found error
if not dataset_path:
    raise RuntimeError(
        "DATASET_PATH is not set; define it in the environment or in the .env file"
    )
# load the dataset into the notebook: the first CSV column becomes the index
# and the column at position 1 is parsed as dates
df = pd.read_csv(f"{dataset_path}/article_info_V2.csv", index_col=0, parse_dates=[1])

# Cleaning

## Define cleaning functions

In [None]:
def clean_string(str):
    """Return a normalized copy of a text fragment.

    The text is lower-cased, every character that is neither a word
    character (letter, digit or underscore) nor whitespace is removed, and
    leading/trailing whitespace is trimmed. Whitespace inside the text is
    left as it is.

    NOTE: the parameter name shadows the builtin ``str``; it is kept so that
    existing callers passing it by keyword continue to work.
    """
    lowered = str.lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()

def parse_list_from_string(str):
    """Turn a comma-separated string into a list of cleaned entries.

    The input is split on every comma and each piece is passed through
    ``clean_string``. Pieces that are empty after cleaning (for example from
    an empty input, a trailing comma or a piece made only of punctuation) are
    left out of the result.

    Values without a ``split`` method -- such as ``None`` or the float NaN
    that pandas uses for missing cells -- yield an empty list instead of
    raising ``AttributeError``.

    NOTE: the parameter name shadows the builtin ``str``; it is kept so that
    existing callers passing it by keyword continue to work.
    """
    try:
        # split the string on each comma
        raw_list = str.split(",")
    except AttributeError:
        # missing values (None / NaN) cannot be split: treat them as "no entries"
        return []
    # clean every piece and keep only the non-empty results
    cleaned = (clean_string(piece) for piece in raw_list)
    return [entry for entry in cleaned if entry]

## Drop unused columns

In [None]:
# remove the three columns this notebook treats as unused (author, type and
# keywords), modifying the dataframe in place
df.drop(["Author", "Type", "Keywords"], axis="columns", inplace=True)

## Drop empty rows

In [None]:
# discard, in place, every row whose Date value is missing; missing values in
# other columns do not cause a row to be removed
df.dropna(axis="index", how="any", subset=["Date"], inplace=True)

## Remove duplicate articles

In [None]:
# two rows count as the same article when both their Title and their Date
# match; only the first occurrence of each such pair is kept (in place)
df.drop_duplicates(
    inplace=True,
    keep="first",
    subset=["Title", "Date"],
)

## Merge datasets

In [None]:
# load the external articles: the first CSV column becomes the index and the
# column at position 1 is parsed as dates
external_df = pd.read_csv(
    f"{dataset_path}/other-news-articles.csv",
    index_col=0,
    parse_dates=[1],
)
# the url column is discarded before merging
external_df = external_df.drop(columns=["Url"])
# append the external rows below the main ones and renumber all rows 0..n-1
# NOTE(review): these rows are added after the empty-date and duplicate
# filters have already run, so those filters are not applied to them --
# confirm this is intended
df = pd.concat([df, external_df], ignore_index=True)
# the external dataframe is no longer needed once merged
del external_df

## Parse the string lists

In [None]:
# replace every raw comma-separated value in the Tags column with the list of
# strings that parse_list_from_string produces for it
df["Tags"] = df["Tags"].map(parse_list_from_string)