# Import dependencies

In [None]:
import os
import re
# python -m pip install pandas --user
import pandas as pd
# python -m pip install matplotlib --user
import matplotlib.pyplot as plt
# python -m pip install python-dotenv --user
from dotenv import load_dotenv

# Loading the dataset

In [None]:
# Load the environment variables from the .env file
load_dotenv()
# Get the dataset path from the environment variables
dataset_path = os.environ.get("DATASET_PATH")
# Fail early with a clear message: without this check a missing variable
# would be formatted into the path below as the literal text "None".
if dataset_path is None:
    raise RuntimeError(
        "DATASET_PATH is not set; define it in the environment or in the .env file"
    )
# Load the dataset into the notebook; CSV column 0 becomes the index and
# CSV column 1 is parsed as dates
df = pd.read_csv(f"{dataset_path}/article_info_V2.csv", index_col=0, parse_dates=[1])

# Cleaning the dataset

### Define cleaning methods

In [None]:
def clean_string(str):
    """Return a normalized copy of the given string.

    The text is lowercased, every character that is neither a word
    character (letter, digit, underscore) nor whitespace is deleted, and
    leading/trailing whitespace is trimmed.
    """
    lowered = str.lower()
    # Delete punctuation and symbols; word characters and whitespace are kept
    without_symbols = re.sub(r"[^\w\s]", "", lowered)
    # Trim whitespace from both ends
    return without_symbols.strip()

def parse_list_from_string(str):
    """Split a comma-separated string into a list of cleaned items.

    Each piece between commas is passed through clean_string; the pieces
    keep their original order and empty pieces are kept as empty strings.
    """
    return [clean_string(piece) for piece in str.split(",")]

### Change columns of the dataset

In [None]:
# Drop every row that has a missing value in any column.
# dropna() returns a new dataframe, so the result has to be assigned back;
# calling it without assignment leaves df unchanged.
df = df.dropna()

# Replace empty strings with unknown.
# The result is assigned back to the column instead of using inplace=True on
# a column selection, which is not guaranteed to update df itself.
df["Author"] = df["Author"].replace("", "Unknown")

# Clean the author strings
df["Author"] = df["Author"].apply(clean_string)

# Extract the types as a list from the string values
df["Type"] = df["Type"].apply(parse_list_from_string)

# Extract the tags as a list from the string values
df["Tags"] = df["Tags"].apply(parse_list_from_string)

# Extract the keywords as a list from the string values
df["Keywords"] = df["Keywords"].apply(parse_list_from_string)

### Show the dataframe

In [None]:
# Preview the first five rows of the cleaned dataframe
df.head(n=5)

# Save the dataframe to a new file

In [None]:
# Write the cleaned dataframe as CSV next to the original dataset
output_path = f"{dataset_path}/article_info_V3.csv"
df.to_csv(output_path)