# Import dependencies

In [92]:
import os
import re
# python -m pip install pandas --user
import pandas as pd
# python -m pip install matplotlib --user
import matplotlib.pyplot as plt
# python -m pip install python-dotenv --user
from dotenv import load_dotenv

# Loading the dataset

In [93]:
# Load the environment variables from the .env file
load_dotenv()
# Get the dataset path from the environment variables
dataset_path = os.environ.get("DATASET_PATH")
# Fail early with a clear message: an unset variable would otherwise be
# formatted into the file path as the literal text "None" and surface as a
# confusing file-not-found error.
if not dataset_path:
    raise RuntimeError(
        "DATASET_PATH is not set; define it in the environment or in the .env file"
    )
# Load the dataset into the notebook: the first CSV column is used as the
# index and the column at position 1 is parsed as datetimes.
df = pd.read_csv(f"{dataset_path}/article_info_V2.csv", index_col=0, parse_dates=[1])

# Cleaning the dataset

### Define cleaning functions

In [94]:
def clean_string(str):
    """Return a normalized copy of a string.

    The text is lowercased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed,
    and leading/trailing whitespace is trimmed.

    Args:
        str: The text to normalize.

    Returns:
        The normalized text.
    """
    # Lowercase first, then drop punctuation/symbols in a single pass
    without_punctuation = re.sub(r"[^\w\s]", "", str.lower())
    # Trim whitespace left at either end
    return without_punctuation.strip()

def parse_list_from_string(str):
    """Turn a comma-separated string into a list of cleaned items.

    Args:
        str: Text containing items separated by commas.

    Returns:
        A list with one entry per comma-separated piece, each passed
        through clean_string.
    """
    # Split on commas and normalize each piece
    return [clean_string(piece) for piece in str.split(",")]

### Change columns of the dataset

In [95]:
# Drop rows that are missing a value in any column processed below.
# A bare `df.dropna()` returns a new frame and leaves `df` untouched, so the
# result has to be assigned. The drop is limited to the columns this cell
# works on: Date must be present, and the string helpers applied to the
# other columns cannot handle missing (NaN) values.
# `.copy()` gives an independent frame so the column assignments below do not
# write into a view of the originally loaded data.
df = df.dropna(subset=["Date", "Author", "Type", "Tags", "Keywords"]).copy()

# Replace empty strings with unknown.
# The result is assigned back instead of using `inplace=True` on the selected
# column: that form is chained assignment and may not update the frame.
df["Author"] = df["Author"].replace("", "Unknown")

# Clean the author strings
df["Author"] = df["Author"].apply(clean_string)

# Extract the types as a list from the string values
df["Type"] = df["Type"].apply(parse_list_from_string)

# Extract the tags as a list from the string values
df["Tags"] = df["Tags"].apply(parse_list_from_string)

# Extract the keywords as a list from the string values
df["Keywords"] = df["Keywords"].apply(parse_list_from_string)

### Show the dataframe

In [97]:
# Preview the first five rows of the dataframe in the notebook
df.head(n=5)

Unnamed: 0,Date,Type,Title,Tags,Keywords,Author
0,2021-12-15 12:30:00+00:00,[news],"The Battle to Save Brazil's Freshwater Giant, ...","[brazil, coverage of illegal fishing, wildlife...","[arapaima, giant, river, freshwater, save, ton...",alessandro ford
1,2021-12-14 19:13:22+00:00,[news],Did Anti-Drug Crusade Lead to Haiti President'...,"[contraband, zetas, haiti, cocaine, el salvado...","[presidents, trafficking, haiti, according, le...",parker asmann
2,2021-12-14 14:18:15+00:00,[news],Colombian Prosecutors Dismiss Memo Fantasma Li...,"[invisibles, while scrambling to save himself ...","[prosecutors, complaint, insight, libel, aceve...",insight crime
3,2021-12-13 19:36:23+00:00,[news],Could US Anti-Violence Models Work in Latin Am...,"[chepe luna, mexico, security policy, while sc...","[mexico, program, approach, antiviolence, mode...",kai bernierchen
4,2021-12-13 14:27:23+00:00,[news],How Mexican Cartels Settled in Canada,"[brazil, el paisa, guatemala personalities, me...","[organized, group, settled, north, criminal, c...",alessandro ford


# Save the dataframe to a new file

In [98]:
# Persist the dataframe as a CSV file in the dataset directory
df.to_csv(path_or_buf=f"{dataset_path}/article_info_V3.csv")