# Import packages

In [None]:
import os
import re
import pandas as pd
import spacy
import sklearn
from dotenv import load_dotenv

# Load data

In [None]:
# Load the "en_core_web_lg" spaCy pipeline once; `nlp` is used later to process article text
nlp = spacy.load("en_core_web_lg")

In [None]:
# Load the environment variables (e.g. from a .env file)
load_dotenv()

# Get the paths to the data
data_dir = os.getenv("DATA_DIR")
if data_dir is None:
    # Fail early with a clear message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise further down
    raise RuntimeError(
        "The DATA_DIR environment variable is not set; "
        "define it in the environment or in a .env file."
    )
articles_dir = os.path.join(data_dir, "articles")

# Load the main dataset (first column is the index, column 1 is parsed as dates)
df = pd.read_csv(
    os.path.join(data_dir, "article_info_V2.csv"),
    index_col=0,
    parse_dates=[1],
)

# Load the external dataset (same layout, but semicolon-separated)
external_df = pd.read_csv(
    os.path.join(data_dir, "other-articles.csv"),
    index_col=0,
    parse_dates=[1],
    sep=";",
)

# Clear the data directory variable; only articles_dir is used from here on
del data_dir

# Cleaning

In [None]:
# Main dataset: drop the Author, Type and Keywords columns, discard rows
# whose Date is missing, and renumber the remaining rows from zero
df = (
    df.drop(columns=["Author", "Type", "Keywords"])
    .dropna(subset=["Date"])
    .reset_index(drop=True)
)

# External dataset: the Url column is dropped as well
external_df = external_df.drop(columns=["Url"])

# Merging datasets

In [None]:
# Append the external rows to the main dataset; ignore_index=True renumbers
# the combined rows from zero in the same step
df = pd.concat([df, external_df], ignore_index=True)

# The external dataset is no longer needed on its own
del external_df

# Remove duplicates

In [None]:
# Build the de-duplicated frame first: rows sharing both Title and Date are
# duplicates, and only the first occurrence of each is kept
deduplicated = df.drop_duplicates(subset=["Title", "Date"], keep="first")

# Report how many rows were dropped
print(f"Removed {len(df) - len(deduplicated)} duplicate entries.")

# Continue with the de-duplicated data and discard the temporary name
df = deduplicated
del deduplicated

# Parse target

In [None]:
# Define functions to parse target
def clean_string(str):
    """Normalise a single raw string.

    The text is lowercased, every character that is neither a word character
    (letter, digit or underscore) nor whitespace is removed, and leading and
    trailing whitespace is trimmed.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so the
    # function's signature stays unchanged.
    return re.sub(r"[^\w\s]", "", str.lower()).strip()

def parse_tags(str):
    """Turn a comma-separated tag string into a list of cleaned tags.

    Each comma-separated piece is normalised with ``clean_string``. Pieces
    that are empty after cleaning (for example from an empty cell, a trailing
    comma or a punctuation-only tag) are dropped. Values that have no
    ``split`` method, such as the float NaN pandas uses for missing cells or
    ``None``, yield an empty list instead of raising.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so the
    # function's signature stays unchanged.
    try:
        raw_list = str.split(",")
    except AttributeError:
        # Not a string (e.g. a missing value): there are no tags to parse
        return []
    cleaned_tags = (clean_string(raw_tag) for raw_tag in raw_list)
    # Skip tags that turned out empty so "" never appears as a tag
    return [tag for tag in cleaned_tags if tag]

In [None]:
# Replace each raw comma-separated Tags value with its parsed list of tags
df["Tags"] = df["Tags"].map(parse_tags)

In [None]:
# The parsing helpers are no longer needed; remove them from the namespace
del parse_tags
del clean_string

# Load articles

In [None]:
# Article loading function
def load_article(title):
    """Read the text of the article stored as ``<title>.txt`` in ``articles_dir``.

    Returns the file contents as a string, or ``None`` when the file cannot
    be opened (for example because no file exists for this title).
    """
    # Build the path of the article from its title
    file_path = os.path.join(articles_dir, f"{title}.txt")
    try:
        # Attempt to open the article
        file = open(file_path, "r", encoding="utf-8")
    except (OSError, ValueError):
        # OSError: the file is missing or cannot be opened.
        # ValueError: the title produces an invalid path (e.g. an embedded NUL).
        # Catching these instead of using a bare except keeps
        # KeyboardInterrupt and SystemExit from being swallowed.
        return None
    # The context manager closes the file even if reading raises
    with file:
        return file.read()

In [None]:
# Look up the text of every article by its title (None when it could not be opened)
df["Contents"] = df["Title"].map(load_article)

# The loader is not needed after this point
del load_article

In [None]:
# Keep only the rows whose article text was loaded and renumber them from zero
with_contents = df.dropna(subset=["Contents"]).reset_index(drop=True)

# Report how many rows had no article contents
print(f"Removed {len(df) - len(with_contents)} entries without article contents.")

# Continue with the filtered data and discard the temporary name
df = with_contents
del with_contents

# Named entity recognition with spaCy

In [None]:
# Run the spaCy pipeline on the contents of the first article (row label 0)
first_contents = df["Contents"][0]
doc = nlp(first_contents)

# Collect the entities spaCy found in that article and display them
named_entities = doc.ents
print(named_entities)
    