# Import packages

In [95]:
import os
import re
import spacy
import pandas as pd
import geonamescache
from dotenv import load_dotenv

# Load data

In [96]:
# Load the spaCy pipeline "en_core_web_sm" once and keep it in `nlp`
nlp = spacy.load("en_core_web_sm")

In [97]:
# Load the environment variables
load_dotenv()

# Get the paths to the data. Fail fast with a readable message when DATA_DIR
# is missing: os.getenv would return None and os.path.join(None, ...) would
# then raise an opaque TypeError.
data_dir = os.getenv("DATA_DIR")
if not data_dir:
    raise RuntimeError("DATA_DIR is not set; define it in the environment or in a .env file.")
articles_dir = os.path.join(data_dir, "articles")

# Load the main dataset (first column becomes the index, column 1 is parsed as dates)
df = pd.read_csv(f"{data_dir}/article_info_V2.csv", index_col=0, parse_dates=[1])

# Load the external dataset (same layout, but semicolon-separated)
external_df = pd.read_csv(f"{data_dir}/other-articles.csv", index_col=0, parse_dates=[1], sep=";")

# Cleaning

In [98]:
# Main dataset: discard the unused columns, drop the rows that have no date,
# and renumber the remaining rows from zero
df = (
    df.drop(columns=["Author", "Type", "Keywords"])
    .dropna(subset=["Date"])
    .reset_index(drop=True)
)

# External dataset: discard the url column
external_df = external_df.drop(columns=["Url"])

# Merging datasets

In [99]:
# Append the external rows to the main dataset and renumber the index from zero
df = pd.concat([df, external_df]).reset_index(drop=True)

# The external dataset is not needed on its own any more
del external_df

# Remove duplicates

In [100]:
# Remember the row count so the number of dropped rows can be reported
entries_count = len(df)

# Keep only the first occurrence of every (title, date) pair
df = df.drop_duplicates(subset=["Title", "Date"], keep="first")

# Report how many rows were dropped, then discard the counter
print(f"Removed {entries_count - len(df)} duplicate entries.")
del entries_count

Removed 3 duplicate entries.


# Parse target

In [101]:
# Define functions to parse target
def clean_string(str):
    """Normalise one raw tag string.

    The text is lower-cased, every character that is neither a word
    character (letter, digit, underscore) nor whitespace is removed, and
    leading/trailing whitespace is stripped.
    """
    # NOTE: the parameter shadows the built-in `str`; the name is kept so the
    # function's signature stays the same for existing callers.
    lowered = str.lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()

def parse_tags(str):
    """Turn a comma-separated tag string into a list of cleaned tags.

    Args:
        str: Raw value of a "Tags" cell. A missing value (NaN/None) is
            accepted and treated as "no tags".

    Returns:
        A list with one cleaned tag (see ``clean_string``) per comma-separated
        item. Items that are empty after cleaning (empty input, stray or
        trailing commas, punctuation-only items) are left out.
    """
    # The parameter shadows the built-in `str`, so isinstance(str, str) cannot
    # be used here; pd.isna covers both NaN and None.
    if pd.isna(str):
        return []
    # Clean every comma-separated item and keep only the non-empty results
    cleaned_tags = (clean_string(raw_tag) for raw_tag in str.split(","))
    return [tag for tag in cleaned_tags if tag]

In [102]:
# Replace every raw tag string in the Tags column by its parsed list of tags
df["Tags"] = df["Tags"].map(parse_tags)

In [103]:
# The tag helpers are not needed past this point; remove them from the namespace
del parse_tags
del clean_string

# Load articles

In [104]:
# Article loading function
def load_article(title, directory=None):
    """Read the text of one article from disk.

    Args:
        title: Article title; the file read is ``<directory>/<title>.txt``.
        directory: Folder to look in. When omitted, the notebook-level
            ``articles_dir`` is used.

    Returns:
        The file contents decoded as UTF-8, or ``None`` when the file cannot
        be opened.
    """
    if directory is None:
        directory = articles_dir
    # Build the path of the article from its title
    file_path = os.path.join(directory, f"{title}.txt")
    try:
        # Attempt to open the article
        file = open(file_path, "r", encoding="utf-8")
    except (OSError, ValueError):
        # OSError: missing or unreadable file; ValueError: a path that open()
        # rejects (e.g. an embedded NUL byte). Either way there is no article.
        return None
    # The context manager closes the handle even when read() raises
    with file:
        return file.read()

In [105]:
# Read every article's text into a new "Contents" column, looked up by title
df["Contents"] = df["Title"].map(load_article)

# The loader is not needed past this point; remove it from the namespace
del load_article

In [106]:
# Remember the row count so the number of dropped rows can be reported
entries_count = len(df)

# Keep only the rows that have article contents and renumber them from zero
df = df.dropna(subset=["Contents"]).reset_index(drop=True)

# Report how many rows were dropped, then discard the counter
print(f"Removed {entries_count - len(df)} entries without article contents.")
del entries_count

Removed 42 entries without article contents.


# Create categories from tags

In [107]:
# Every distinct tag across all articles, in order of first appearance
# (dict keys keep insertion order, so this deduplicates without reordering)
unique_tags = list(
    dict.fromkeys(tag for tag_list in df["Tags"] for tag in tag_list)
)

In [108]:
# Read the custom category definitions; load_category (below) reads one
# column of this frame per category name
custom_categories = pd.read_csv(f"{data_dir}/custom-categories.csv")

def load_category(name, categories=None):
    """Return the tags listed under one category column.

    Args:
        name: Column name of the category to read.
        categories: DataFrame holding one column per category. When omitted,
            the notebook-level ``custom_categories`` frame is used.

    Returns:
        The non-null values of the column as a list, or ``None`` when the
        frame has no column called ``name``.
    """
    if categories is None:
        categories = custom_categories
    try:
        column = categories[name]
    except KeyError:
        # Only an unknown column is treated as "no such category"; any other
        # error (e.g. the frame was never loaded) is allowed to surface.
        return None
    return column.dropna().tolist()

In [109]:
# One tag list per category, each read from the column named in the tuple
# below (targets and column names are listed in matching order)
(
    environmental_tags,
    violence_tags,
    weapons_tags,
    money_tags,
    human_trafficking_tags,
    drugs_tags,
) = map(
    load_category,
    ("Environmental crimes", "Violence", "Weapons", "Money", "Human trafficking", "Drugs"),
)

In [110]:
# Create the "Category" column with an empty-string placeholder in every row;
# the per-article loop further down in this cell fills in the real value
df["Category"] = ""

def get_most_frequent_category(environmental_count, violence_count, weapons_count, money_count, human_trafficking_count, drugs_count):
    """Pick the category label with the highest tag count.

    Returns "Other" when the highest count is 0. When several categories
    share the highest count, the first one in this order wins:
    Environmental, Violence, Weapons, Human trafficking, Money, Drugs.
    """
    # Label/count pairs, listed in tie-break order
    ranked = (
        ("Environmental", environmental_count),
        ("Violence", violence_count),
        ("Weapons", weapons_count),
        ("Human trafficking", human_trafficking_count),
        ("Money", money_count),
        ("Drugs", drugs_count),
    )
    highest_count = max(count for _, count in ranked)
    if highest_count == 0:
        return "Other"
    # The first label whose count reaches the maximum wins
    for label, count in ranked:
        if count == highest_count:
            return label

# Tag lookups per category, as sets for fast membership tests. The order is
# the precedence used when a tag is listed under more than one category: the
# tag is counted for the first category that contains it.
category_lookups = (
    ("environmental", set(environmental_tags)),
    ("violence", set(violence_tags)),
    ("weapons", set(weapons_tags)),
    ("human_trafficking", set(human_trafficking_tags)),
    ("drugs", set(drugs_tags)),
    ("money", set(money_tags)),
)

# Work out one category per article, in row order
categories = []
for tag_list in df["Tags"]:
    # Number of the article's tags that fall into each category
    counts = {key: 0 for key, _ in category_lookups}
    for tag in tag_list:
        for key, lookup in category_lookups:
            if tag in lookup:
                counts[key] += 1
                break

    categories.append(
        get_most_frequent_category(
            counts["environmental"],
            counts["violence"],
            counts["weapons"],
            counts["money"],
            counts["human_trafficking"],
            counts["drugs"],
        )
    )

# Assign the whole column at once. Writing through df["Category"][i] = ... is
# chained assignment: it triggers SettingWithCopyWarning and does not update
# the frame at all when pandas Copy-on-Write is enabled.
df["Category"] = categories

In [111]:
# Show how many articles were assigned to each category
df["Category"].value_counts()

Other                3086
Drugs                2106
Violence             1843
Environmental        1449
Weapons               705
Human trafficking     509
Money                 470
Name: Category, dtype: int64

In [113]:
# Keep the unique tags that appear in none of the environmental, violence,
# weapons, money, human trafficking or drugs tag lists
category_tag_lists = (
    environmental_tags,
    violence_tags,
    weapons_tags,
    money_tags,
    human_trafficking_tags,
    drugs_tags,
)
remaining_tags = [
    tag
    for tag in unique_tags
    if not any(tag in tags for tags in category_tag_lists)
]

print(f"{len(remaining_tags)} / {len(unique_tags)} unique tags have not been categorized yet.")

print(remaining_tags)

608 / 717 unique tags have not been categorized yet.
['zetas', 'haiti', 'el salvador', 'bolivia', 'judicial reform', 'elites and crime', 'invisibles', 'while scrambling to save himself from brazilian authorities', 'leones', 'don berna', 'colombia', 'mexico personalities', 'los machos', 'chepe luna', 'mexico', 'security policy', 'mi sangre', 'el chango', 'honduras', 'merida initiative', 'usmexico border', 'brazil', 'el paisa', 'guatemala personalities', 'juarez cartel', 'red scorpions', 'jamaica', 'canada', 'ndrangheta', 'sinaloa cartel', 'nicaragua', 'oficina de envigado', 'caribbean', 'los monos', 'g9', 'red command', 'haiti groups', 'peru', 'colombia groups', 'sebastian paisas', 'pijarbey', 'colombia mafia', 'pccbolivia', 'second marquetalia', 'iván márquez', 'venezuela', 'exfarc mafia', 'los caparrapos', 'el chayo', 'amigos dos amigos', 'usa', 'el salvador groups', 'valenciano', 'carsi', 'guatemala', 'ayotzinapa', 'farc', 'el barney', 'venezuela groups', 'displacement', 'el chapo', 