In [17]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append(os.path.abspath(".."))

from utils.db import fetch_sanctions

In [None]:
# Load the sanctions records through the project helper and preview the first five rows.
df = fetch_sanctions()
df.head(n=5)

In [None]:
# Structural overview: df.info() prints its report directly.
df.info()
# Only the last expression of a cell is rendered automatically, so the summary
# statistics must be shown explicitly or they are computed and thrown away.
# `display` is provided by the IPython kernel without an import.
display(df.describe(include="all"))
# Per-column count of missing values (rendered as the cell's result).
df.isnull().sum()

In [None]:
# Uniqueness sanity check: distinct `ent_num` values next to the number of
# rows that are exact duplicates of an earlier row.
distinct_entities = df['ent_num'].nunique()
duplicate_rows = df.duplicated().sum()
distinct_entities, duplicate_rows

In [None]:
# The ten most frequent countries, most common first; reused as the bar order
# so the chart only shows (and sorts by) those categories.
top_countries = df['country'].value_counts().head(10).index
sns.countplot(data=df, y='country', order=top_countries)
plt.title("Top 10 Countries in Sanctions List")
plt.show()

In [None]:
# Horizontal bar chart of how often each `sdn_type` value occurs.
sdn_type_counts = df['sdn_type'].value_counts()
sdn_type_counts.plot.barh(title="SDN Types Distribution")

In [49]:
import re

def standardize_name(name: str) -> str:
    """Normalize a name to uppercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Missing values (anything ``pd.isnull`` reports as null) become ``''``.
      2. Accented letters are folded to their base letter (``É`` -> ``E``)
         instead of being deleted by the letter filter in step 4.
      3. ``/`` and ``-`` are turned into spaces so the parts they join stay
         separate words; the text is then uppercased.
      4. Every character that is not ``A-Z`` or whitespace is removed
         (digits and remaining punctuation included).
      5. Runs of whitespace collapse to one space and the ends are trimmed.

    Args:
        name: Raw name string, or a null value.

    Returns:
        The cleaned name, or ``''`` for null input.
    """
    # Local import keeps this cell self-contained.
    import unicodedata

    if pd.isnull(name):
        return ''
    # NFKD splits an accented letter into base letter + combining mark;
    # dropping the marks keeps the base letter. Letters with no decomposition
    # (e.g. "Ø") are still removed by the A-Z filter below.
    decomposed = unicodedata.normalize("NFKD", name)
    name = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    name = re.sub(r"[/-]", " ", name).upper()
    name = re.sub(r"[^A-Z\s]", "", name)
    name = re.sub(r"\s+", " ", name).strip()
    return name

# Run every raw SDN name through standardize_name and keep the result as a new column.
df['cleaned_name'] = df['sdn_name'].map(standardize_name)

In [None]:
# Character count of the cleaned name.
df['name_length'] = df['cleaned_name'].str.len()
# Number of whitespace-separated tokens in the cleaned name.
df['word_count'] = df['cleaned_name'].str.split().apply(len)

# 1 when the row's country, uppercased, occurs as a substring of the cleaned
# name, else 0. The isinstance guard matters: a missing country stored as NaN
# is a truthy float, so a bare truthiness test would go on to call .upper()
# on it and raise AttributeError. Empty strings are excluded explicitly
# because '' is a substring of every name.
# NOTE(review): this is a plain substring test, so a short country name can
# match inside a longer word — confirm that is acceptable.
df['has_country_in_name'] = [
    int(isinstance(country, str) and bool(country) and country.upper() in cleaned)
    for country, cleaned in zip(df['country'], df['cleaned_name'])
]

df.head(10)

In [54]:
# Write the frame, including the derived columns, to CSV without the index.
output_path = '../data/sanctions_cleaned.csv'
df.to_csv(output_path, index=False)