<a href="https://colab.research.google.com/github/NataKrj/AI-project-2024/blob/main/Step_1_Step_2_Step_3_Step_4_sanction_list.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install google-search-results
!pip install requests beautifulsoup4
!pip install pandas
!pip install fuzzywuzzy
!pip install selenium
!pip install spacy nltk
!python -m spacy download en_core_web_sm
!pip install rapidfuzz
!pip install swifter

Collecting google-search-results
  Downloading google_search_results-2.4.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: google-search-results
  Building wheel for google-search-results (setup.py) ... [?25l[?25hdone
  Created wheel for google-search-results: filename=google_search_results-2.4.2-py3-none-any.whl size=32009 sha256=f2fb12d7fd15cefe77e8eff357e55a97395d1c113bd0732c2b8b1535c6ecc731
  Stored in directory: /root/.cache/pip/wheels/d3/b2/c3/03302d12bb44a2cdff3c9371f31b72c0c4e84b8d2285eeac53
Successfully built google-search-results
Installing collected packages: google-search-results
Successfully installed google-search-results-2.4.2
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting selenium
  Downloading selenium-4.27.1-py3

# Step I - Sanction list check

In [None]:
import pandas as pd

# URLs for the OFAC CSV files (SDN list and consolidated alternate names)
url1 = 'https://www.treasury.gov/ofac/downloads/sdn.csv'
url2 = 'https://www.treasury.gov/ofac/downloads/consolidated/cons_alt.csv'

# Both OFAC files are published WITHOUT a header row. They must be read with
# header=None, otherwise pandas consumes the first record as column names and
# that sanctioned name silently disappears from the list.

# SDN list: the name is taken from the second column (index 1)
df1 = pd.read_csv(url1, header=None, on_bad_lines='skip')
sanction_list_url1 = df1.iloc[:, 1].dropna().unique()

# Alternate-name list: the name is taken from the fourth column (index 3)
df2 = pd.read_csv(url2, header=None, on_bad_lines='skip')
sanction_list_url2 = df2.iloc[:, 3].dropna().unique()

# Combine the names from both files, de-duplicated.
# Sorted so that repeated runs produce an identical CSV (set order is arbitrary).
sanction_list = sorted(set(sanction_list_url1) | set(sanction_list_url2), key=str)

# Create a DataFrame from the combined names
sanction_list_df = pd.DataFrame({'Sanctioned Names': sanction_list})

# Save the DataFrame to a CSV file
sanction_list_df.to_csv('sanction_list.csv', index=False)

print("Sanctioned names saved to 'sanction_list.csv'")

Sanctioned names saved to 'sanction_list.csv'


In [None]:
import pandas as pd
from fuzzywuzzy import fuzz

# Input locations for step 1
uploaded_file_path = 'BELGIUM_companies.csv'
sanction_list_file_path = 'sanction_list.csv'

# Read the company file and keep only its first 500 rows
companies_df = pd.read_csv(
    uploaded_file_path,
    low_memory=False,
    encoding='utf-8',
).head(500)

# Read the sanction list written by the previous cell
sanction_list_df = pd.read_csv(sanction_list_file_path)

# Lower-case the sanctioned names so the comparison is case-insensitive
lowercase_names = sanction_list_df['Sanctioned Names'].str.lower()
sanction_list_df['Sanctioned Names'] = lowercase_names

# Exact (case-insensitive) matching against the sanction list
def exact_match(name, sanction_list_df):
    """Check whether a company name appears verbatim in the sanction list.

    Args:
        name: Company name to check. Non-string values (for example the NaN
            pandas yields for an empty cell) are reported as "No Match"
            instead of raising.
        sanction_list_df: DataFrame whose 'Sanctioned Names' column holds the
            lower-cased sanctioned names.

    Returns:
        tuple: ``(lower-cased name, 30)`` on a hit, otherwise
        ``("No Match", 0)``.
    """
    if not isinstance(name, str):
        # Missing or non-text names cannot match anything.
        return "No Match", 0

    name = name.lower()
    if sanction_list_df['Sanctioned Names'].eq(name).any():
        return name, 30  # risk score assigned to an exact sanction-list hit
    return "No Match", 0  # No match

# Compare every company name with the sanction list (exact matches only)
company_column = companies_df['OriginalCompanyName']
match_outcomes = [exact_match(company, sanction_list_df) for company in company_column]

# One record per company: original name, matched sanctioned name, score
results = [
    {'OriginalCompanyName': company, 'Sanctioned Names': matched_name, 'Score': score}
    for company, (matched_name, score) in zip(company_column, match_outcomes)
]

results_df = pd.DataFrame(results)

# Write the step 1 evaluation to CSV
output_file_path = 'Step_1_evaluated_companies.csv'
results_df.to_csv(output_file_path, index=False)

print(f"Evaluation complete with exact matching. Results saved to {output_file_path}")

Evaluation complete with exact matching. Results saved to Step_1_evaluated_companies.csv




# Step II - Company status (active / stopped) check

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import re
from datetime import datetime
from multiprocessing import Pool
import numpy as np
from time import sleep
import os

# Scoring dictionary: maps the 'Status' string recorded for a company to its
# risk score. "error", "No result found for this search term" and
# "not found in KBO data table" are the sentinel statuses written by
# process_companies; the other keys are presumably the status texts shown in
# the KBO result table - TODO confirm the list covers every value the site returns.
# Statuses that are not listed here end up with score 0 (the mapping result is
# passed through fillna(0) where the scores are assigned).
status_scores = {
    "ENT LP Active": 1,
    "ENT LP Stopped": 5,
    "error": 2,
    "EU Active": 1,
    "EU Stopped": 5,
    "No result found for this search term": 2,
    "not found in KBO data table": 2
}

# Web scraping function with Selenium
def process_companies(company_chunk):
    """Look up each company of a chunk in the KBO public search and record its status.

    One headless Chrome session is started for the whole chunk and is always
    closed again, even if processing aborts unexpectedly.

    Args:
        company_chunk: Iterable of company names to look up.

    Returns:
        tuple: ``(result_chunk, successful_count)``. ``result_chunk`` is a list
        of dicts with the keys 'OriginalCompanyName', 'CleanedCompanyName',
        'Status' and 'Timestamp'; ``successful_count`` is the number of
        companies whose cleaned name was found in the result table.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 45)

    base_url = "https://kbopub.economie.fgov.be/kbopub/zoeknaamfonetischform.html?lang=en"
    result_chunk = []
    successful_count = 0

    company_types = [
        "VZW", "BVBA", "BV", "NV", "CV", "CVBA", "SPRL", "SCRL", "ASBL",
        "Comm.V", "SComm", "VOF", "SNC", "GIE", "AIE", "SE", "Partnership"
    ]
    # Compiled once per chunk instead of once per company.
    # NOTE(review): the '.' in "Comm.V" is not escaped, so it matches any
    # character (e.g. "Comm V" as well) - kept as in the original pattern.
    company_type_pattern = re.compile(
        r'\b(?:' + '|'.join(company_types) + r')\b', flags=re.IGNORECASE
    )

    def clean_company_name(company_name):
        """Strip legal-form abbreviations (whole words, any case) from the name."""
        return company_type_pattern.sub('', company_name).strip()

    sleep_time = 15  # seconds to pause after loading the search form
    try:
        for company_name in company_chunk:
            # Fallback so the error record below never refers to an unbound
            # variable or to the previous company's cleaned name.
            clean_name = company_name
            try:
                clean_name = clean_company_name(company_name)
                driver.get(base_url)
                wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
                sleep(sleep_time)
                search_box = wait.until(EC.presence_of_element_located((By.ID, "searchWord")))
                search_box.clear()
                search_box.send_keys(clean_name)

                # Untick the filter checkbox when it is selected
                checkbox = driver.find_element(By.ID, "filterEnkelActieve")
                if checkbox.is_selected():
                    checkbox.click()

                search_button = wait.until(EC.element_to_be_clickable((By.NAME, "actionNPRP")))
                search_button.click()
                wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

                try:
                    page_text = driver.find_element(By.TAG_NAME, "body").text
                    if "no result found for this search term." in page_text.lower():
                        result_chunk.append({
                            'OriginalCompanyName': company_name,
                            'CleanedCompanyName': clean_name,
                            'Status': "No result found for this search term",
                            'Timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        })
                        continue
                except NoSuchElementException:
                    pass

                rows = wait.until(EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, '#onderneminglistfonetisch tbody tr')))
                status = "not found in KBO data table"
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                for row in rows:
                    name_cell = row.find_element(By.CLASS_NAME, 'benaming').text.strip()
                    # Only a row whose name equals the cleaned name (ignoring case) counts
                    if name_cell.lower() == clean_name.lower():
                        status_cell = row.find_elements(By.TAG_NAME, 'td')[1].text.strip()
                        status = re.sub(r'\s+', ' ', status_cell).strip()
                        successful_count += 1
                        break

                result_chunk.append({
                    'OriginalCompanyName': company_name,
                    'CleanedCompanyName': clean_name,
                    'Status': status,
                    'Timestamp': timestamp
                })

            except Exception:
                # Best effort: any failure for one company (timeouts and missing
                # elements included) is recorded as "error" and the chunk continues.
                result_chunk.append({
                    'OriginalCompanyName': company_name,
                    'CleanedCompanyName': clean_name,
                    'Status': "error",
                    'Timestamp': "N/A"
                })
    finally:
        # Always release the browser, otherwise Chrome processes leak.
        driver.quit()

    return result_chunk, successful_count

if __name__ == '__main__':
    start_time = datetime.now()

    # Read with utf-8 like every other step of the notebook: decoding the same
    # file as latin-1 garbles accented names, so they no longer line up with
    # the names produced by the other steps.
    company_list = (
        pd.read_csv('BELGIUM_companies.csv', encoding='utf-8', sep=',')['OriginalCompanyName']
        .str.lower()[0:500]
        .tolist()
    )
    num_workers = 5

    # One chunk (and one browser session) per worker process
    company_chunks = np.array_split(company_list, num_workers)
    with Pool(num_workers) as pool:
        results = pool.map(process_companies, company_chunks)

    # Each worker returns (list of result dicts, number of statuses found)
    all_results = [item[0] for item in results]
    successful_count = sum(item[1] for item in results)
    result_df = pd.DataFrame([item for sublist in all_results for item in sublist])
    # Statuses without an entry in status_scores get score 0
    result_df['Score'] = result_df['Status'].map(status_scores).fillna(0)

    result_df.to_csv('Step_2_company_status_report_with_scores.csv', index=False)

    end_time = datetime.now()
    print(f"End time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Total time taken: {end_time - start_time}")
    print(f"Total successfully found statuses: {successful_count}")

End time: 2024-12-25 11:30:42
Total time taken: 0:25:24.790498
Total successfully found statuses: 126


# Step III - Web scraping

## Step 3.1 - Web scraping using the Google Custom Search API

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import re
from googleapiclient.discovery import build
from requests.exceptions import RequestException, SSLError

import os

# Set up your API keys.
# They are read from the environment when available so that real keys never
# have to be typed into (and committed with) the notebook; the 'xxx'
# placeholders remain the fallback.
google_api_key = os.environ.get('GOOGLE_API_KEY', 'xxx')  # Your Google API key
google_cse_id = os.environ.get('GOOGLE_CSE_ID', 'xxx')  # Your Custom Search Engine ID

# Load the CSV file to get company names
df = pd.read_csv('BELGIUM_companies.csv', low_memory=False, encoding='utf-8')
# Every other step reads the names from 'OriginalCompanyName'; fall back to
# that column when the file has no 'name' column instead of raising KeyError.
name_column = 'name' if 'name' in df.columns else 'OriginalCompanyName'
company_names = df[name_column][0:500].tolist()  # Processing the first 500 companies

# Keywords and associated scores
keywords_score_30 = [
    "sanctions", "criminal", "crime", "corruption", "shell company", "criminal case", "arrested"
]
keywords_score_5 = [
    "court", "accusation", "penalty", "investigation", "insolvency", "violation", "debt", "blackmail"
]
keywords_score_minus_1 = ["stock"]  # Negative scoring for "stock"

# Score used when none of the keywords occurs
score_no_words = 0

def google_search(search_term, api_key, cse_id, start_index=1):
    """Run one Google Custom Search query and return its result items.

    Args:
        search_term: Query string.
        api_key: Google API key.
        cse_id: Custom Search Engine ID.
        start_index: Index of the first result to request.

    Returns:
        list: The 'items' of the API response, or an empty list when the
        response has none or when building the client or running the query
        fails (the error is printed).
    """
    try:
        # Client construction is inside the try block as well: it can fail too,
        # and one failing company must not abort the whole run.
        service = build("customsearch", "v1", developerKey=api_key)
        res = service.cse().list(q=search_term, cx=cse_id, start=start_index).execute()
        return res.get('items', [])
    except Exception as e:
        print(f"Failed to search for {search_term} with error: {e}")
        return []

def extract_text_from_url(url):
    """Download ``url`` and return the visible text of the page.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: The page text for an HTML response with status 200;
        "Non-text content skipped" for a 200 response that is not HTML;
        "" for any other status code; or a "Request failed for ..." message
        when the request raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    try:
        # NOTE(review): verify=False disables TLS certificate validation for
        # every fetched site - confirm this is intentional.
        response = requests.get(url, headers=headers, verify=False, timeout=10)
        if response.status_code != 200:
            return ""
        if 'text/html' not in response.headers.get('Content-Type', ''):
            return "Non-text content skipped"

        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop page chrome and non-content elements before reading the text
        for element in soup(["script", "style", "header", "footer", "form", "nav"]):
            element.extract()
        # find_all(string=...) is the current spelling of the deprecated
        # findAll(text=...) and matches the step 3.2 implementation.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        return ' '.join(soup.stripped_strings)
    except (RequestException, SSLError) as e:
        return f"Request failed for {url}: {e}"

def clean_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

def calculate_score(text):
    """
    Score a text by the highest-priority keyword group it contains.

    The groups are tried in the order 30, 5, -1; the first group with at
    least one keyword occurring in the lower-cased text decides the score.
    Texts without any keyword (including empty ones) get score_no_words.
    """
    lowered = text.lower()
    tiers = (
        (keywords_score_30, 30),
        (keywords_score_5, 5),
        (keywords_score_minus_1, -1),
    )
    for tier_keywords, tier_score in tiers:
        for keyword in tier_keywords:
            if keyword in lowered:
                return tier_score
    return score_no_words

def process_company(company_name):
    """Search for a company and score the text of every result page.

    Args:
        company_name: Name used as the search query.

    Returns:
        list: One dict per search result with the keys 'company', 'url',
        'extracted_text' and 'score'. Non-text results and failed downloads
        receive score_no_words.
    """
    results = google_search(company_name, google_api_key, google_cse_id)
    company_data = []
    for result in results:
        url = result['link']
        extracted_text = extract_text_from_url(url)

        if extracted_text == "Non-text content skipped":
            recorded_text = "Skipped due to non-text content"
            score = score_no_words
        elif extracted_text.startswith("Request failed for "):
            # The error message embeds the URL and exception text; running the
            # keyword scoring on it would create false hits (for example a URL
            # that contains "crime"), so it is recorded but not scored.
            recorded_text = clean_text(extracted_text)
            score = score_no_words
        else:
            recorded_text = clean_text(extracted_text)
            score = calculate_score(recorded_text)

        company_data.append({
            'company': company_name,
            'url': url,
            'extracted_text': recorded_text,
            'score': score
        })
    return company_data

# Use ThreadPoolExecutor to process companies in parallel
data = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(process_company, name): name for name in company_names}
    for future in as_completed(futures):
        try:
            data.extend(future.result())
        except Exception as e:
            # One failing company must not discard the results of all others
            # (same guard as in the step 3.2 batch loop).
            print(f"Error while processing {futures[future]}: {e}")

# Convert list of dicts to DataFrame
df_results = pd.DataFrame(data)

# Save the DataFrame to a CSV file with proper encoding and escaping
output_file_path = 'Step_3.1_company_analysis_with_scores.csv'  # Replace with desired file path
df_results.to_csv(output_file_path, index=False, escapechar='\\', encoding='utf-8', quoting=csv.QUOTE_ALL)

print(f"Data saved to {output_file_path}.")

## Step 3.2 - Web scraping Bing search results using BeautifulSoup

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import re
from requests.exceptions import RequestException, SSLError
import time
from urllib.parse import quote
import random
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
# The VADER lexicon must be present before the sentiment analyzer is created
nltk.download('vader_lexicon')

# NLP resources: VADER sentiment analyzer and the small English spaCy pipeline
sia = SentimentIntensityAnalyzer()
nlp = spacy.load('en_core_web_sm')

# Company names to analyse: first 500 rows of the input file
file_path = 'BELGIUM_companies.csv'
df = pd.read_csv(file_path, low_memory=False, encoding='utf-8')
company_names = df['OriginalCompanyName'].iloc[0:500].tolist()


# Keywords and associated scores (consumed by calculate_score_with_reason)

# High-risk terms: every matched keyword adds 30 to the score.
keywords_score_30 = [
    "sanctions", "criminal", "crime", "corruption", "shell company", "offshore",
    "criminal case", "arrested", "fraud", "money laundering",
    "embezzlement", "terrorism financing", "bribery", "tax evasion",
    "illicit funds", "smuggling", "seized assets", "fines",
    "indictment", "prosecuted", "wanted", "scam", "scandal"
]

# Medium-risk terms: every matched keyword adds 5 to the score.
# NOTE(review): "tax fraud" contains the high-risk keyword "fraud", so a text
# matching it presumably also scores the +30 - confirm this overlap is intended.
keywords_score_5 = [
    "court", "accusation", "penalty", "investigation",
    "insolvency", "violation", "debt", "blackmail", "lawsuit",
    "default", "litigation", "settlement", "audit", "suspicious",
    "foreclosure", "dispute", "breach", "illegal transaction",
    "arbitration", "compliance failure", "tax fraud"
]

keywords_score_minus_1 = ["stock"]  # Each match subtracts 1 from the score

# Starting score before any keyword is counted
score_no_words = 0

# Pool of User-Agent strings; one is drawn at random for every request
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)",
    "Mozilla/5.0 (iPad; CPU OS 13_2 like Mac OS X)"
]


def random_headers():
    """Return request headers carrying a randomly chosen User-Agent."""
    chosen_agent = random.choice(user_agents)
    headers = {"User-Agent": chosen_agent}
    return headers

# Domains to Exclude
exclude_domains = [
    'dictionary.com', 'wiktionary.org', 'merriam-webster.com',
    'facebook.com', 'twitter.com', 'vimeo.com', 'youtube.com',
    'linkedin.com', 'reddit.com', 'quora.com', 'instagram.com',
    'tiktok.com', 'pinterest.com', 'justia.com'
]

# Keywords to detect dictionary-related content
dictionary_keywords = [
    "definition", "meaning", "dictionary", "thesaurus", "pronunciation"
]

def is_valid_url(url):
    """Tell whether a search-result URL should be kept.

    A URL is rejected when its host is one of ``exclude_domains`` or a
    subdomain of one. The host is compared rather than the whole URL, so a
    page that merely mentions an excluded domain in its path or query string,
    or whose host only contains the domain as a substring, is not discarded.

    Args:
        url: Result URL.

    Returns:
        bool: True when the URL is kept, False when it is excluded.
    """
    from urllib.parse import urlparse  # local import: not among the cell's imports

    try:
        hostname = (urlparse(url).hostname or "").lower()
    except ValueError:
        hostname = ""

    if not hostname:
        # No parseable host (for example a URL without scheme): fall back to
        # the plain substring check.
        return not any(domain in url for domain in exclude_domains)

    return not any(
        hostname == domain or hostname.endswith("." + domain)
        for domain in exclude_domains
    )

# Bing Search Scraper
def _parse_bing_results(html):
    """Extract title/link/snippet dicts from the HTML of a Bing results page."""
    soup = BeautifulSoup(html, "html.parser")
    results = []

    for item in soup.find_all('li', class_='b_algo'):
        link_tag = item.find('a')
        if not link_tag or 'href' not in link_tag.attrs:
            continue

        link = link_tag['href']
        if not is_valid_url(link):
            continue

        heading = item.find('h2')
        paragraph = item.find('p')
        title = heading.text if heading else ""
        snippet = paragraph.text if paragraph else ""

        # Keep only results that carry some text
        if title or snippet:
            results.append({
                "title": title,
                "link": link,
                "snippet": snippet
            })

    return results


def bing_search_scrape(company, retries=3, backoff_factor=5):
    """Search Bing for the exact company name and return the usable results.

    Args:
        company: Company name; it is searched as a quoted phrase.
        retries: Maximum number of attempts.
        backoff_factor: Base number of seconds for the pause between attempts.

    Returns:
        list: Dicts with the keys 'title', 'link' and 'snippet'; empty when
        every attempt failed or produced no result.
    """
    query = f'"{company}"'
    url = f"https://www.bing.com/search?q={quote(query)}"

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=random_headers(), timeout=10)

            if response.status_code == 200:
                results = _parse_bing_results(response.text)

                # Return results if found
                if len(results) > 0:
                    return results
                else:
                    print(f"No results for {company}, retrying...")

            else:
                print(f"Failed to fetch results for {company} (Attempt {attempt+1}): {response.status_code}")

        except (RequestException, SSLError) as e:
            print(f"Error scraping Bing for {company} (Attempt {attempt+1}): {e}")

        # Linearly growing back-off with jitter before the next attempt.
        # Skipped after the final attempt: there is nothing left to wait for.
        if attempt < retries - 1:
            time.sleep((attempt + 1) * backoff_factor + random.uniform(2, 5))

    # Return empty if all retries fail
    return []

# Heuristic check for dictionary-style pages
def is_dictionary_page(soup):
    """Return True when title, meta description or an h1/h2 heading contains a dictionary keyword."""

    def mentions_dictionary(fragment):
        lowered = fragment.lower()
        return any(word in lowered for word in dictionary_keywords)

    # Page title
    if soup.title and mentions_dictionary(soup.title.text):
        return True

    # Meta description
    meta_description = soup.find("meta", {"name": "description"})
    if meta_description and mentions_dictionary(meta_description.get("content", "")):
        return True

    # Top-level headings
    return any(mentions_dictionary(tag.text) for tag in soup.find_all(['h1', 'h2']))


# Fetch a page and pull out the text relevant to the company
def extract_text_from_url(url, company_name):
    """Return up to 2000 characters of page text, preferring passages that name the company.

    Returns "Dictionary content skipped" for dictionary-like pages,
    "Non-text content skipped" for responses that are not status 200 HTML and
    a "Request failed for ..." message when the request raises.
    """
    try:
        response = requests.get(url, headers=random_headers(), timeout=10)
        is_html = (
            response.status_code == 200
            and 'text/html' in response.headers.get('Content-Type', '')
        )
        if not is_html:
            return "Non-text content skipped"

        soup = BeautifulSoup(response.text, 'html.parser')

        # Dictionary pages are of no interest
        if is_dictionary_page(soup):
            return "Dictionary content skipped"

        # Remove page chrome, scripts and HTML comments
        for unwanted in soup(["script", "style", "header", "footer", "form", "nav"]):
            unwanted.extract()
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Headings and paragraphs that mention the company
        mentions = [
            tag.text + " "
            for tag in soup.find_all(['h1', 'h2', 'h3', 'p'])
            if company_name.lower() in tag.text.lower()
        ]
        relevant_text = ''.join(mentions)

        # Nothing names the company: fall back to the whole visible text
        if not relevant_text:
            relevant_text = ' '.join(soup.stripped_strings)

        truncated = relevant_text[:2000]
        return re.sub(r'\s+', ' ', truncated)
    except (RequestException, SSLError) as e:
        return f"Request failed for {url}: {e}"

def analyze_text_with_nlp(text):
    """Return (entities, sentiment label, compound score) for a text.

    Entities are the spaCy entity texts labelled ORG, PERSON, GPE or LAW; the
    label is 'Positive' above 0.05, 'Negative' below -0.05, otherwise 'Neutral'.
    """
    wanted_labels = ['ORG', 'PERSON', 'GPE', 'LAW']

    # Named Entity Recognition
    entities = []
    for ent in nlp(text).ents:
        if ent.label_ in wanted_labels:
            entities.append(ent.text)

    # Sentiment from the VADER compound score
    sentiment_score = sia.polarity_scores(text)['compound']
    if sentiment_score > 0.05:
        sentiment = 'Positive'
    elif sentiment_score < -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'

    return entities, sentiment, sentiment_score

def calculate_score_with_reason(text, snippet, company_name):
    """Score a page for risk keywords connected to the company.

    A keyword counts when it occurs in the same sentence as the company name,
    when any whole-word occurrence lies within 500 characters of the company
    name, or when it appears in the search snippet.

    Args:
        text: Extracted page text.
        snippet: Search-result snippet for the page.
        company_name: Company the page was found for.

    Returns:
        tuple: ``(score, matching_keywords, entities, sentiment)`` where
        ``matching_keywords`` is ``["No relevant keywords"]`` if nothing matched.
    """
    text_lower = text.lower()
    snippet_lower = snippet.lower()
    company_lower = company_name.lower()
    matching_keywords = []
    score = score_no_words

    # Perform NLP Analysis
    entities, sentiment, _sentiment_score = analyze_text_with_nlp(text)

    # Sentences that mention the company; split once instead of once per keyword
    company_sentences = [
        sentence for sentence in re.split(r'[.!?]', text_lower)
        if company_lower in sentence
    ]

    # Direct sentence match for precision
    def keyword_in_same_sentence(keyword):
        return any(keyword in sentence for sentence in company_sentences)

    # Proximity match for broader detection
    def keyword_near_company(keyword):
        # Every occurrence is inspected (not only the first one), and the
        # keyword is escaped so it is always matched literally.
        for match in re.finditer(rf"\b{re.escape(keyword)}\b", text_lower):
            window = text_lower[max(0, match.start() - 500):match.end() + 500]
            if company_lower in window:
                return True
        return False

    def keyword_matches(keyword):
        return (
            keyword_in_same_sentence(keyword)
            or keyword_near_company(keyword)
            or keyword in snippet_lower
        )

    # High-risk (+30), medium-risk (+5) and negative (-1) keywords
    weighted_keyword_groups = (
        (keywords_score_30, 30),
        (keywords_score_5, 5),
        (keywords_score_minus_1, -1),
    )
    for keywords, weight in weighted_keyword_groups:
        for keyword in keywords:
            if keyword_matches(keyword):
                matching_keywords.append(keyword)
                score += weight

    # Boost for financial distress mentions (only when no keyword matched so far)
    if not matching_keywords:
        for term in ["insolvency", "bankruptcy", "liquidation", "dissolved"]:
            if term in text_lower:
                score += 5
                matching_keywords.append(term)

    # Sentiment-based Adjustment
    if sentiment == 'Negative':
        score += 5  # Increase score for negative sentiment
    elif sentiment == 'Positive':
        # NOTE(review): this resets the score to 0 and thereby discards every
        # keyword hit, including sanction-related ones - confirm that a full
        # reset (rather than a small reduction) is intended.
        score = 0

    return score, matching_keywords or ["No relevant keywords"], entities, sentiment

def clean_text(text):
    """Normalise whitespace: runs become a single space, ends are trimmed."""
    collapsed = re.sub(r'\s+', ' ', text)
    trimmed = collapsed.strip()
    return trimmed

# Process Each Company
def _placeholder_entry(company_name, extracted_text, matched_keywords="No relevant keywords"):
    """Build the zero-score row used when a company yields nothing to analyse."""
    return {
        'company': company_name,
        'url': "N/A",
        'extracted_text': extracted_text,
        'score': 0,
        'matched_keywords': matched_keywords,
        'entities': "N/A",
        'sentiment': "N/A"
    }


def process_company(company_name):
    """Search Bing for a company and score every usable result page.

    Args:
        company_name: Company to analyse.

    Returns:
        list: One dict per analysed page with the keys 'company', 'url',
        'extracted_text', 'score', 'matched_keywords', 'entities' and
        'sentiment'; a single zero-score placeholder row when there are no
        search results, no usable pages, or an error occurred.
    """
    try:
        results = bing_search_scrape(company_name)

        # If no results, return a placeholder entry with a score of 0
        if not results:
            return [_placeholder_entry(company_name, "No results found")]

        company_data = []
        for result in results:
            url = result['link']
            snippet = result['snippet']
            extracted_text = extract_text_from_url(url, company_name)

            # Sentinel values are not page content; dictionary pages were
            # meant to be filtered out but used to be scored like real text.
            if extracted_text in ("Non-text content skipped", "Dictionary content skipped"):
                continue

            extracted_text = clean_text(extracted_text)
            # A failed download yields an error message, not page text. Scoring
            # that message would rate the wording of the error itself, so such
            # results are scored on the snippet alone.
            if extracted_text.startswith("Request failed for "):
                text_to_score = ""
            else:
                text_to_score = extracted_text

            score, reasons, entities, sentiment = calculate_score_with_reason(
                text_to_score, snippet, company_name
            )
            company_data.append({
                'company': company_name,
                'url': url,
                'extracted_text': extracted_text[:300],
                'score': score,
                'matched_keywords': ', '.join(reasons),
                'entities': ', '.join(entities),
                'sentiment': sentiment
            })

        # If no valid results (all pages skipped), add a placeholder
        if not company_data:
            company_data.append(_placeholder_entry(company_name, "No valid results"))

        return company_data

    except Exception as e:
        # Report the failure instead of hiding it, then return an error row with score 0
        print(f"Error while processing {company_name}: {e}")
        return [_placeholder_entry(company_name, "Error occurred", matched_keywords="Error")]

# Multithreading for Efficiency (Batching in groups of 50)
# Every company is scraped exactly once: results, per-batch counts and the set
# of processed companies are all collected in the same pass.
data = []
batch_size = 50
total_companies_processed = 0  # Track overall processed companies
unique_companies_processed = set()  # Track unique companies

for i in range(0, len(company_names), batch_size):
    batch = company_names[i:i + batch_size]
    batch_results = []
    companies_in_batch = 0

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(process_company, name): name for name in batch}
        for future in as_completed(futures):
            try:
                results = future.result()
            except Exception as e:
                print(f"Error during processing batch: {e}")
                continue
            batch_results.extend(results)
            companies_in_batch += 1
            unique_companies_processed.update(entry['company'] for entry in results)

    # Extend data with the batch results
    data.extend(batch_results)
    total_companies_processed += companies_in_batch

    print(f"Batch {i // batch_size + 1} completed. {companies_in_batch} companies processed in this batch.")

    # Delay between batches to prevent blocking
    if i + batch_size < len(company_names):
        wait_time = random.randint(45, 75)
        print(f"Waiting {wait_time} seconds before next batch...")
        time.sleep(wait_time)

# Final count of all processed companies
print(f"\nTotal unique companies processed: {len(unique_companies_processed)}")

# Persist the results: step IV reads this file
df_results = pd.DataFrame(data)
output_file_path = 'Step_3.2_company_analysis_with_scores.csv'
df_results.to_csv(output_file_path, index=False, encoding='utf-8')
print(f"Data saved to {output_file_path}.")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


No results for BS Parts, retrying...
No results for International Lashing Systems nv [INTERNATIONAL LASHING SYSTEMS - MARITIME CONSOLIDATORS BELGIUM], retrying...
No results for International Lashing Systems nv [INTERNATIONAL LASHING SYSTEMS - MARITIME CONSOLIDATORS BELGIUM], retrying...
No results for International Lashing Systems nv [INTERNATIONAL LASHING SYSTEMS - MARITIME CONSOLIDATORS BELGIUM], retrying...
No results for JGI-Computers, retrying...
No results for Lorsom/Marc, retrying...
No results for Lorsom/Marc, retrying...
No results for Lorsom/Marc, retrying...
No results for Oh ! La ! La ! Boulangerie [OH] LA] LA] BOULANGERIE SPRL], retrying...
No results for Oh ! La ! La ! Boulangerie [OH] LA] LA] BOULANGERIE SPRL], retrying...
No results for Dumortier/Marc, retrying...
No results for Oh ! La ! La ! Boulangerie [OH] LA] LA] BOULANGERIE SPRL], retrying...
No results for Delvith [DELVITH AG], retrying...
No results for Delvith [DELVITH AG], retrying...
No results for Delvith [


# Step IV - Company risk scoring



In [5]:
import pandas as pd

# 1. Load the Data Files with Correct Encoding
step_1 = pd.read_csv('Step_1_evaluated_companies.csv', encoding='utf-8')
step_2 = pd.read_csv('Step_2_company_status_report_with_scores.csv', encoding='utf-8')
step_3 = pd.read_csv('Step_3.2_company_analysis_with_scores.csv', encoding='utf-8')

# 2. Standardize Company Name Columns
step_1 = step_1.rename(columns={'OriginalCompanyName': 'company_name'})
step_2 = step_2.rename(columns={'OriginalCompanyName': 'company_name'})
step_3 = step_3.rename(columns={'company': 'company_name', 'score': 'Score'})

# Normalize company names to lowercase to avoid duplicates
step_1['company_name'] = step_1['company_name'].str.lower()
step_2['company_name'] = step_2['company_name'].str.lower()
step_3['company_name'] = step_3['company_name'].str.lower()

# Keep one row per company in steps 1 and 2 (the highest-scoring one).
# Without this, a name occurring n times in both files produces n*n rows in
# the merges below and the final 'sum' counts its score n*n times.
step_1 = step_1.sort_values('Score', ascending=False).drop_duplicates(subset='company_name')
step_2 = step_2.sort_values('Score', ascending=False).drop_duplicates(subset='company_name')

# 3. Aggregate Step 3 to Avoid Duplicates (only pages with a positive score count)
step_3_filtered = step_3[step_3['Score'] > 0]
step_3_aggregated = step_3_filtered.groupby('company_name', as_index=False).agg({
    'matched_keywords': lambda x: ', '.join(x.dropna().unique()),
    'Score': 'sum'
})

# 4. Filter for Companies with Matches in Any Step
matched_companies = pd.concat([
    step_1[['company_name']],
    step_2[['company_name']],
    step_3_aggregated[['company_name']]
]).drop_duplicates()

# 5. Merge Step 1 and Step 2
merged_1 = pd.merge(matched_companies,
                    step_1[['company_name', 'Sanctioned Names', 'Score']],
                    on='company_name', how='left')

merged_2 = pd.merge(merged_1,
                    step_2[['company_name', 'Status', 'Score']],
                    on='company_name', how='left')

# 6. Merge Aggregated Step 3 (keywords and scores)
# After the three merges the score columns are named Score_x (step 1),
# Score_y (step 2) and Score (step 3).
merged_final = pd.merge(merged_2,
                        step_3_aggregated[['company_name', 'matched_keywords', 'Score']],
                        on='company_name', how='left')

# 7. Calculate Total Score by Summing Across Steps (missing scores count as 0)
merged_final['Total_Score'] = merged_final[['Score_x', 'Score_y', 'Score']].sum(axis=1)

# 8. Aggregate Again to Remove Duplicates
merged_final = merged_final.groupby('company_name', as_index=False).agg({
    'Sanctioned Names': 'first',
    'Status': 'first',
    'matched_keywords': lambda x: ', '.join(x.dropna().unique()),
    'Total_Score': 'sum'
})

# 9. Risk Level Calculation
def assign_risk_level(score):
    """Translate a total score into a risk level.

    The bands are contiguous, so fractional scores between the former integer
    bands (for example 6.5) no longer fall through to 'low'.

    Args:
        score: Total risk score.

    Returns:
        str: 'prohibited' for scores above 30, 'high' from 7 up to 30,
        'medium' from 1 up to (but not including) 7, otherwise 'low'.
    """
    if score > 30:
        return 'prohibited'
    if score >= 7:
        return 'high'
    if score >= 1:
        return 'medium'
    return 'low'

# Derive the risk level of every company from its total score
merged_final['risk_level'] = merged_final['Total_Score'].map(assign_risk_level)

# 10. Save Results with UTF-8 Encoding to Preserve Special Characters
report_columns = ['company_name', 'Sanctioned Names', 'Status', 'matched_keywords', 'Total_Score', 'risk_level']
risk_report = merged_final[report_columns]
risk_report.to_csv('Step_4.1_company_risk_scores.csv', index=False, encoding='utf-8')

# Preview of the first rows
risk_report.head()

Unnamed: 0,company_name,Sanctioned Names,Status,matched_keywords,Total_Score,risk_level
0,9g,No Match,error,,2.0,medium
1,a.b.c. kickers bvba,,ENT LP Stopped,No relevant keywords,10.0,high
2,a.d. froidmont,,EU Stopped,No relevant keywords,10.0,high
3,a.f. security,No Match,EU Stopped,No relevant keywords,10.0,high
4,a.f.m. new sa,No Match,error,,2.0,medium


In [6]:
import pandas as pd
from rapidfuzz import process, fuzz
import swifter

# 1. Read the step outputs and the original company list (all UTF-8)
step_1 = pd.read_csv('Step_1_evaluated_companies.csv', encoding='utf-8')
step_2 = pd.read_csv('Step_2_company_status_report_with_scores.csv', encoding='utf-8')
step_3 = pd.read_csv('Step_3.2_company_analysis_with_scores.csv', encoding='utf-8')
belgium_companies = pd.read_csv('BELGIUM_companies.csv', encoding='utf-8')

# 2. Give every frame the same key column ('company_name') and score column ('Score')
original_name_to_key = {'OriginalCompanyName': 'company_name'}
step_1 = step_1.rename(columns=original_name_to_key)
step_2 = step_2.rename(columns=original_name_to_key)
step_3 = step_3.rename(columns={'company': 'company_name', 'score': 'Score'})
belgium_companies = belgium_companies.rename(columns=original_name_to_key)

# Lower-case the key so the same company is not treated as several
for df in (step_1, step_2, step_3, belgium_companies):
    df['company_name'] = df['company_name'].str.lower()

# 3. Collapse step 3 to one row per company, using positive-score pages only
step_3_filtered = step_3.loc[step_3['Score'] > 0]
step_3_aggregated = (
    step_3_filtered
    .groupby('company_name', as_index=False)
    .agg({
        'matched_keywords': lambda x: ', '.join(x.dropna().unique()),
        'Score': 'sum'
    })
)

# 4. Fuzzy matching used to align company names with the reference list
# Names for which no candidate reached the threshold are collected here
unmatched = []

def fuzzy_match_rapidfuzz(company_name, choices, threshold=85):
    """Return the best WRatio match from ``choices``, or the name itself if none reaches ``threshold``.

    Names without a match are also appended to the module-level ``unmatched`` list.
    """
    best = process.extractOne(company_name, choices, scorer=fuzz.WRatio, score_cutoff=threshold)
    if not best:
        # Nothing scored at or above the cutoff: keep the name unchanged
        unmatched.append(company_name)
        return company_name
    matched_choice, _similarity, _position = best
    return matched_choice

choices = belgium_companies['company_name'].tolist()

step_1['company_name'] = step_1['company_name'].swifter.apply(lambda x: fuzzy_match_rapidfuzz(x, choices))
step_2['company_name'] = step_2['company_name'].swifter.apply(lambda x: fuzzy_match_rapidfuzz(x, choices))
step_3_aggregated['company_name'] = step_3_aggregated['company_name'].swifter.apply(lambda x: fuzzy_match_rapidfuzz(x, choices))

# Fuzzy matching can map several source names onto the same reference name, and
# the inputs may already contain repeated names. Collapse every step to one row
# per company first; otherwise the merges below multiply rows and the final
# 'sum' counts a company's score several times.
step_1 = step_1.sort_values('Score', ascending=False).drop_duplicates(subset='company_name')
step_2 = step_2.sort_values('Score', ascending=False).drop_duplicates(subset='company_name')
step_3_aggregated = step_3_aggregated.groupby('company_name', as_index=False).agg({
    'matched_keywords': lambda x: ', '.join(x.dropna().unique()),
    'Score': 'sum'
})

# 5. Filter for Companies with Matches in Any Step
matched_companies = pd.concat([
    step_1[['company_name']],
    step_2[['company_name']],
    step_3_aggregated[['company_name']]
]).drop_duplicates()

# 6. Merge Step 1 and Step 2
merged_1 = pd.merge(matched_companies,
                    step_1[['company_name', 'Sanctioned Names', 'Score']],
                    on='company_name', how='left')

merged_2 = pd.merge(merged_1,
                    step_2[['company_name', 'Status', 'Score']],
                    on='company_name', how='left')

# 7. Merge Aggregated Step 3 (keywords and scores)
# After the three merges the score columns are named Score_x (step 1),
# Score_y (step 2) and Score (step 3).
merged_final = pd.merge(merged_2,
                        step_3_aggregated[['company_name', 'matched_keywords', 'Score']],
                        on='company_name', how='left')

# 8. Calculate Total Score by Summing Across Steps (missing scores count as 0)
merged_final['Total_Score'] = merged_final[['Score_x', 'Score_y', 'Score']].sum(axis=1)

# 9. Aggregate Again to Remove Duplicates
merged_final = merged_final.groupby('company_name', as_index=False).agg({
    'Sanctioned Names': 'first',
    'Status': 'first',
    'matched_keywords': lambda x: ', '.join(x.dropna().unique()),
    'Total_Score': 'sum'
})

# 10. Risk Level Calculation
def assign_risk_level(score):
    """Translate a total score into a risk level.

    The bands are contiguous, so fractional scores between the former integer
    bands (for example 6.5) no longer fall through to 'low'.

    Args:
        score: Total risk score.

    Returns:
        str: 'prohibited' for scores above 30, 'high' from 7 up to 30,
        'medium' from 1 up to (but not including) 7, otherwise 'low'.
    """
    if score > 30:
        return 'prohibited'
    if score >= 7:
        return 'high'
    if score >= 1:
        return 'medium'
    return 'low'

# Derive the risk level of every company from its total score
merged_final['risk_level'] = merged_final['Total_Score'].map(assign_risk_level)

# 11. Write the report as UTF-8 so special characters survive
report_columns = ['company_name', 'Sanctioned Names', 'Status', 'matched_keywords', 'Total_Score', 'risk_level']
merged_final[report_columns].to_csv(
    'Step_4_company_risk_scores.csv',
    index=False,
    encoding='utf-8',
)

print(f"Risk scoring completed. {len(merged_final)} companies saved.")

Pandas Apply:   0%|          | 0/300 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/500 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/437 [00:00<?, ?it/s]

Risk scoring completed. 497 companies saved.
