<a href="https://colab.research.google.com/github/NataKrj/AI-project-2024/blob/main/Web%20Scraping/Advanced_Search_Test_AI_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the uploaded analysis results (Colab upload location) into `data`
file_path = '/content/1_0_50_company_analysis_results.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top rows so the column layout is visible in the cell output
data.head(n=5)


Unnamed: 0,company,url,extracted_text,related_keywords,risk_level,risk_code
0,FORTUNEMAKER INVESTMENTS CORPORATION,https://www.forbes.com/sites/daveknox/2022/07/...,Why Brain Health Is Patrick Schwarzenegger’s L...,No match,Low Risk,0
1,FORTUNEMAKER INVESTMENTS CORPORATION,https://fortune.com/,Fortune - Fortune 500 Daily & Breaking Busines...,debt,Medium Risk,1
2,FORTUNEMAKER INVESTMENTS CORPORATION,https://foe.org/blog/2014-04-the-schwarzenegge...,The Schwarzenegger dilemma:Years of investing ...,No match,Low Risk,0
3,FORTUNEMAKER INVESTMENTS CORPORATION,https://www.inc.com/jeff-haden/arnold-schwarze...,,No match,Low Risk,0
4,FORTUNEMAKER INVESTMENTS CORPORATION,https://www.globalwitness.org/en/archive/schwa...,Schwarzenegger’s multi-million dollar ‘robo-fu...,"court, violation",Medium Risk,1


In [None]:
def _mentions_company(row):
    """Return True when the lower-cased company name is a substring of the row's text."""
    page_text = str(row['extracted_text']).lower()
    return row['company'].lower() in page_text


# Flag each row whose extracted text literally contains the company name
data['text_relates_to_company'] = data.apply(_mentions_company, axis=1)

# Tally the flag and keep up to five sample rows for each outcome
mentions = data['text_relates_to_company']
related_counts = mentions.value_counts()
examples_related = data.loc[mentions].head(5)
examples_not_related = data.loc[~mentions].head(5)

related_counts, examples_related, examples_not_related


(text_relates_to_company
 False    478
 True      22
 Name: count, dtype: int64,
                             company  \
 40  SKY-BLUE GIFTS & TOYS CO., LTD.   
 42  SKY-BLUE GIFTS & TOYS CO., LTD.   
 44  SKY-BLUE GIFTS & TOYS CO., LTD.   
 46  SKY-BLUE GIFTS & TOYS CO., LTD.   
 52         KENT DEVELOPMENT LIMITED   
 
                                                   url  \
 40                        http://www.sky-blue.com.cn/   
 42         http://www.sky-blue.com.cn/en/contact.html   
 44  http://www.sky-blue.com.cn/en/product_d/id/98....   
 46  http://www.sky-blue.com.cn/en/product_d/id/104...   
 52  https://find-and-update.company-information.se...   
 
                                        extracted_text related_keywords  \
 40  SKY-BLUE Quality Toys Since 2004   Sky-Blue ...         No match   
 42  CONTACT US - SKY-BLUE Quality Toys Since 2004 ...         No match   
 44  Balancing Bike-Red - SKY-BLUE Quality Toys Sin...         No match   
 46  Wooden Train Set 50PCS

In [None]:
import spacy

# Load the spaCy language model
nlp = spacy.load("en_core_web_sm")

# Entity-based check: does a recognised entity in the text contain the company name?
def analyze_text_with_nlp(row):
    """Report whether any ORG/PRODUCT/GPE entity in the row's text contains the company name.

    Args:
        row: Mapping-like record providing 'extracted_text' and 'company'.

    Returns:
        False when the text is missing; otherwise True if the lower-cased
        company name is a substring of at least one lower-cased entity whose
        label is ORG, PRODUCT or GPE, else False.
    """
    text = row['extracted_text']
    if pd.isna(text):
        return False

    wanted_labels = ("ORG", "PRODUCT", "GPE")
    needle = row['company'].lower()
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels and needle in entity.text.lower():
            return True
    return False

# Run the entity check row by row and store the boolean result on the frame
data['nlp_analysis_relates_to_company'] = data.apply(analyze_text_with_nlp, axis=1)

# Summarise the flag and pull up to five sample rows per outcome
nlp_flag = data['nlp_analysis_relates_to_company']
nlp_related_counts = nlp_flag.value_counts()
nlp_examples_related = data.loc[nlp_flag].head(5)
nlp_examples_not_related = data.loc[~nlp_flag].head(5)

nlp_related_counts, nlp_examples_related, nlp_examples_not_related


(nlp_analysis_relates_to_company
 False    485
 True      15
 Name: count, dtype: int64,
                         company  \
 52     KENT DEVELOPMENT LIMITED   
 104       VICTORY GROUP LIMITED   
 152      FORTUNE PALACE LIMITED   
 153      FORTUNE PALACE LIMITED   
 299  Alvear Investments Limited   
 
                                                    url  \
 52   https://find-and-update.company-information.se...   
 104                              https://victoryg.com/   
 152  https://find-and-update.company-information.se...   
 153  https://www.datocapital.vg/companies/Fortune-P...   
 299      https://offshoreleaks.icij.org/nodes/12204276   
 
                                         extracted_text  \
 52   KENT DEVELOPMENT LIMITED overview - Find and u...   
 104            Victory Group Limited home1 home2 home3   
 152  LIN'S FORTUNE PALACE LIMITED people - Find and...   
 153  Fortune Palace Investment CORP Dato Capital Ho...   
 299  STEPHEN KAMALIA MIKHAIL | ICIJ Offsh

In [None]:
# Terms whose presence in the extracted text marks a row as high risk
high_risk_keywords = [
    "fraud", "corruption", "scam", "lawsuit", "illegal", "sanction",
    "investigation", "criminal", "violation", "money laundering", "bribe"
]


def check_high_risk_keywords(text):
    """Return True if any entry of `high_risk_keywords` occurs in `text`.

    Matching is a case-insensitive substring test. Missing values
    (anything `pd.isna` reports as NA) yield False.
    """
    if pd.isna(text):
        return False
    lowered = text.lower()
    for keyword in high_risk_keywords:
        if keyword in lowered:
            return True
    return False

# Flag rows whose extracted text contains at least one high-risk keyword
data['contains_high_risk_keywords'] = data['extracted_text'].apply(check_high_risk_keywords)

# Count how many rows contain high-risk keywords and keep a few examples
high_risk_counts = data['contains_high_risk_keywords'].value_counts()
high_risk_examples = data[data['contains_high_risk_keywords'] == True].head(5)

# Number of rows whose company name already appeared in an earlier row
# (this counts repeated rows, not distinct companies)
duplicate_companies = data['company'].duplicated().sum()

# Flag URLs containing a suspicious term. The group is non-capturing so that
# pandas does not emit the "pattern ... has match groups" UserWarning.
data['suspicious_url'] = data['url'].str.contains(r'(?:offshore|leaks|scam|fraud)', case=False, na=False)

# Summarize suspicious URLs
suspicious_url_count = data['suspicious_url'].sum()
suspicious_url_examples = data[data['suspicious_url'] == True].head(5)

high_risk_counts, high_risk_examples, duplicate_companies, suspicious_url_count, suspicious_url_examples


  data['suspicious_url'] = data['url'].str.contains(r'(offshore|leaks|scam|fraud)', case=False, na=False)


(contains_high_risk_keywords
 False    466
 True      34
 Name: count, dtype: int64,
                                      company  \
 2       FORTUNEMAKER INVESTMENTS CORPORATION   
 4       FORTUNEMAKER INVESTMENTS CORPORATION   
 21  TIANSHENG INDUSTRY AND TRADING CO., LTD.   
 85                      8808 HOLDING LIMITED   
 87                      8808 HOLDING LIMITED   
 
                                                   url  \
 2   https://foe.org/blog/2014-04-the-schwarzenegge...   
 4   https://www.globalwitness.org/en/archive/schwa...   
 21      https://offshoreleaks.icij.org/nodes/10000001   
 85  https://www.scc.virginia.gov/pages/Application...   
 87                        https://www.man.com/contact   
 
                                        extracted_text  \
 2   The Schwarzenegger dilemma:Years of investing ...   
 4   Schwarzenegger’s multi-million dollar ‘robo-fu...   
 21  TIANSHENG INDUSTRY AND TRADING CO., | ICIJ Off...   
 85  Virginia SCC - Application Forms

In [None]:
# High-risk vocabulary (this cell re-declares it so it can run on its own)
high_risk_keywords = [
    "fraud", "corruption", "scam", "lawsuit", "illegal", "sanction",
    "investigation", "criminal", "violation", "money laundering", "bribe"
]


def check_high_risk_keywords(text):
    """Case-insensitive substring search for any high-risk keyword.

    Returns False for missing text, otherwise True as soon as one entry of
    `high_risk_keywords` is found inside the lower-cased text.
    """
    if pd.isna(text):
        return False
    haystack = text.lower()
    for term in high_risk_keywords:
        if term in haystack:
            return True
    return False

# Evaluate the keyword check on every extracted text, then store it as a column
high_risk_flags = data['extracted_text'].apply(check_high_risk_keywords)
data['contains_high_risk_keywords'] = high_risk_flags


In [None]:
# Print the column index of `data` to confirm which columns currently exist
print(data.columns)


Index(['company', 'url', 'extracted_text', 'related_keywords', 'risk_level',
       'risk_code', 'text_relates_to_company',
       'nlp_analysis_relates_to_company', 'contains_high_risk_keywords',
       'suspicious_url'],
      dtype='object')


In [None]:
from IPython.display import display

# A row is high risk when either the keyword flag or the suspicious-URL flag is set
flagged = data['contains_high_risk_keywords'] | data['suspicious_url']
high_risk_entries = data[flagged]
display(high_risk_entries)


Unnamed: 0,company,url,extracted_text,related_keywords,risk_level,risk_code,text_relates_to_company,nlp_analysis_relates_to_company,contains_high_risk_keywords,suspicious_url
2,FORTUNEMAKER INVESTMENTS CORPORATION,https://foe.org/blog/2014-04-the-schwarzenegge...,The Schwarzenegger dilemma:Years of investing ...,No match,Low Risk,0,False,False,True,False
4,FORTUNEMAKER INVESTMENTS CORPORATION,https://www.globalwitness.org/en/archive/schwa...,Schwarzenegger’s multi-million dollar ‘robo-fu...,"court, violation",Medium Risk,1,False,False,True,False
21,"TIANSHENG INDUSTRY AND TRADING CO., LTD.",https://offshoreleaks.icij.org/nodes/10000001,"TIANSHENG INDUSTRY AND TRADING CO., | ICIJ Off...","corruption, investigation",High Risk,2,False,False,True,True
85,8808 HOLDING LIMITED,https://www.scc.virginia.gov/pages/Application...,Virginia SCC - Application Forms Home Financia...,"investigation, debt",Medium Risk,1,False,False,True,False
87,8808 HOLDING LIMITED,https://www.man.com/contact,Contact | Man Group You are now exiting our we...,"court, investigation",Medium Risk,1,False,False,True,False
126,"Wide International Trading Co., Ltd.",https://home.treasury.gov/news/press-releases/...,Treasury Targets Large Iranian Military Financ...,sanctions,High Risk,2,False,False,True,False
133,JIE LUN INVESTMENT LIMITED,https://en.zhonglun.com/team/xujie.html,﻿ The official website of Zhong Lun Law Firm J...,"court, crime, litigation",High Risk,2,False,False,True,False
158,FORTUNE PALACE LIMITED,https://fortune.com/2011/07/28/inside-pfizers-...,Inside Pfizer’s palace coup | Fortune Home Pag...,"court, investigation, violation",Medium Risk,1,False,False,True,False
191,LAKE STREET INVESTMENTS LTD.,https://www.linkedin.com/company/lake-street-c...,"Lake Street Capital Markets, LLC | LinkedIn Sk...",No match,Low Risk,0,False,False,True,False
196,LAKE STREET INVESTMENTS LTD.,https://www.treasury.gov/auctions/treasury/rp/...,US Dept of the Treasury Seized Real Property A...,court,Medium Risk,1,False,False,True,False


In [None]:
# Show `high_risk_entries` as this cell's output (the notebook renders a bare last expression)
high_risk_entries


Unnamed: 0,company,url,extracted_text,related_keywords,risk_level,risk_code,text_relates_to_company,nlp_analysis_relates_to_company,contains_high_risk_keywords,suspicious_url
2,FORTUNEMAKER INVESTMENTS CORPORATION,https://foe.org/blog/2014-04-the-schwarzenegge...,The Schwarzenegger dilemma:Years of investing ...,No match,Low Risk,0,False,False,True,False
4,FORTUNEMAKER INVESTMENTS CORPORATION,https://www.globalwitness.org/en/archive/schwa...,Schwarzenegger’s multi-million dollar ‘robo-fu...,"court, violation",Medium Risk,1,False,False,True,False
21,"TIANSHENG INDUSTRY AND TRADING CO., LTD.",https://offshoreleaks.icij.org/nodes/10000001,"TIANSHENG INDUSTRY AND TRADING CO., | ICIJ Off...","corruption, investigation",High Risk,2,False,False,True,True
85,8808 HOLDING LIMITED,https://www.scc.virginia.gov/pages/Application...,Virginia SCC - Application Forms Home Financia...,"investigation, debt",Medium Risk,1,False,False,True,False
87,8808 HOLDING LIMITED,https://www.man.com/contact,Contact | Man Group You are now exiting our we...,"court, investigation",Medium Risk,1,False,False,True,False
126,"Wide International Trading Co., Ltd.",https://home.treasury.gov/news/press-releases/...,Treasury Targets Large Iranian Military Financ...,sanctions,High Risk,2,False,False,True,False
133,JIE LUN INVESTMENT LIMITED,https://en.zhonglun.com/team/xujie.html,﻿ The official website of Zhong Lun Law Firm J...,"court, crime, litigation",High Risk,2,False,False,True,False
158,FORTUNE PALACE LIMITED,https://fortune.com/2011/07/28/inside-pfizers-...,Inside Pfizer’s palace coup | Fortune Home Pag...,"court, investigation, violation",Medium Risk,1,False,False,True,False
191,LAKE STREET INVESTMENTS LTD.,https://www.linkedin.com/company/lake-street-c...,"Lake Street Capital Markets, LLC | LinkedIn Sk...",No match,Low Risk,0,False,False,True,False
196,LAKE STREET INVESTMENTS LTD.,https://www.treasury.gov/auctions/treasury/rp/...,US Dept of the Treasury Seized Real Property A...,court,Medium Risk,1,False,False,True,False


In [None]:
# Write the flagged rows to disk without the index column
high_risk_csv = 'high_risk_entries.csv'
high_risk_entries.to_csv(high_risk_csv, index=False)

# Trigger a browser download of that file from the Colab runtime
from google.colab import files
files.download(high_risk_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from textblob import TextBlob

# Sentiment labelling based on TextBlob polarity
def analyze_sentiment(text):
    """Label `text` as "Positive", "Negative" or "Neutral".

    Missing or whitespace-only text is labelled "Neutral". Otherwise the
    label follows the sign of TextBlob's sentiment polarity: above zero is
    "Positive", below zero is "Negative", exactly zero is "Neutral".
    """
    if pd.isna(text) or not text.strip():
        return "Neutral"

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"

# Label every extracted text with its sentiment
data['sentiment'] = data['extracted_text'].apply(analyze_sentiment)

# Distribution of labels, plus up to five sample rows for each label
sentiment_counts = data['sentiment'].value_counts()
examples_positive, examples_negative, examples_neutral = (
    data[data['sentiment'] == label].head(5)
    for label in ("Positive", "Negative", "Neutral")
)

sentiment_counts, examples_positive, examples_negative, examples_neutral


(sentiment
 Positive    348
 Neutral     100
 Negative     52
 Name: count, dtype: int64,
                                 company  \
 0  FORTUNEMAKER INVESTMENTS CORPORATION   
 1  FORTUNEMAKER INVESTMENTS CORPORATION   
 2  FORTUNEMAKER INVESTMENTS CORPORATION   
 4  FORTUNEMAKER INVESTMENTS CORPORATION   
 5  FORTUNEMAKER INVESTMENTS CORPORATION   
 
                                                  url  \
 0  https://www.forbes.com/sites/daveknox/2022/07/...   
 1                               https://fortune.com/   
 2  https://foe.org/blog/2014-04-the-schwarzenegge...   
 4  https://www.globalwitness.org/en/archive/schwa...   
 5  https://sku.is/patrick-schwarzenegger-talks-cp...   
 
                                       extracted_text  related_keywords  \
 0  Why Brain Health Is Patrick Schwarzenegger’s L...          No match   
 1  Fortune - Fortune 500 Daily & Breaking Busines...              debt   
 2  The Schwarzenegger dilemma:Years of investing ...          No match   


Code to Analyze Domain Patterns

In [None]:
# Extract the host part of each URL (everything between '//' and the next '/').
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
data['domain'] = data['url'].str.extract(r'//([^/]+)', expand=False)

# Domain frequency across all records
domain_counts = data['domain'].value_counts()

# Domain frequency restricted to rows flagged by keywords or by a suspicious URL
high_risk_domains = data[data['contains_high_risk_keywords'] | data['suspicious_url']]
high_risk_domain_counts = high_risk_domains['domain'].value_counts()

# Align both counts on the domain index. Domains with no high-risk rows come
# out as NaN, so fill with 0 and cast back to int (NaN would otherwise leave
# the column as float and render counts as "0.0").
domain_analysis = pd.DataFrame({
    'Overall Count': domain_counts,
    'High-Risk Count': high_risk_domain_counts
}).fillna(0).astype(int)

# Preview the domains with the most high-risk rows first. The sort is for
# display only; `domain_analysis` itself keeps its original order.
domain_analysis.sort_values(
    by=['High-Risk Count', 'Overall Count'], ascending=False, kind='stable'
).head(10)  # Adjust the number of rows to display more results if needed


Unnamed: 0_level_0,Overall Count,High-Risk Count
domain,Unnamed: 1_level_1,Unnamed: 2_level_1
4scic.com,2,0.0
about.gitlab.com,1,0.0
actionfacilities.com,1,0.0
allianceflaxlinenhemp.eu,1,0.0
angelikafilmcenter.com,1,0.0
apps.apple.com,1,0.0
apps3.web.maine.gov,1,0.0
bbs.fobshanghai.com,3,0.0
bonustrade.company,1,0.0
bonusworldtradelimited.com,1,0.0


Code to Save Results for Download

In [None]:
# Persist the per-domain counts; the domain index is written as the first column
domain_csv = 'domain_analysis.csv'
domain_analysis.to_csv(domain_csv)

# Hand the file to the browser through Colab's download helper
from google.colab import files
files.download(domain_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

5. Duplicate Management
Investigate the 90 duplicate companies:
Are duplicates the same entity, or do they represent different contexts?

In [None]:
def _unique_keywords(keyword_strings):
    """Return the sorted distinct keywords found in a Series of comma-joined strings.

    Each entry (e.g. "court, violation") is split on commas and stripped.
    Missing entries are skipped, so NaN never appears in the result.
    Sentinel values such as "No match" are kept as-is.
    """
    unique = set()
    for entry in keyword_strings.dropna():
        unique.update(part.strip() for part in str(entry).split(',') if part.strip())
    # Sorting gives a deterministic order; plain set iteration order is not
    return sorted(unique)


# Keep every row whose company name occurs more than once
duplicate_companies = data[data.duplicated(subset=['company'], keep=False)]

# Per-company summary of how often it appears and in how many distinct contexts
duplicate_summary = duplicate_companies.groupby('company').agg(
    Count=('company', 'size'),  # Total rows for the company
    UniqueURLs=('url', 'nunique'),  # Distinct URLs associated with the company
    RiskLevels=('risk_level', 'nunique'),  # Number of distinct risk levels
    Keywords=('related_keywords', _unique_keywords)  # Distinct individual keywords
).sort_values(by='Count', ascending=False, kind='stable')  # stable: ties keep group order

# Display the top 10 duplicated companies
duplicate_summary.head(10)


Unnamed: 0_level_0,Count,UniqueURLs,RiskLevels,Keywords
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8808 HOLDING LIMITED,10,10,3,"[court, investigation, investigation, debt, No..."
PRESTIGE INTERNATIONAL CORP.,10,10,3,"[court, No match, nan]"
KENT DEVELOPMENT LIMITED,10,10,1,[No match]
KIND INTERNATIONAL INC.,10,10,2,"[court, litigation, No match]"
LAKE STREET INVESTMENTS LTD.,10,10,2,"[litigation, court, debt, No match]"
LONSDALE LIMITED,10,10,2,"[No match, court, insolvency]"
"MEIHO INTERNATIONAL CO., LTD.",10,10,1,[No match]
NECY CONSULTANTS LIMITED,10,10,2,"[court, No match]"
NEW IDEA LIMITED,10,10,2,"[litigation, penalty, No match]"
"NINGBO RAPID INTERNATIONAL TRADING CO., LTD.",10,10,2,"[sanctions, No match]"


Save Results to a File

In [None]:
# Write the per-company duplicate summary; the company index becomes the first column
duplicates_csv = 'duplicate_companies_analysis.csv'
duplicate_summary.to_csv(duplicates_csv)

# Offer the file for download from the Colab runtime
from google.colab import files
files.download(duplicates_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import requests

# Function to query the Google Knowledge Graph API
def get_google_knowledge_graph_data(company_name, api_key, timeout=10):
    """Look up `company_name` in the Google Knowledge Graph Search API.

    Args:
        company_name: Free-text query sent as the `query` parameter.
        api_key: API key sent as the `key` parameter.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, `requests.get` can block indefinitely.

    Returns:
        The decoded JSON response on HTTP 200. Otherwise a dict with a single
        "error" key describing the failure, for both non-200 responses and
        request-level errors such as timeouts or connection failures.
    """
    url = "https://kgsearch.googleapis.com/v1/entities:search"
    params = {
        'query': company_name,
        'key': api_key,
        'limit': 1,  # Number of results to fetch
        'indent': True  # Format the response for readability
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures through the same error-dict contract as bad statuses
        return {"error": f"API request failed: {exc}"}
    if response.status_code == 200:
        return response.json()  # Return the API response
    return {"error": f"API request failed with status code {response.status_code}"}

# Example usage
import os

# Read the key from the environment so it is not committed with the notebook;
# "xxx" is only a placeholder used when GOOGLE_API_KEY is not set.
api_key = os.environ.get("GOOGLE_API_KEY", "xxx")
company_name = "Google"

# Stored under its own name: assigning to `data` here would overwrite the
# results DataFrame that the analysis cells above operate on.
kg_response = get_google_knowledge_graph_data(company_name, api_key)
kg_response


{'@context': {'goog': 'http://schema.googleapis.com/',
  'resultScore': 'goog:resultScore',
  'kg': 'http://g.co/kg',
  'detailedDescription': 'goog:detailedDescription',
  '@vocab': 'http://schema.org/',
  'EntitySearchResult': 'goog:EntitySearchResult'},
 '@type': 'ItemList',
 'itemListElement': [{'result': {'image': {'url': 'https://en.m.wikipedia.org/wiki/File:Google_2015_logo.svg',
     'contentUrl': 'https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSA6xeoqtQcudXUieRkMQgiB48BrQP5tZQ6K3wenlV7FNsccVWA'},
    'description': 'IT corporation',
    '@id': 'kg:/m/045c7b',
    'detailedDescription': {'url': 'https://en.wikipedia.org/wiki/Google',
     'articleBody': 'Google LLC is an American-based multinational corporation and technology company focusing on online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, consumer electronics, and artificial intelligence. ',
     'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_

1. Load the CSV File and Extract Company Names
Here’s the code to load the CSV and extract the first 20 company names:

In [None]:
import pandas as pd

# Read the Offshore Leaks entity table
df = pd.read_csv('Offshore Leaks-entities.csv', low_memory=False)

# Take the first 20 values of the 'name' column as a plain Python list
company_names = df['name'].iloc[:20].tolist()

# Show the names that will be queried
print("Company Names:", company_names)


Company Names: ['TIANSHENG INDUSTRY AND TRADING CO., LTD.', 'NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.', 'HOTFOCUS CO., LTD.', 'SKY-BLUE GIFTS & TOYS CO., LTD.', 'FORTUNEMAKER INVESTMENTS CORPORATION', '8808 HOLDING LIMITED', 'KENT DEVELOPMENT LIMITED', 'BONUS TRADE LIMITED', 'AMARANDAN LTD.', 'NEW IDEA LIMITED', 'HUGH POWER LIMITED', 'ULTIMATE GROUP LIMITED', 'VICTORY GROUP LIMITED', 'CHARTER MARK LIMITED', 'Wide International Trading Co., Ltd.', 'HTSS ET CAPITAL LIMITED', 'JIE LUN INVESTMENT LIMITED', 'FORTUNE PALACE LIMITED', 'LAKE STREET INVESTMENTS LTD.', 'NINGBO RAPID INTERNATIONAL TRADING CO., LTD.']


2. Query the Google Knowledge Graph API for Each Company
Now, use the extracted names to query the API.

In [None]:
import requests

# Google Knowledge Graph API function
def get_google_knowledge_graph_data(company_name, api_key, timeout=10):
    """Query the Google Knowledge Graph Search API for a single company name.

    Args:
        company_name: Text sent as the `query` parameter.
        api_key: API key sent as the `key` parameter.
        timeout: Seconds to wait for the HTTP request, so a stalled
            connection cannot hang the whole query loop.

    Returns:
        The decoded JSON body on HTTP 200. For any other status, or when the
        request itself fails, a dict with a single "error" key.
    """
    url = "https://kgsearch.googleapis.com/v1/entities:search"
    params = {
        'query': company_name,
        'key': api_key,
        'limit': 1,  # Fetch only the top result
        'indent': True  # Pretty print the response
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the error-dict contract for timeouts and connection failures too
        return {"error": f"API request failed: {exc}"}
    if response.status_code == 200:
        return response.json()
    return {"error": f"API request failed with status code {response.status_code}"}

# API key: taken from the GOOGLE_API_KEY environment variable so the secret is
# not stored in the notebook; "xxx" is only a placeholder fallback.
import os

api_key = os.environ.get("GOOGLE_API_KEY", "xxx")

# Query the API once for every name in `company_names`
enriched_data = []
for company in company_names:
    result = get_google_knowledge_graph_data(company, api_key)
    enriched_data.append({
        "Company Name": company,
        "API Result": result
    })

# Display the results
for item in enriched_data:
    print(f"Company: {item['Company Name']}")
    print(f"Result: {item['API Result']}")


Company: TIANSHENG INDUSTRY AND TRADING CO., LTD.
Result: {'@context': {'EntitySearchResult': 'goog:EntitySearchResult', '@vocab': 'http://schema.org/', 'kg': 'http://g.co/kg', 'detailedDescription': 'goog:detailedDescription', 'goog': 'http://schema.googleapis.com/', 'resultScore': 'goog:resultScore'}, '@type': 'ItemList', 'itemListElement': []}
Company: NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.
Result: {'@context': {'kg': 'http://g.co/kg', 'goog': 'http://schema.googleapis.com/', 'resultScore': 'goog:resultScore', 'detailedDescription': 'goog:detailedDescription', '@vocab': 'http://schema.org/', 'EntitySearchResult': 'goog:EntitySearchResult'}, '@type': 'ItemList', 'itemListElement': []}
Company: HOTFOCUS CO., LTD.
Result: {'@context': {'detailedDescription': 'goog:detailedDescription', '@vocab': 'http://schema.org/', 'kg': 'http://g.co/kg', 'goog': 'http://schema.googleapis.com/', 'resultScore': 'goog:resultScore', 'EntitySearchResult': 'goog:EntitySearchResult'}, '@type': 'ItemLi

3. Save Results to a File
If you want to save the results for further analysis:

In [None]:
import json

# Dump the full API responses as pretty-printed JSON
with open('enriched_results.json', 'w') as json_file:
    json.dump(enriched_data, json_file, indent=4)

# Build a compact CSV summary: the company name plus the first 100 characters
# of the stringified API result
summary_names = [entry["Company Name"] for entry in enriched_data]
summary_snippets = [str(entry["API Result"])[:100] for entry in enriched_data]
results_summary = pd.DataFrame({
    "Company Name": summary_names,
    "API Result Summary": summary_snippets
})
results_summary.to_csv('enriched_results.csv', index=False)

# Offer both files for download in Colab
from google.colab import files
for output_name in ('enriched_results.json', 'enriched_results.csv'):
    files.download(output_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Updated Code with Google Knowledge Graph Integration**

In [None]:
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
from requests.exceptions import RequestException, SSLError

# API credentials: read from the environment so they are never saved with the
# notebook; 'xxx' is only a placeholder used when a variable is not set.
import os

google_api_key = os.environ.get('GOOGLE_API_KEY', 'xxx')
google_cse_id = os.environ.get('GOOGLE_CSE_ID', 'xxx')

# Load the entity table and take the first 5 company names.
# Widen the slice (e.g. [0:100]) to analyse more companies.
df = pd.read_csv('Offshore Leaks-entities.csv', low_memory=False)
company_names = df['name'][0:5].tolist()

# Keywords to search for in the scraped page text
keywords = [
    "court", "criminal case", "accusation", "crime", "corruption", "penalty",
    "investigation", "insolvency", "debt", "violation", "arrested", "sanctions",
    "litigation", "shell company", "blackmail"
]

# Function to perform a Google Custom Search and collect the result items
def google_search(search_term, api_key, cse_id, num_pages=1):
    """Run a Custom Search query and return the items from `num_pages` pages.

    Args:
        search_term: Query string passed as `q`.
        api_key: Developer key used to build the Custom Search service.
        cse_id: Search engine id passed as `cx`.
        num_pages: Number of consecutive result pages to request.

    Returns:
        A list with the 'items' entries of every page, in page order.
        Pages without an 'items' key contribute nothing.
    """
    results_per_page = 10  # Custom Search returns at most 10 items per request
    service = build("customsearch", "v1", developerKey=api_key)
    results = []
    for page in range(num_pages):
        # `start` is the 1-based index of the first result: 1, 11, 21, ...
        # (the previous `page * 1` asked for 0, 1, 2, ... and so re-fetched
        # overlapping results whenever num_pages > 1)
        start_index = page * results_per_page + 1
        res = service.cse().list(q=search_term, cx=cse_id, start=start_index).execute()
        results.extend(res.get('items', []))
        time.sleep(1)  # Avoid rate limits by adding a delay
    return results

# Function to fetch data from the Google Knowledge Graph API
def get_google_knowledge_graph_data(company_name, api_key, timeout=10):
    """Fetch the top Knowledge Graph entity for `company_name`.

    Args:
        company_name: Text sent as the `query` parameter.
        api_key: API key sent as the `key` parameter.
        timeout: Seconds to wait for the HTTP request, so one stalled lookup
            cannot block the whole pipeline.

    Returns:
        The decoded JSON body on HTTP 200. None for any other status or when
        the request itself fails (timeout, connection error, ...).
    """
    url = "https://kgsearch.googleapis.com/v1/entities:search"
    params = {
        'query': company_name,
        'key': api_key,
        'limit': 1,  # Fetch only the top result
        'indent': True  # Pretty print the response
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Treat network failures like a failed lookup, matching the None contract
        return None
    if response.status_code == 200:
        return response.json()
    return None

# Download a page and return the text of its <p> elements
def extract_text_from_url(url):
    """Fetch `url` and return the space-joined text of all its <p> tags.

    Returns an empty string, after printing a diagnostic, when the response
    status is not 200 or when the request raises an SSL or other requests
    error.

    NOTE(review): the request is sent with `verify=False`, i.e. TLS
    certificate verification is disabled. Confirm this is intentional.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    try:
        response = requests.get(url, headers=request_headers, verify=False, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url} with status code {response.status_code}")
            return ""
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        return ' '.join(paragraph.text for paragraph in paragraphs)
    except SSLError as e:
        print(f"SSL Error when accessing {url}: {e}")
        return ""
    except RequestException as e:
        print(f"Request failed for {url}: {e}")
        return ""

# Function to check if text contains any keywords
def contains_keywords(text, keywords):
    """List the keywords that occur in `text`.

    Matching is a case-insensitive substring test, so "court" also matches
    inside longer words such as "courtesy".

    Args:
        text: Text to search. Non-string values (None, NaN) count as no text.
        keywords: Iterable of keyword strings.

    Returns:
        The matched keywords joined with ", " in the order given by
        `keywords`, or "No match" when nothing matches.
    """
    if not isinstance(text, str):
        return "No match"
    # Lower-case the (possibly large) page text once, not once per keyword
    text_lower = text.lower()
    matched_keywords = [kw for kw in keywords if kw.lower() in text_lower]
    return ", ".join(matched_keywords) if matched_keywords else "No match"

# Map a matched-keyword string to a risk label and numeric code
def classify_risk(related_keywords):
    """Classify the risk implied by a comma-joined keyword string.

    Returns:
        ("High Risk", 2) when the lower-cased string contains any of
        "sanctions", "criminal", "crime", "corruption" or "shell company".
        ("Low Risk", 0) when the string is exactly "No match" or
        "No text extracted.". ("Medium Risk", 1) for anything else.
    """
    lowered = related_keywords.lower()
    for severe_term in ("sanctions", "criminal", "crime", "corruption", "shell company"):
        if severe_term in lowered:
            return "High Risk", 2

    if related_keywords in ("No match", "No text extracted."):
        return "Low Risk", 0
    return "Medium Risk", 1

# Main process
def _knowledge_graph_summary(kg_data):
    """Return the top entity's article body from a Knowledge Graph response, or "N/A"."""
    if not kg_data:
        return "N/A"
    items = kg_data.get('itemListElement') or []
    if not items:
        return "N/A"
    return items[0].get('result', {}).get('detailedDescription', {}).get('articleBody', 'N/A')


result_columns = [
    'company', 'url', 'extracted_text', 'related_keywords',
    'risk_level', 'risk_code', 'knowledge_graph_info'
]

# Collect plain dict records and build the DataFrame once at the end;
# concatenating inside the loop re-copies the whole frame for every row.
records = []

for company_name in company_names:
    # Google Search
    results = google_search(company_name, google_api_key, google_cse_id, num_pages=1)
    if not results:
        continue

    # The Knowledge Graph summary depends only on the company, so fetch it
    # once per company rather than once per search result.
    kg_data = get_google_knowledge_graph_data(company_name, google_api_key)
    kg_summary = _knowledge_graph_summary(kg_data)

    # Delay to avoid rate limits
    time.sleep(1)

    for result in results:
        link = result.get('link')
        extracted_text = extract_text_from_url(link)
        related_keywords = contains_keywords(extracted_text, keywords)

        # Determine risk level and risk code
        risk_level, risk_code = classify_risk(related_keywords)

        records.append({
            'company': company_name,
            'url': link,
            'extracted_text': extracted_text,
            'related_keywords': related_keywords,
            'risk_level': risk_level,
            'risk_code': risk_code,
            'knowledge_graph_info': kg_summary
        })

# Passing `columns` keeps the header stable even when no rows were collected
data = pd.DataFrame(records, columns=result_columns)

# Save the results to a CSV file
data.to_csv('100_company_analysis_with_kg_results.csv', index=False, escapechar='\\', quoting=csv.QUOTE_MINIMAL)

print("Analysis complete. Results saved to '100_company_analysis_with_kg_results.csv'.")




Failed to fetch https://www.echemi.com/shop-us20211025100945397/index.html with status code 403




Failed to fetch https://transparint.com/documents/Panama_Papers_Entity_Names_List_2.txt with status code 406




Failed to fetch https://www.thelancet.com/cms/10.1016/S2666-5247(22)00387-1/attachment/f5d96b72-ad53-4d60-8c94-6cb3d89776ab/mmc6.xlsx with status code 403




Failed to fetch https://www.sec.gov/Archives/edgar/data/1780731/000121390023040481/f20f2022_sunrisenew.htm with status code 403




Failed to fetch https://hotfocusinc.com/ with status code 406




Failed to fetch https://www.indeed.com/cmp/Hot-Focus/reviews with status code 403




Failed to fetch https://www.faire.com/brand/b_9y45lmodfz with status code 403




Failed to fetch https://www.reddit.com/r/TikTokCringe/comments/16r2k8n/how_much_of_drew_barrymores_fortune_comes_from/ with status code 403




Failed to fetch https://www.inc.com/jeff-haden/arnold-schwarzenegger-just-gave-perfect-reason-to-keep-your-full-time-job-while-starting-your-own-business.html with status code 403




Analysis complete. Results saved to '100_company_analysis_with_kg_results.csv'.
