<a href="https://colab.research.google.com/github/NataKrj/AI-project-2024/blob/main/web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-search-results
!pip install requests beautifulsoup4



In [65]:
import csv
import os
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import pandas as pd

# Input: company names are read from the 'name' column of the entities CSV.
file_path = 'nodes-entities.csv'
df = pd.read_csv(file_path, usecols=['name'])
# Missing names are dropped and duplicates collapsed; only the first 10
# distinct names are kept.
companies = df['name'].dropna().unique()[:10]

# Terms combined one at a time with each company name to form a search query.
keywords = [
    "court",
    "criminal case",
    "accusation",
    "crime",
    "corruption",
    "penalty",
    "investigation",
    "insolvency",
    "debt",
    "violation",
    "arrested",
    "sanctions",
    "litigation",
    "shell company",
    "blackmail",
]

# Output location: the directory is created here so the results CSV can be
# opened for writing later without a missing-folder error.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)
links_file = os.path.join(output_dir, "matched_links.csv")

def duckduckgo_search(query):
    """Fetch the DuckDuckGo HTML results page for a search query.

    Args:
        query: Plain-text search terms; they are URL-encoded here before
            being placed in the request URL.

    Returns:
        The response body as text, or None when the request fails with a
        ``requests.RequestException`` (including the HTTP error raised by
        ``raise_for_status``). Failures are reported on stdout.
    """
    # Keep the caller's text untouched so the error message below shows the
    # readable search terms rather than their percent/plus-encoded form.
    encoded_query = urllib.parse.quote_plus(query)
    url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    try:
        # The timeout bounds how long a single stalled request can block the run.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching results for query: {query}, Error: {e}")
        return None
    return response.text

def parse_links_duckduckgo(html):
    """Collect the result hrefs from a DuckDuckGo HTML results page.

    Args:
        html: Markup of the results page.

    Returns:
        A list holding the ``href`` value of every ``<a>`` element that has
        the ``result__a`` class and an ``href`` attribute.
    """
    document = BeautifulSoup(html, 'html.parser')
    result_anchors = document.find_all('a', class_='result__a', href=True)
    return [anchor['href'] for anchor in result_anchors]

def search_company(company, keyword):
    """Search for one company/keyword pair and return the first result link.

    Args:
        company: Company name to search for.
        keyword: Term appended to the company name in the query.

    Returns:
        The first link parsed from the results page, or the string
        "No relevant link found" when the fetch yields nothing or the page
        contains no result links.
    """
    search_terms = f"{company} {keyword}"
    print(f"Searching for: {search_terms}")

    page = duckduckgo_search(search_terms)
    if not page:
        return "No relevant link found"

    found = parse_links_duckduckgo(page)
    return found[0] if found else "No relevant link found"

def main():
    """Search every company/keyword pair and record the first hit in a CSV.

    Writes ``links_file`` with a header row followed by one
    (Company, Keyword, Source) row per completed search. A pair whose
    processing raises is reported on stdout and produces no row.
    """
    with open(links_file, mode='w', newline='', encoding='utf-8') as csv_handle:
        link_writer = csv.writer(csv_handle)
        link_writer.writerow(["Company", "Keyword", "Source"])

        # Same visiting order as two nested loops: every keyword for one
        # company before moving on to the next company.
        search_pairs = (
            (company, keyword)
            for company in companies
            for keyword in keywords
        )
        for company, keyword in search_pairs:
            try:
                first_link = search_company(company, keyword)
                link_writer.writerow([company, keyword, first_link])
                # Pause between consecutive searches to reduce the chance of
                # being blocked by the search engine.
                time.sleep(3)
            except Exception as e:
                print(f"Error searching for {company} with keyword {keyword}: {e}")


if __name__ == "__main__":
    main()

Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. court
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. criminal case
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. accusation
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. crime
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. corruption
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. penalty
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. investigation
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. insolvency
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. debt
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. violation
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. arrested
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. sanctions
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. litigation
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. shell company
Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD. blackmail
Searching 

In [79]:
import os
import json
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import csv
from urllib.parse import quote

# Input: company names are read from the 'name' column of the entities CSV.
file_path = 'nodes-entities.csv'
df = pd.read_csv(file_path, usecols=['name'])
# Missing names are dropped and duplicates collapsed; only the first 10
# distinct names are kept.
companies = df['name'].dropna().unique()[:10]

# Terms OR-ed into each company's query and matched against result text.
keywords = [
    "court",
    "criminal case",
    "accusation",
    "crime",
    "corruption",
    "penalty",
    "investigation",
    "insolvency",
    "debt",
    "violation",
    "arrested",
    "sanctions",
    "litigation",
    "shell company",
    "blackmail",
]

# Destination CSV for the collected search results.
output_file = "company_search_results.csv"

# Request headers sent with every search so it looks like a browser visit.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.121 Safari/537.36"
    ),
}

def search_company_with_keywords(company, keywords):
    """Search Google for a company together with a set of keywords.

    Builds one query of the form ``"<company>" "<kw1>" OR "<kw2>" ...``,
    fetches the results page using the module-level ``headers`` and extracts
    one record per ``div.tF2Cxc`` result block.

    Args:
        company: Company name; it is wrapped in double quotes in the query.
        keywords: List of keyword strings. They are OR-ed together in the
            query and matched case-insensitively against each result's
            title and snippet.

    Returns:
        A list of dicts with the keys "company", "matching_keywords"
        (comma-separated matches, or "None" when nothing matched) and "link".
        The list is empty when the request fails, the response status is not
        200, or no result block has both a title and a link.
    """
    # Use quotes around company name to improve accuracy and handle special characters
    query = f'"{company}" ' + ' OR '.join([f'"{kw}"' for kw in keywords])
    url = f"https://www.google.com/search?q={quote(query)}"
    try:
        # Without a timeout a single stalled connection would hang the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Report and skip this company instead of aborting, so results
        # gathered for other companies still reach the output file.
        print(f"Failed to fetch results for {company}: {e}")
        return []
    if response.status_code != 200:
        print(f"Failed to fetch results for {company}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    search_results = []

    for g in soup.find_all('div', class_='tF2Cxc'):
        # Look each element up once instead of once for the test and again for the value.
        title_tag = g.find('h3')
        link_tag = g.find('a')
        snippet_tag = g.find('span', class_='aCOpRe')

        title = title_tag.text if title_tag else ""
        # .get() avoids a KeyError on an anchor that has no href attribute.
        link = link_tag.get('href', "") if link_tag else ""
        snippet = snippet_tag.text if snippet_tag else ""

        if title and link:
            # Lower-case once per result; title and snippet stay separate so a
            # keyword cannot match across the boundary between them.
            title_lower = title.lower()
            snippet_lower = snippet.lower()
            matching_keywords = ', '.join(
                kw for kw in keywords
                if kw.lower() in title_lower or kw.lower() in snippet_lower
            )
            search_results.append({
                "company": company,
                "matching_keywords": matching_keywords if matching_keywords else "None",
                "link": link
            })

    return search_results

def main():
    """Search every company and save all collected results to ``output_file``.

    Results are accumulated in memory across all companies and written in a
    single pass at the end, as a CSV with the columns company,
    matching_keywords and link.
    """
    collected = []

    for company in companies:
        print(f"Searching for: {company}")
        collected += search_company_with_keywords(company, keywords)
        # Pause between searches to avoid being blocked by Google.
        time.sleep(2)

    columns = ["company", "matching_keywords", "link"]
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        results_writer = csv.DictWriter(csvfile, fieldnames=columns)
        results_writer.writeheader()
        results_writer.writerows(collected)

    print(f"Results saved to {output_file}")


if __name__ == "__main__":
    main()

Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD.
Searching for: NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.
Searching for: HOTFOCUS CO., LTD.
Searching for: SKY-BLUE GIFTS & TOYS CO., LTD.
Searching for: FORTUNEMAKER INVESTMENTS CORPORATION
Searching for: 8808 HOLDING LIMITED
Searching for: KENT DEVELOPMENT LIMITED
Searching for: BONUS TRADE LIMITED
Searching for: AMARANDAN LTD.
Searching for: NEW IDEA LIMITED
Results saved to company_search_results.csv


In [78]:
import csv
import json
import os
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Input: company names are read from the 'name' column of the entities CSV.
file_path = 'nodes-entities.csv'
df = pd.read_csv(file_path, usecols=['name'])
# Missing names are dropped and duplicates collapsed; only the first 10
# distinct names are kept.
companies = df['name'].dropna().unique()[:10]

# Terms OR-ed into each company's query and matched against result text.
keywords = [
    "court",
    "criminal case",
    "accusation",
    "crime",
    "corruption",
    "penalty",
    "investigation",
    "insolvency",
    "debt",
    "violation",
    "arrested",
    "sanctions",
    "litigation",
    "shell company",
    "blackmail",
]

# Destination CSV for the collected search results.
output_file = "company_search_results.csv"

# Request headers sent with every search so it looks like a browser visit.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.121 Safari/537.36"
    ),
}

def search_company_with_keywords(company, keywords):
    """Search Google for a company together with a set of keywords.

    Builds one query of the form ``"<company>" "<kw1>" OR "<kw2>" ...``,
    fetches the results page using the module-level ``headers`` and extracts
    one record per distinct link found in the ``div.tF2Cxc`` result blocks.

    Args:
        company: Company name; it is wrapped in double quotes in the query.
        keywords: List of keyword strings. They are OR-ed together in the
            query and matched case-insensitively against each result's
            title and snippet.

    Returns:
        A list of dicts with the keys "company", "matching_keywords"
        (comma-separated matches, or "None" when nothing matched) and "link",
        with at most one entry per link. The list is empty when the request
        fails, the response status is not 200, or no result block has both a
        title and a link.
    """
    # Use quotes around company name to improve accuracy and handle special characters
    query = f'"{company}" ' + ' OR '.join([f'"{kw}"' for kw in keywords])
    url = f"https://www.google.com/search?q={quote(query)}"
    try:
        # Without a timeout a single stalled connection would hang the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Report and skip this company instead of aborting, so results
        # gathered for other companies still reach the output file.
        print(f"Failed to fetch results for {company}: {e}")
        return []
    if response.status_code != 200:
        print(f"Failed to fetch results for {company}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    # Keyed by link to remove duplicates: a repeated link keeps the position
    # of its first occurrence and the record of its last occurrence.
    results_by_link = {}

    for g in soup.find_all('div', class_='tF2Cxc'):
        # Look each element up once instead of once for the test and again for the value.
        title_tag = g.find('h3')
        link_tag = g.find('a')
        snippet_tag = g.find('span', class_='aCOpRe')

        title = title_tag.text if title_tag else ""
        # .get() avoids a KeyError on an anchor that has no href attribute.
        link = link_tag.get('href', "") if link_tag else ""
        snippet = snippet_tag.text if snippet_tag else ""

        if title and link:
            # Lower-case once per result; title and snippet stay separate so a
            # keyword cannot match across the boundary between them.
            title_lower = title.lower()
            snippet_lower = snippet.lower()
            matching_keywords = ', '.join(
                kw for kw in keywords
                if kw.lower() in title_lower or kw.lower() in snippet_lower
            )
            results_by_link[link] = {
                "company": company,
                "matching_keywords": matching_keywords if matching_keywords else "None",
                "link": link
            }

    return list(results_by_link.values())

def main():
    """Run the search for each company and write the combined results CSV.

    Every company in ``companies`` is searched in turn with the shared
    ``keywords`` list; the per-company result lists are concatenated and
    written to ``output_file`` once all searches have finished.
    """
    gathered_rows = []

    for company in companies:
        print(f"Searching for: {company}")
        company_rows = search_company_with_keywords(company, keywords)
        gathered_rows += company_rows
        # Pause between searches to avoid being blocked by Google.
        time.sleep(2)

    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_out = csv.DictWriter(
            csvfile,
            fieldnames=["company", "matching_keywords", "link"],
        )
        csv_out.writeheader()
        csv_out.writerows(gathered_rows)

    print(f"Results saved to {output_file}")


if __name__ == "__main__":
    main()

Searching for: TIANSHENG INDUSTRY AND TRADING CO., LTD.
Searching for: NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.
Searching for: HOTFOCUS CO., LTD.
Searching for: SKY-BLUE GIFTS & TOYS CO., LTD.
Searching for: FORTUNEMAKER INVESTMENTS CORPORATION
Searching for: 8808 HOLDING LIMITED
Searching for: KENT DEVELOPMENT LIMITED
Searching for: BONUS TRADE LIMITED
Searching for: AMARANDAN LTD.
Searching for: NEW IDEA LIMITED
Results saved to company_search_results.csv
