<a href="https://colab.research.google.com/github/NataKrj/AI-project-2024/blob/main/20241109_Al%2Bproject_TSI_API_web_scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**WORKS WELL** — Google search, 1 page

In [None]:
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai
import time

# Credentials: replace each 'xxx' placeholder with a real value before running.
google_api_key = 'xxx'  # passed to the Custom Search client as developerKey
google_cse_id = 'xxx'   # search engine id, sent as the `cx` request parameter
openai.api_key = 'xxx'  # key used by the openai client

# Terms looked for in each page's text; matches are reported in the
# `related_keywords` column, in this order.
keywords = [
    "court",
    "criminal case",
    "accusation",
    "crime",
    "corruption",
    "penalty",
    "investigation",
    "insolvency",
    "debt",
    "violation",
    "arrested",
    "sanctions",
    "litigation",
    "shell company",
    "blackmail",
]

# Run a single Google Custom Search query
def google_search(search_term, api_key, cse_id, **kwargs):
    """Return the result items of one Custom Search request.

    Args:
        search_term: Query string, sent as ``q``.
        api_key: Google API key, used as ``developerKey``.
        cse_id: Custom Search Engine id, sent as ``cx``.
        **kwargs: Forwarded unchanged to ``cse().list`` (the caller in this
            cell passes ``num``).

    Returns:
        The response's ``items`` list, or an empty list when the response
        has no ``items`` key.
    """
    client = build("customsearch", "v1", developerKey=api_key)
    request = client.cse().list(q=search_term, cx=cse_id, **kwargs)
    response = request.execute()
    return response.get('items', [])

# Function to extract text from a URL
def extract_text_from_url(url, timeout=10):
    """Download *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the whole run.

    Returns:
        The paragraph texts joined by single spaces, or "" when the request
        fails or the response status is not 200 (the reason is printed).
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Request failed: {e}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch {url} with status code {response.status_code}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.text for p in soup.find_all('p'))

# Function to check if text contains any keywords
def contains_keywords(text, keywords):
    """Return the keywords that occur in *text* as one comma-separated string.

    Matching is a case-insensitive substring test, so a keyword is also
    reported when it appears inside a longer word. Matches keep the order of
    *keywords* and their original spelling.

    Returns:
        The matched keywords joined by ", ", or "No match" when none occur.
    """
    # Lowercase the (possibly very long) page text once, not once per keyword.
    haystack = text.lower()
    matched_keywords = [kw for kw in keywords if kw.lower() in haystack]
    return ", ".join(matched_keywords) if matched_keywords else "No match"

# Function to analyze text with GPT-4
def analyze_text_with_gpt(text, company_name, max_chars=12000):
    """Ask GPT-4 whether *text* is related to *company_name*.

    Args:
        text: Page text to send as the user message.
        company_name: Company named in the system prompt.
        max_chars: Maximum number of characters of *text* to send. Longer
            text is cut to this length so that a long web page does not
            overflow the model's context window and fail the whole request.
            Pass ``None`` to send the text unmodified.

    Returns:
        The model's reply with surrounding whitespace stripped, or
        "Analysis failed." when the OpenAI client raises an error (the error
        is printed).
    """
    if max_chars is not None:
        text = text[:max_chars]

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"Determine if the following text is related to {company_name}."},
                {"role": "user", "content": text}
            ],
            max_tokens=150
        )
        return response['choices'][0]['message']['content'].strip()
    except openai.error.OpenAIError as e:
        print(f"OpenAI API error: {e}")
        return "Analysis failed."

# Main process
company_name = 'SwedBank'
results = google_search(company_name, google_api_key, google_cse_id, num=10)

# One dict per search hit. The DataFrame is built once after the loop:
# concatenating inside the loop would re-copy every earlier row each time.
rows = []

for result in results:
    link = result['link']

    # Extract text from each URL
    extracted_text = extract_text_from_url(link)

    # Analyze extracted text with GPT-4 to determine relevance to the company
    if extracted_text:
        gpt_analysis = analyze_text_with_gpt(extracted_text, company_name)
        related_keywords = contains_keywords(extracted_text, keywords)
    else:
        gpt_analysis = "No text extracted."
        related_keywords = "No text extracted."

    rows.append({
        'company': company_name,
        'url': link,
        'extracted_text': extracted_text,
        'gpt_analysis': gpt_analysis,
        'related_keywords': related_keywords,  # matched keywords
    })

    # Delay to avoid rate limits
    time.sleep(1)  # Adjust delay if necessary

# Passing `columns` keeps the header row even when the search returned nothing.
data = pd.DataFrame(
    rows,
    columns=['company', 'url', 'extracted_text', 'gpt_analysis', 'related_keywords'],
)

# Save the results to a CSV file
data.to_csv('company_analysis_results.csv', index=False)
print(data)


**WORKS WELL** — Google search, 10 pages

In [None]:
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai
import time

# Credentials: replace each 'xxx' placeholder with a real value before running.
google_api_key = 'xxx'  # passed to the Custom Search client as developerKey
google_cse_id = 'xxx'   # search engine id, sent as the `cx` request parameter
openai.api_key = 'xxx'  # key used by the openai client

# Terms looked for in each page's text; matches are reported in the
# `related_keywords` column, in this order.
keywords = [
    "court",
    "criminal case",
    "accusation",
    "crime",
    "corruption",
    "penalty",
    "investigation",
    "insolvency",
    "debt",
    "violation",
    "arrested",
    "sanctions",
    "litigation",
    "shell company",
    "blackmail",
]

# Function to perform Google search and get multiple pages
def google_search(search_term, api_key, cse_id, num_pages=10):
    """Collect Custom Search result items across several result pages.

    Pages hold 10 results each and are requested one after another with a
    one-second pause between requests.

    Args:
        search_term: Query string, sent as ``q``.
        api_key: Google API key, used as ``developerKey``.
        cse_id: Custom Search Engine id, sent as ``cx``.
        num_pages: Number of pages to request. Values above 10 are capped,
            because the Custom Search JSON API documents a limit of 100
            results per query and rejects a ``start`` beyond it.

    Returns:
        A list with the ``items`` of every fetched page, in page order.
    """
    max_pages = 10  # 10 pages x 10 results = the API's 100-result ceiling
    service = build("customsearch", "v1", developerKey=api_key)
    results = []

    for page in range(min(num_pages, max_pages)):
        if page:
            # Pause only between consecutive requests, not after the last one.
            time.sleep(1)

        start_index = page * 10 + 1  # 1, 11, 21, ... for each page
        res = service.cse().list(q=search_term, cx=cse_id, start=start_index).execute()

        items = res.get('items')
        if not items:
            # An empty page means the results are exhausted; further
            # requests would only spend API quota.
            break
        results.extend(items)

    return results

# Function to extract text from a URL
def extract_text_from_url(url, timeout=10):
    """Download *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the whole run.

    Returns:
        The paragraph texts joined by single spaces, or "" when the request
        fails or the response status is not 200 (the reason is printed).
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Request failed: {e}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch {url} with status code {response.status_code}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.text for p in soup.find_all('p'))

# Function to check if text contains any keywords
def contains_keywords(text, keywords):
    """Return the keywords that occur in *text* as one comma-separated string.

    Matching is a case-insensitive substring test, so a keyword is also
    reported when it appears inside a longer word. Matches keep the order of
    *keywords* and their original spelling.

    Returns:
        The matched keywords joined by ", ", or "No match" when none occur.
    """
    # Lowercase the (possibly very long) page text once, not once per keyword.
    haystack = text.lower()
    matched_keywords = [kw for kw in keywords if kw.lower() in haystack]
    return ", ".join(matched_keywords) if matched_keywords else "No match"

# Function to analyze text with GPT-4
def analyze_text_with_gpt(text, company_name, max_chars=12000):
    """Ask GPT-4 whether *text* is related to *company_name*.

    Args:
        text: Page text to send as the user message.
        company_name: Company named in the system prompt.
        max_chars: Maximum number of characters of *text* to send. Longer
            text is cut to this length so that a long web page does not
            overflow the model's context window and fail the whole request.
            Pass ``None`` to send the text unmodified.

    Returns:
        The model's reply with surrounding whitespace stripped, or
        "Analysis failed." when the OpenAI client raises an error (the error
        is printed).
    """
    if max_chars is not None:
        text = text[:max_chars]

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"Determine if the following text is related to {company_name}."},
                {"role": "user", "content": text}
            ],
            max_tokens=150
        )
        return response['choices'][0]['message']['content'].strip()
    except openai.error.OpenAIError as e:
        print(f"OpenAI API error: {e}")
        return "Analysis failed."

# Main process
company_name = 'SWEDBANK BALTICS, AS'
results = google_search(company_name, google_api_key, google_cse_id, num_pages=10)

# One dict per search hit. The DataFrame is built once after the loop:
# concatenating inside the loop would re-copy every earlier row each time.
rows = []

for result in results:
    link = result['link']

    # Extract text from each URL
    extracted_text = extract_text_from_url(link)

    # Analyze extracted text with GPT-4 to determine relevance to the company
    if extracted_text:
        gpt_analysis = analyze_text_with_gpt(extracted_text, company_name)
        related_keywords = contains_keywords(extracted_text, keywords)
    else:
        gpt_analysis = "No text extracted."
        related_keywords = "No text extracted."

    rows.append({
        'company': company_name,
        'url': link,
        'extracted_text': extracted_text,
        'gpt_analysis': gpt_analysis,
        'related_keywords': related_keywords,  # matched keywords
    })

    # Delay to avoid rate limits
    time.sleep(1)  # Adjust delay if necessary

# Passing `columns` keeps the header row even when the search returned nothing.
data = pd.DataFrame(
    rows,
    columns=['company', 'url', 'extracted_text', 'gpt_analysis', 'related_keywords'],
)

# Save the results to a CSV file
data.to_csv('company_analysis_results.csv', index=False)
print(data)


**2024-11-18 — NEW fast working script**

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import string
import re
from googleapiclient.discovery import build
from requests.exceptions import RequestException, SSLError



# Credentials: replace each 'xxx' placeholder with a real value before running.
google_api_key = 'xxx'  # passed to the Custom Search client as developerKey
google_cse_id = 'xxx'   # search engine id, sent as the `cx` request parameter

# Company names are read from the `name` column of the entities CSV.
df = pd.read_csv('Offshore Leaks-entities.csv', encoding='utf-8', low_memory=False)
company_names = df['name'][0:20].tolist()  # first 20 rows only
# company_names = df['name'].tolist()  # alternative: every row in the file

# Terms looked for in each page's text; matches are reported in the
# `related_keywords` column, in this order.
keywords = [
    "court",
    "criminal case",
    "accusation",
    "crime",
    "corruption",
    "penalty",
    "investigation",
    "insolvency",
    "debt",
    "violation",
    "arrested",
    "sanctions",
    "litigation",
    "shell company",
    "blackmail",
]

def google_search(search_term, api_key, cse_id, start_index=1):
    """Return the Custom Search result items for *search_term*.

    Sends one request whose first result is number *start_index*. If the
    request raises, the error is printed and an empty list is returned, so a
    failed search looks like a search with no hits to the caller.
    """
    client = build("customsearch", "v1", developerKey=api_key)
    query = {"q": search_term, "cx": cse_id, "start": start_index}
    try:
        response = client.cse().list(**query).execute()
        return response.get('items', [])
    except Exception as e:
        print(f"Failed to search for {search_term} with error: {e}")
        return []

def extract_text_from_url(url):
    """Fetch *url* and return the text content of the page.

    Script, style, header, footer, form and nav elements and HTML comments
    are removed before the remaining strings are joined with single spaces.

    Returns:
        - the page text, for a 200 response whose Content-Type contains
          ``text/html``;
        - "Non-text content skipped", for a 200 response of any other
          Content-Type;
        - "", for any other status code or when the request fails (the
          failure is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept as in the original; confirm this is intended for scraping
        # hosts with broken certificates.
        response = requests.get(url, headers=headers, verify=False, timeout=10)
    except RequestException as e:
        # SSLError is a RequestException subclass, so one clause covers both.
        # The failure is printed rather than returned: the return value is
        # treated as page text by the caller, and an error message (which
        # embeds the URL) could contain keywords and skew the risk rating.
        print(f"Request failed for {url}: {e}")
        return ""

    if response.status_code != 200:
        return ""
    if 'text/html' not in response.headers.get('Content-Type', ''):
        return "Non-text content skipped"

    soup = BeautifulSoup(response.text, 'html.parser')
    for tag in soup(["script", "style", "header", "footer", "form", "nav"]):
        tag.extract()
    # `find_all(string=...)` is the current spelling of `findAll(text=...)`.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()
    return ' '.join(soup.stripped_strings)

def clean_text(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

def contains_keywords(text, keywords):
    """Return the keywords that occur in *text* as one comma-separated string.

    Matching is a case-insensitive substring test, so a keyword is also
    reported when it appears inside a longer word. Matches keep the order of
    *keywords* and their original spelling.

    Returns:
        The matched keywords joined by ", ", or "No match" when none occur.
    """
    # Lowercase the (possibly very long) page text once, not once per keyword.
    haystack = text.lower()
    matched_keywords = [kw for kw in keywords if kw.lower() in haystack]
    return ", ".join(matched_keywords) if matched_keywords else "No match"

def classify_risk(related_keywords):
    """Map a matched-keyword string to a ``(risk_level, risk_code)`` pair.

    Returns:
        ("High Risk", 2) when the string contains, ignoring case, any of the
        high-risk terms; ("Low Risk", 0) when it is "No match" or empty;
        ("Medium Risk", 1) for every other string.
    """
    high_risk_terms = ("sanctions", "criminal", "crime", "corruption", "shell company")
    lowered = related_keywords.lower()

    for term in high_risk_terms:
        if term in lowered:
            return "High Risk", 2

    if related_keywords in ("No match", ""):
        return "Low Risk", 0
    return "Medium Risk", 1

def process_company(company_name):
    """Search for *company_name* and build one result record per search hit.

    Each record is a dict with the keys ``company``, ``url``,
    ``extracted_text``, ``related_keywords``, ``risk_level`` and
    ``risk_code``. Hits whose content was skipped as non-text get a fixed
    "No Risk" record; all others are cleaned, keyword-matched and classified.

    Returns:
        The list of records, in search-result order.
    """
    company_data = []

    for hit in google_search(company_name, google_api_key, google_cse_id):
        url = hit['link']
        page_text = extract_text_from_url(url)

        # Non-text responses carry nothing to analyse: record and move on.
        if page_text == "Non-text content skipped":
            company_data.append({
                'company': company_name,
                'url': url,
                'extracted_text': "Skipped due to non-text content",
                'related_keywords': "None",
                'risk_level': "No Risk",
                'risk_code': 0
            })
            continue

        page_text = clean_text(page_text)
        matched = contains_keywords(page_text, keywords)
        level, code = classify_risk(matched)
        company_data.append({
            'company': company_name,
            'url': url,
            'extracted_text': page_text,
            'related_keywords': matched,
            'risk_level': level,
            'risk_code': code
        })

    return company_data

# Standard-library pieces used for the SQLite export below.
import sqlite3
from contextlib import closing

# Common prefix of every output file written by this cell.
output_stem = '1_0_20_company_analysis_results'

# Use ThreadPoolExecutor to process companies in parallel
data = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(process_company, name): name for name in company_names}
    for future in as_completed(futures):
        try:
            data.extend(future.result())
        except Exception as e:
            # One failing company must not discard the rows already collected
            # for the others; report it and keep going.
            print(f"Processing failed for {futures[future]}: {e}")

# Convert list of dicts to DataFrame
df_results = pd.DataFrame(data)

# Save the DataFrame to a CSV file with proper encoding and escaping
csv_path = f'{output_stem}.csv'
df_results.to_csv(csv_path, index=False, escapechar='\\', encoding='utf-8', quoting=csv.QUOTE_ALL)
print(f"Data saved to {csv_path}.")

# Additionally, save the DataFrame to a Parquet file
parquet_path = f'{output_stem}.parquet'
df_results.to_parquet(parquet_path, engine='pyarrow', compression='snappy')
print(f"Data saved to {parquet_path}.")

# Save the DataFrame to a JSON file (one record per line)
json_path = f'{output_stem}.json'
df_results.to_json(json_path, orient='records', lines=True, force_ascii=False)
print(f"Data saved to {json_path}.")

# Save the DataFrame to a SQLite database file. `to_sql` accepts a plain
# sqlite3 connection, so no SQLAlchemy engine is needed (the original called
# `create_engine`, which this notebook never imports).
db_path = f'{output_stem}_my_data.db'
with closing(sqlite3.connect(db_path)) as connection:
    # Replace 'table_name' with your desired table name
    df_results.to_sql('table_name', con=connection, index=False, if_exists='replace')

print("Data saved to the database.")
