In [29]:
import os
import pandas as pd
import glob
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import urllib
from utils.marketinsights import process_tables, get_city_mapping, get_contact_information, get_embeddings, get_industry, get_intersection, get_phone_mapping, label_country_by_city, label_country_by_phone, final_country, convert_to_datetime, AutoModel, AutoTokenizer

model_name = 'nomic-ai/nomic-embed-text-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

selection = 'marketinsights'
directory = "./utils/data/Scraped News/"
file_pattern = f"{selection}_data_*.csv"
files = glob.glob(os.path.join(directory, file_pattern))
print(files)
latest_file = max(files, key=os.path.getctime)
print(latest_file)

df = pd.read_csv(latest_file)

df_temp = df['tables'].apply(lambda x: pd.Series(process_tables(x)))
df_temp.columns = ['Executives', 'Shareholders']
df[['Executives', 'Shareholders']] = df_temp

print('Labelling Country and Industry')

try:
    existing = pd.read_csv('utils/data/Scrape/phoneextensions.csv')
except:
    existing = pd.DataFrame()
phone_storage = get_phone_mapping(existing)
city_storage = get_city_mapping()
df['Industry'] = df['raw'].apply(get_industry)
df['Contact Information'] = df['raw'].apply(get_contact_information)
df['Country_phone'] = df['Contact Information'].apply(lambda x: label_country_by_phone(x, phone_storage))
df['Country_city'] = df['Contact Information'].apply(lambda x: label_country_by_city(x, city_storage))
df['Country_candidates'] = df.apply(lambda x: get_intersection(x.Country_phone, x.Country_city), axis = 1)
df['Country'] = df.apply(lambda x: final_country(x['Contact Information'], x.Country_candidates, tokenizer, model), axis = 1)

# Apply the conversion functions
df['Time'] = df['date'].apply(convert_to_datetime)
df.drop(['date'], axis = 1)
df.to_csv('utils/data/Scraped News/marketinsights_data_2024-07-11.csv')

<All keys matched successfully>


['./utils/data/Scraped News/marketinsights_data_temp.csv']
./utils/data/Scraped News/marketinsights_data_temp.csv
Labelling Country and Industry


In [31]:
df.columns

Index(['Unnamed: 0', 'title', 'link', 'ticker', 'date', 'source',
       'Article content', 'raw', 'tables', 'Executives', 'Shareholders',
       'Industry', 'Contact Information', 'Country_phone', 'Country_city',
       'Country_candidates', 'Country', 'Time'],
      dtype='object')

In [30]:
test_content = df.iloc[5]['Article content']
print(test_content)

Certain A Shares of Nanjing Railway New Technology Co.,Ltd. are subject to a Lock-Up Agreement Ending on 30-JUN-2024. These A Shares will be under lockup for 1101 days starting from 25-JUN-2021 to 30-JUN-2024.
Details:
The company's controlling shareholder and the actual controller Liu Jun; Shareholder and Director Ji Yigen committed that within 36 months since the date of listing of the present shares, there will not be no transfers nor entrustment of shares to any third party nor repurchase by the Company. If, within 6 months after the issuer's listing, the closing price of the shares is lower than issuance price for 20 consecutive trading days or if trading price is lower than issuance price after 6 month from listing, lock-up period will be automatically extended for another 6 months. If there is any case of dividends, bonus shares, capitalization of capital reserve and other similar cases, issue price will be adjusted according to ex-dividend and ex-interests. After the expiry of 

In [26]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# initialize our model and tokenizer
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')


def get_sentiment(text):
    tokens = tokenizer.encode_plus(text, add_special_tokens=False)
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    # define our starting position (0) and window size (number of tokens in each chunk)
    start = 0
    window_size = 512
    
    # initialize probabilities list
    probs_list = []
    
    start = 0
    window_size = 510  # we take 2 off here so that we can fit in our [CLS] and [SEP] tokens
    
    loop = True
    
    while loop:
        end = start + window_size
        if end >= total_len:
            loop = False
            end = total_len
        # (1) extract window from input_ids and attention_mask
        input_ids_chunk = input_ids[start:end]
        attention_mask_chunk = attention_mask[start:end]
        # (2) add [CLS] and [SEP]
        input_ids_chunk = [101] + input_ids_chunk + [102]
        attention_mask_chunk = [1] + attention_mask_chunk + [1]
        # (3) add padding upto window_size + 2 (512) tokens
        input_ids_chunk += [0] * (window_size - len(input_ids_chunk) + 2)
        attention_mask_chunk += [0] * (window_size - len(attention_mask_chunk) + 2)
        # (4) format into PyTorch tensors dictionary
        input_dict = {
            'input_ids': torch.Tensor([input_ids_chunk]).long(),
            'attention_mask': torch.Tensor([attention_mask_chunk]).int()
        }
        # (5) make logits prediction
        outputs = model(**input_dict)
        # (6) calculate softmax and append to list
        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        probs_list.append(probs)
    
        start = end

    stacks = torch.stack(probs_list)
    shape = stacks.shape
    with torch.no_grad():
        # we must include our stacks operation in here too
        stacks = torch.stack(probs_list)
        # now resize
        stacks = stacks.resize_(stacks.shape[0], stacks.shape[2])
        # finally, we can calculate the mean value for each sentiment class
        mean = stacks.mean(dim=0)
    winner = torch.argmax(mean).item()
    result = ['Positive', 'Negative', 'Neutral'][winner]
    return result

In [32]:
df['Sentiment'] = df['Article content'].apply(get_sentiment)

In [35]:
for i, row in df.iterrows():
    print('Content: \n')
    print(row['Article content'])
    print('\nSentiment: \n')
    print(row['Sentiment'])
    print('**********')

Content: 

Cordiant Digital Infrastructure Limited announced the refinancing of its Eurobond facility and repayment of the ?30 million vendor loan note agreed in connection with the purchase of Speed Fibre. The Company announces the signing of a new ?200 million Eurobond facility ("New Eurobond"), which will refinance the existing ?200 million Eurobond ("Original Eurobond") signed by the Company's indirect subsidiary Cordiant Digital Holdings Two Limited in June 2022. This refinancing provides greater certainty and flexibility for the group by extending the maturity of CORD's holding company-level term debt from September 2026 to July 2029, with a bullet repayment structure.
The Company is also to make full repayment of the circa ?30 million vendor loan note issued as part of the acquisition of Speed Fibre, which completed in October 2023. The repayment will be made from existing cash resources and will provide the Company greater flexibility as it takes forward its growth plans for Sp

In [19]:
from bs4 import BeautifulSoup
import pandas as pd
import urllib
from datetime import datetime, timedelta
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

def scrape_site():
    # URL of the news page
    url = 'https://www.penews.com/deals/1'
    
    # Set up the Chrome driver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode (no GUI)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
        # Wait for the CAPTCHA to be solved manually (or handle it using a CAPTCHA-solving service)
    WebDriverWait(driver, 300).until(EC.presence_of_element_located((By.TAG_NAME, 'iframe')))

    # Switch to the iframe containing the content
    driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))

    # Scrape the content after solving the CAPTCHA
    content = driver.page_source
    soup = BeautifulSoup(content, 'html.parser')

    # Example: Find and print all deal titles (this will depend on the actual HTML structure)
    deals = soup.find_all('div', class_='deal-title')
    for deal in deals:
        print(deal.text)

    # Close the WebDriver
    driver.quit()
    print(soup)
    # Extract deal information
    deals = []
    for deal in soup.find_all('div', class_='deal-item'):
        title = deal.find('h2', class_='deal-title').text.strip()
        date = deal.find('span', class_='deal-date').text.strip()
        link = deal.find('a', class_='deal-link')['href']
        deals.append({'Title': title, 'Date': date, 'Link': link})
    
    return pd.DataFrame(deals)

# Example usage
df = scrape_site()
print(df)


TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x0000000104c9aa80 chromedriver + 4385408
1   chromedriver                        0x0000000104c9338c chromedriver + 4354956
2   chromedriver                        0x00000001048b0b0c chromedriver + 281356
3   chromedriver                        0x00000001048f32f8 chromedriver + 553720
4   chromedriver                        0x000000010492bd24 chromedriver + 785700
5   chromedriver                        0x00000001048e7eec chromedriver + 507628
6   chromedriver                        0x00000001048e88c4 chromedriver + 510148
7   chromedriver                        0x0000000104c6243c chromedriver + 4154428
8   chromedriver                        0x0000000104c66ea0 chromedriver + 4173472
9   chromedriver                        0x0000000104c47ff8 chromedriver + 4046840
10  chromedriver                        0x0000000104c6778c chromedriver + 4175756
11  chromedriver                        0x0000000104c3afb8 chromedriver + 3993528
12  chromedriver                        0x0000000104c8521c chromedriver + 4297244
13  chromedriver                        0x0000000104c85398 chromedriver + 4297624
14  chromedriver                        0x0000000104c92f84 chromedriver + 4353924
15  libsystem_pthread.dylib             0x0000000194dcaf94 _pthread_start + 136
16  libsystem_pthread.dylib             0x0000000194dc5d34 thread_start + 8


In [20]:
df['Country_phone']

KeyError: 'Country_phone'

In [24]:
from serpapi import GoogleSearch
import pandas as pd
import streamlit as st

def search_company_management(company_name, api_key):
    # Define common terms used for management page
    management_terms = [
        "Management Team", "Executive Team", "Leadership", 
        "Our Team", "Meet Our Leaders", "Key Executives", 
        "Board of Directors", "Leadership Team", "Management", 
        "Executive Leadership"
    ]
    
    # Construct query with various combinations
    queries = [f"{company_name} {term}" for term in management_terms]
    
    params = {
        "q": queries,
        "engine": "google",
        "google_domain": "google.com",
        "num": 10,
        "api_key": api_key,
    }

    search = GoogleSearch(params)
    results = search.get_dict()
    return results

# Example usage
api_key = st.secrets['serp_api_key']
company_name = 'Tesla'
search_results = search_company_management(company_name, api_key)
print(search_results)

{'search_metadata': {'id': '668f70652fe302778f1f50b3', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/89d86f10bc77b42d/668f70652fe302778f1f50b3.json', 'created_at': '2024-07-11 05:40:53 UTC', 'processed_at': '2024-07-11 05:40:53 UTC', 'google_url': 'https://www.google.com/search?q=Tesla+Our+Team&oq=Tesla+Our+Team&num=10&sourceid=chrome&ie=UTF-8', 'raw_html_file': 'https://serpapi.com/searches/89d86f10bc77b42d/668f70652fe302778f1f50b3.html', 'total_time_taken': 4.9}, 'search_parameters': {'engine': 'google', 'q': 'Tesla Our Team', 'google_domain': 'google.com', 'num': '10', 'device': 'desktop'}, 'search_information': {'query_displayed': 'Tesla Our Team', 'total_results': 145000000, 'time_taken_displayed': 0.3, 'organic_results_state': 'Results for exact spelling'}, 'inline_images': [{'link': 'https://www.google.com/search?sca_esv=7e710dd86bdccc50&sca_upv=1&q=Tesla+Our+Team&udm=2&source=univ&fir=nTdDJQf2hOFcBM%252CCz6qAmM1lOhX4M%252C_%253BRzAR60HxXZNLuM%252C93KeLC3u7