In [28]:
import os
import io
import re
import PyPDF2
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import display
import google.generativeai as genai

In [20]:
# Load environment variables from the .env file (if one is present).
load_dotenv()

# Read the Gemini key and fail early with a readable message when it is missing.
# Assigning None into os.environ would otherwise raise an opaque
# "TypeError: str expected, not NoneType".
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Also expose the key as GOOGLE_API_KEY, as this cell has always done.
os.environ['GOOGLE_API_KEY'] = gemini_api_key

# Configure the API client with the key.
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Prompt sent verbatim to the model (it contains no format placeholders).
prompt_template = """Can you help me name all competitor of CP Group = CPAll CPF CPXTRA TRUE?
                    For example "เบทาโกร" "ไทยยูเนี่ยน" "คาร์กิลล์" "สหฟาร์ม" "CJ" "ฟาร์มเฮ้าส์" "Go Wholesale"'

                    Please return in dataframe format with 3 columns 
                    1.CP Group - CPF, CPALL, CPXTRA, TRUE
                    2.Category - Agri, Food, Retail, Convenience, Wholesale, Hypermarket, Telecom, Digital
                    3.Competitor - Betagro, Thai Union, Saha Farm, Cargill, TFG, Tyson, JBS, FamilyMart, Lawson108, Mini Big C, Go Wholesale, Big C, Tops, AIS,  NT, 3BB

                    For Abbreviation Please extend it such as NT you should replace with National Telecom(NT)
                    and the dataframe should be distinct competitor (1 row 1 competitor , can't repete competitor even it has different category) 
                    """
model = genai.GenerativeModel('gemini-2.5-pro')

# One remote call; `response.text` holds the model's reply for the cells below.
response = model.generate_content(prompt_template)

E0000 00:00:1759379003.442645  630048 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [21]:
# Show the raw text of the model's reply (rendered as the cell output).
response.text

'Of course, here is the list of competitors for each CP Group company in the requested dataframe format.\n\nThe list ensures each competitor is mentioned only once, assigned to their primary area of competition against a specific CP entity.\n\n```\n| CP Group   | Category      | Competitor                   |\n|:-----------|:--------------|:-----------------------------|\n| CPF        | Agri & Food   | Betagro                      |\n| CPF        | Food          | Thai Union Group             |\n| CPF        | Agri & Food   | Cargill                      |\n| CPF        | Agri & Food   | Saha Farm                    |\n| CPF        | Agri & Food   | Thai Foods Group (TFG)       |\n| CPF        | Food          | CJ Group                     |\n| CPF        | Food          | Farmhouse (President Bakery) |\n| CPF        | Food          | Tyson Foods                  |\n| CPF        | Food          | JBS                          |\n| CPALL      | Convenience   | FamilyMart                 

In [24]:
# Keep only the markdown table rows. A table row starts with "|", which is
# also what the `.iloc[:, 1:-1]` below relies on (leading and trailing pipes
# produce one empty column on each side). Prose that merely contains a "|"
# is no longer mistaken for table content.
table_lines = [
    line.strip()
    for line in response.text.splitlines()
    if line.strip().startswith("|")
]

# Drop the header separator row (e.g. "|:-----|:-----|") by its content
# rather than by assuming it is always the first parsed row.
table_lines = [
    line
    for line in table_lines
    if not ("-" in line and set(line) <= set("|:- "))
]

if not table_lines:
    raise ValueError("No markdown table found in the model response.")

table_text = "\n".join(table_lines)

# Convert to dataframe. dtype=str keeps every column textual so the `.str`
# accessor below cannot fail on a column pandas would otherwise infer as numeric.
df = pd.read_csv(StringIO(table_text), sep="|", dtype=str).iloc[:, 1:-1]

# Clean column names and strip the padding spaces around every cell.
df.columns = df.columns.str.strip()
df = df.apply(lambda x: x.str.strip())

In [26]:
# Competitors mapped to a SET ticker: competitor name -> ticker symbol.
set_listed = dict(
    [
        ("Betagro", "BTG"),
        ("Thai Union Group", "TU"),
        ("Thai Foods Group (TFG)", "TFG"),
        ("Farmhouse (President Bakery)", "PB"),
        ("Big C", "BJC"),  # Berli Jucker owns Big C
        ("Tops (Central Food Retail)", "CRC"),  # Central Retail Corp
        ("Advanced Info Service (AIS)", "ADVANC"),
        ("CJ Group", "CBG"),  # Under Carabao Group
    ]
)

# Competitors treated as global players.
global_competitors = [
    "Cargill",
    "Tyson Foods",
    "JBS",
    "Lawson108",
]

# Domestic Thai companies that are treated as not listed.
thai_private = [
    "Saha Farm",
    "FamilyMart",
    "Mini Big C",
    "Go Wholesale",
    "National Telecom (NT)",
    "3BB (now part of AIS)",
]

# Helpers used to derive the Type and SET_ticker columns
def classify_type(comp):
    """Return the competitor's type label.

    The lookup tables are checked in order: ``set_listed`` gives "SET",
    ``global_competitors`` gives "Global", ``thai_private`` gives
    "Thai Private". A name found in none of them yields an empty string.
    """
    if comp in set_listed:
        return "SET"
    if comp in global_competitors:
        return "Global"
    return "Thai Private" if comp in thai_private else ""

def get_ticker(comp):
    """Return the ticker stored for ``comp`` in ``set_listed``, or "" when absent."""
    if comp in set_listed:
        return set_listed[comp]
    return ""

# Derive both new columns from the competitor name of each row.
competitor_names = df["Competitor"]
df["Type"] = competitor_names.apply(classify_type)
df["SET_ticker"] = competitor_names.apply(get_ticker)

In [35]:
# Works on the dataframe `df` built by the earlier cells.
# Flag every row whose competitor name contains "CJ" (case-insensitive);
# rows with a missing name are never flagged.
mask = df["Competitor"].str.contains("CJ", case=False, na=False)

# Move the flagged rows to CP Group "CPALL" / Category "Convenience".
df.loc[mask, "CP Group"] = "CPALL"
df.loc[mask, "Category"] = "Convenience"

# Order by CP Group, then Category, then Competitor, and renumber the index from 0.
sort_keys = ["CP Group", "Category", "Competitor"]
df_sorted = df.sort_values(by=sort_keys)
df = df_sorted.reset_index(drop=True)

In [36]:
# Display the current competitor dataframe (rendered as the cell output).
df

Unnamed: 0,CP Group,Category,Competitor,Type,SET_ticker
0,CPALL,Convenience,CJ Group,SET,CBG
1,CPALL,Convenience,FamilyMart,Thai Private,
2,CPALL,Convenience,Lawson108,Global,
3,CPALL,Convenience,Mini Big C,Thai Private,
4,CPF,Agri & Food,Betagro,SET,BTG
5,CPF,Agri & Food,Cargill,Global,
6,CPF,Agri & Food,Saha Farm,Thai Private,
7,CPF,Agri & Food,Thai Foods Group (TFG),SET,TFG
8,CPF,Food,Farmhouse (President Bakery),SET,PB
9,CPF,Food,JBS,Global,


In [29]:
# Keep only the rows whose Type is "SET".
set_df = df.loc[df["Type"] == "SET"]

# Build {Competitor: SET_ticker} from those rows, preserving row order.
tickers = {
    name: symbol
    for name, symbol in zip(set_df["Competitor"], set_df["SET_ticker"])
}

tickers

{'Betagro': 'BTG',
 'Thai Union Group': 'TU',
 'Thai Foods Group (TFG)': 'TFG',
 'CJ Group': 'CBG',
 'Farmhouse (President Bakery)': 'PB',
 'Big C': 'BJC',
 'Tops (Central Food Retail)': 'CRC',
 'Advanced Info Service (AIS)': 'ADVANC'}

In [30]:
def get_filing_urls(ticker):
    """
    Pull SET 'Company Filings' page and return URLs for One Reports (56-1).

    Parameters
    ----------
    ticker : str
        Ticker symbol used to build the filings page URL.

    Returns
    -------
    list of str
        Absolute URLs, in page order, of every anchor whose visible text
        mentions "56-1" or "One Report" (case-insensitive). Empty when no
        anchor matches.

    Raises
    ------
    requests.HTTPError
        If the filings page responds with an HTTP error status.

    NOTE(review): the recorded notebook output reports no links for "TU";
    the page may build its links client-side, in which case plain HTML
    scraping cannot see them -- confirm against the live page.
    """
    from urllib.parse import quote, urljoin

    # Escape the ticker so unexpected characters cannot alter the URL path.
    safe_ticker = quote(str(ticker).strip(), safe="")
    base_url = f"https://www.set.or.th/en/market/product/stock/quote/{safe_ticker}/company-news/filing"
    r = requests.get(base_url, timeout=10)
    # Surface 4xx/5xx responses instead of silently parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Match anchors by their visible text, ignoring case.
    keywords = ("56-1", "one report")
    links = []
    for a in soup.find_all("a", href=True):
        label = a.get_text().lower()
        if any(keyword in label for keyword in keywords):
            # Resolve relative hrefs so callers always receive a usable URL.
            links.append(urljoin(base_url, a["href"]))
    return links

In [31]:
def extract_risk_factors(pdf_url):
    """
    Download PDF and extract risk factors section by regex.

    Parameters
    ----------
    pdf_url : str
        URL of the PDF to download.

    Returns
    -------
    str
        The text from the first "Risk Factors" heading (matched in any
        letter case) up to, but not including, the next short line that
        starts with an uppercase ASCII letter. Returns
        "Risk Factors section not found." when nothing matches.

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.
    """
    response = requests.get(pdf_url, timeout=15)
    # Fail on an HTTP error rather than handing an error page to the PDF parser.
    response.raise_for_status()
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))

    # Join once instead of repeated concatenation; `or ""` guards against a
    # page that yields no text.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # Only the heading is matched case-insensitively. With a global
    # re.IGNORECASE the "[A-Z]" in the lookahead also matched lowercase
    # letters, so a short line starting with a lowercase letter ended the section too.
    match = re.search(
        r"((?i:Risk Factors).*?)(?=\n[A-Z][^\n]{0,50}\n)",
        text,
        re.DOTALL,
    )
    if match:
        return match.group(1).strip()
    return "Risk Factors section not found."

In [None]:
# # --- Run for one example company (e.g., TU) ---
# ticker = "TU"
# urls = get_filing_urls(ticker)

# if urls:
#     pdf_url = urls[0] if urls[0].startswith("http") else "https://www.set.or.th" + urls[0]
#     risks = extract_risk_factors(pdf_url)
#     print(f"Risk Factors for {ticker}:\n", risks[:2000])  # Print first 2000 chars
# else:
#     print(f"No One Report found for {ticker}")

No One Report found for TU
