In [None]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import string
from tqdm import tqdm

In [None]:
# Browser-like User-Agent sent with every request made by fetch_page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}


def fetch_page(url, retries=5):
    """Download ``url`` and return the response body as text.

    The request is attempted up to ``retries`` times. After each failed
    attempt except the last one, the function waits before trying again;
    the wait starts at 1 second and doubles every time.

    Parameters
    ----------
    url : str
        Address of the page to download.
    retries : int, optional
        Maximum number of attempts (default 5).

    Returns
    -------
    str or None
        The response text on success, or ``None`` when every attempt failed
        (connection error, timeout, or an HTTP error status).
    """
    delay = 1  # seconds to wait before the next attempt
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # turn 4xx/5xx statuses into exceptions
            return response.text
        except requests.exceptions.RequestException as e:
            if attempt == retries - 1:
                # No attempt follows, so waiting here would only waste time.
                print(f"Error {e}")
                break
            print(f"Error {e} ..retrying in {delay}")
            time.sleep(delay)
            delay *= 2  # exponential back-off
    print(f"Error fetching url after {retries} tries")
    return None
    

In [None]:
# Download the investor listing page and parse it.
url = "https://ticker.finology.in/investor"
html_content = fetch_page(url)
if html_content is None:
    # fetch_page returns None once all its retries have failed; stop here with
    # a clear message instead of letting BeautifulSoup choke on None.
    raise RuntimeError(f"Could not download the investor listing page: {url}")
soup = BeautifulSoup(html_content, "html.parser")

In [None]:
# Collect [name, profile URL] for every investor card on the listing page.
# A card without an <h4> or without a linked <a> gets NaN for that field.
investor_list = []
for card in soup.find_all("div", class_="col-12 col-md-3"):
    heading = card.h4
    investor_name = heading.text if heading is not None else np.nan

    anchor = card.a
    href = anchor.get("href") if anchor is not None else None
    if href is not None:
        investor_profile_link = "https://ticker.finology.in" + href
    else:
        investor_profile_link = np.nan

    investor_list.append([investor_name, investor_profile_link])

# One row per investor card, in page order.
df = pd.DataFrame(investor_list, columns=["Investor_Name", "Profile_link"])

In [None]:
# Load the cached investor list. When the cache file does not exist yet,
# create it from the `df` scraped in the previous cell (rows with missing
# values dropped) and read it back, so both paths yield the same frame layout.
try:
    df = pd.read_csv("Investor_link_list.csv")
except FileNotFoundError:
    df.dropna().to_csv("Investor_link_list.csv")
    df = pd.read_csv("Investor_link_list.csv")

In [None]:

# Visit every investor profile page and collect one row per table row found
# in the first <tbody> of that page.
#
# table_list is created once, before the loop, so rows from all investors
# accumulate in it.
table_list = []

for profile_url in tqdm(df["Profile_link"]):
    # Skip investors whose profile link is missing.
    if pd.isna(profile_url):
        continue

    profile_page = fetch_page(profile_url)
    if profile_page is None:
        # fetch_page gave up after its retries: record a placeholder row.
        table_list.append([np.nan, np.nan, np.nan, np.nan])
        continue

    profile_soup = BeautifulSoup(profile_page, "html.parser")
    table = profile_soup.find("tbody")
    if table is None:
        # Page without a table body: record a placeholder row.
        table_list.append([np.nan, np.nan, np.nan, np.nan])
        continue

    for table_row in table.find_all("tr"):
        # Absolute link taken from the first <a> of the row, NaN if absent.
        link_element = table_row.find("a")
        href = link_element.get("href") if link_element is not None else None
        if href is not None:
            lnk = "https://ticker.finology.in" + href
        else:
            lnk = np.nan

        cells = table_row.find_all("td")
        if len(cells) > 2:
            # Second cell's text and the link come first; the third cell's
            # text is then split on whitespace into one value per token.
            # "Filing Due" is joined with an underscore beforehand so that it
            # stays a single token.
            t_row_lst = [cells[1].text, lnk]
            values_text = (
                cells[2].text
                .replace("\n", "")
                .replace("\r", "")
                .replace("Filing Due", "Filing_Due")
            )
            t_row_lst.extend(values_text.split())
        else:
            t_row_lst = [np.nan, lnk, np.nan, np.nan]

        table_list.append(t_row_lst)


In [None]:
# Build the holdings frame from the scraped rows, save it, then preview it.
# The preview is the last expression of the cell so that the notebook
# actually renders it.
df_invest = pd.DataFrame(
    table_list,
    columns=["Comp_name", "Comp_link", "Q1", "Q2", "Q3", "Q4", "Q5", "Value"],
)
df_invest.to_csv("SuperInvestorsHolding.csv")
df_invest.head()