In [1]:
from  bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json
import re
import datetime

import time

from tqdm import tqdm
import os
import logging
import unicodedata
from unidecode import unidecode
#from Database import MongoDB
# Request headers sent with every page fetch: a desktop Firefox User-Agent
# plus an English language preference.
Headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
    'Accept-language': 'en-US , en;q=0.5',
}

# Listing page whose table links to the individual IPO detail pages.
URL = "https://www.chittorgarh.com/report/mainboard-ipo-list-in-india-bse-nse/83/"

# A single IPO detail page (not referenced by the cells visible here;
# presumably kept as a sample for manual experiments).
PRD_URL = "https://www.chittorgarh.com/ipo/netweb-technologies-india-ipo/1459/"

In [2]:
def get_webpage_soup(URL, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    URL : str
        Address of the page to fetch. The module-level ``Headers`` dict is
        sent with the request.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.RequestException
        If the request fails or the server answers with an HTTP error status.
        The failure is logged first and then re-raised; previously it was
        swallowed and the function crashed on the undefined ``webpage``
        variable, hiding the real cause.
    """
    try:
        webpage = requests.get(URL, headers=Headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        webpage.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"-Page Unavailable : {e}")
        raise
    return BeautifulSoup(webpage.content, "html.parser")

In [3]:
# Fetch and parse the IPO listing page configured as URL in the first cell.
soup = get_webpage_soup(URL)

In [4]:
# Locate the listing table. `find` returns None when no element matches
# (for example after a layout change), so guard before calling `find_all`.
table = soup.find("table", attrs={'class': 'table table-bordered table-striped table-hover w-auto'})
if table is None:
    logging.error("IPO listing table not found on the page; no links collected")
    links = []
else:
    links = table.find_all('a')

# Keep only the anchors that point at an individual IPO detail page.
IPO_LINK_PREFIX = "https://www.chittorgarh.com/ipo/"
href_list = [
    href
    for href in (anchor.get('href') for anchor in links)
    if href and href.startswith(IPO_LINK_PREFIX)
]



In [5]:


def _cell_text(cell):
    """Return the stripped text of a table cell, or "NA" if it has no usable text."""
    try:
        return cell.text.strip()
    except AttributeError:
        # Covers a cell object without `.text` or with `.text` set to None.
        return "NA"


def soup_table_data(table):
    """Flatten an HTML table into a ``{first cell: last cell}`` dictionary.

    For every ``<tr>`` the text of the first ``<th>``/``<td>`` becomes the key
    (spaces replaced by underscores) and the text of the last cell becomes the
    value. A row with a single cell therefore maps that cell to itself, rows
    without cells are skipped, and a later row with the same key overwrites an
    earlier one.

    Values are reduced to plain ASCII: accented letters are decomposed (NFKD)
    and every remaining non-ASCII character, including the rupee sign, is
    dropped. Keys are not sanitised.

    Parameters
    ----------
    table : object or None
        A parsed table element exposing ``find_all`` (a BeautifulSoup tag in
        this notebook), or ``None`` when no table was found.

    Returns
    -------
    dict
        The extracted pairs; ``{"No Data Available": "No Data"}`` when
        ``table`` is ``None`` and ``{"No Data Available": "No data"}`` when
        the table has no rows.
    """
    if table is None:
        return {"No Data Available" : "No Data"}

    rows = table.find_all('tr')
    if not rows:
        return {"No Data Available" : "No data"}

    data = {}
    for row in rows:
        cells = row.find_all(['th', 'td'])
        if not cells:
            continue
        key = _cell_text(cells[0]).replace(' ', '_')
        # Only the last cell is kept: the original per-cell loop overwrote the
        # same key for every cell, so its final result was always the last one.
        value = _cell_text(cells[-1])
        # After this step the value is pure ASCII, which made the former regex
        # and unidecode passes no-ops.
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        data[key] = value
    return data


In [6]:

def extract_ipo_data(soup):
    """Extract every detail table of an IPO page and save it as ``<company>.json``.

    The company name is taken from the page's ``<h2 itemprop="about">`` heading
    with the trailing " Details" removed. All tables carrying the site's
    standard table classes are flattened with ``soup_table_data`` and merged
    into one dictionary (later tables overwrite duplicate keys), which is then
    written to the current working directory.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed IPO detail page.

    Returns
    -------
    None
        The result is written to disk. If the heading is missing the page is
        logged and skipped instead of failing with an ``AttributeError``.
    """
    heading = soup.find('h2', itemprop='about', class_='border-bottom')
    if heading is None:
        logging.error("IPO heading not found on the page; nothing written")
        return

    company_name = heading.text.replace(" Details", '').strip()
    tables = soup.find_all("table", attrs={'class':"table table-bordered table-striped table-hover w-auto"})

    all_table_data = {"Company Name": company_name}
    for table in tables:
        all_table_data.update(soup_table_data(table))

    # Characters such as "/" or ":" are not valid in file names on common
    # platforms; names without them map to the same file as before.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', company_name)
    # ensure_ascii=False may emit non-ASCII characters, so fix the encoding
    # rather than relying on the platform default.
    with open(f'{safe_name}.json', 'w', encoding='utf-8') as file:
        json.dump(all_table_data, file, indent=4, ensure_ascii= False)

In [7]:
# Scrape every IPO detail page. A failure on one page (network error,
# unexpected layout) is logged and skipped so it cannot abort the whole batch.
for link in tqdm(href_list, desc="Scraping IPO pages"):
    try:
        ipo_soup = get_webpage_soup(link)
        extract_ipo_data(ipo_soup)
    except Exception as e:
        logging.error(f"Skipping {link}: {e}")
    

In [111]:
import pandas as pd
import requests
import json
from pandas import json_normalize


# Imported here because this cell carries its own imports and is meant to run
# on its own; pd.read_html needs a file-like object (see below).
from io import StringIO

url = 'https://www.chittorgarh.com/ipo/tvs-supply-chain-solutions-ipo/1475/'
Headers=({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0' , 'Accept-language':'en-US , en;q=0.5'})

# Bound the wait and fail on HTTP errors instead of parsing an error page.
response = requests.get(url, headers=Headers, timeout=30)
response.raise_for_status()

# Passing literal HTML to read_html is deprecated in recent pandas releases;
# wrapping the text in StringIO works on old and new versions alike.
tables = pd.read_html(StringIO(response.text))

# One entry per scraped table: "table_1", "table_2", ... -> list of row dicts.
data = {
    f"table_{position}": table_df.to_dict(orient='records')
    for position, table_df in enumerate(tables, start=1)
}

# The files are written with ensure_ascii=False, so characters such as the
# rupee sign are emitted verbatim; an explicit UTF-8 encoding keeps that from
# failing on platforms whose default encoding cannot represent them.
with open('tables_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii= False)

print("Tables scraped and stored in JSON format.")

# All rows of all tables as one flat list, in table order.
combined_data = [record for records in data.values() for record in records]

with open('combined_tables_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(combined_data, json_file, indent=4 , ensure_ascii = False)
original_data = combined_data
transformed_data = {}

# Route each row to a section based on which columns it carries.
for item in original_data:
    if 0 in item and 1 in item:
        # Header-less two-column table: column 0 is the label, column 1 the value.
        transformed_data[item[0]] = item[1]
    elif 'Application' in item:
        # Lot-size table: one entry per application type.
        application_info = {
            'Lots': item['Lots'],
            'Shares': item['Shares'],
            'Amount': item['Amount']
        }
        transformed_data.setdefault('Applications', []).append(
            {item['Application']: application_info}
        )
    elif 'KPI' in item:
        # Key performance indicators: name -> value.
        kpi_value = item['Values']
        transformed_data.setdefault('Key_Performance_Indicators', {})[item['KPI']] = kpi_value
    elif 'Category' in item:
        # Subscription status per investor category.
        subscription = item['Subscription (times)']
        transformed_data.setdefault('Subscription_Details', []).append(
            {item['Category']: subscription}
        )
    elif 'Review By' in item:
        # Recommendation counts per reviewer group.
        review_data = {
            'Subscribe': item['Subscribe'],
            'Neutral': item['Neutral'],
            'Avoid': item['Avoid']
        }
        transformed_data.setdefault('Reviews', {})[item['Review By']] = review_data
    # Rows from any other table shape are intentionally ignored.

with open('transformed_combined_tables_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(transformed_data, json_file, indent=4 , ensure_ascii = False)
# Reading the file back yields exactly what was persisted (JSON object keys
# are always strings), so the frame below matches the saved file.
with open('transformed_combined_tables_data.json', 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Flatten nested dictionaries into a single-row frame; lists stay as cells.
flattened_data = json_normalize(json_data, sep='_')

# Expand the "Applications" list into one column per application type and
# field. A page without that table simply contributes no such columns.
if 'Applications' in flattened_data.columns:
    applications = flattened_data.pop('Applications')
    application_entries = applications.iloc[0]
else:
    applications = None
    application_entries = []

for app in application_entries:
    # Each entry is a one-item dict: {application type: {field: value}}.
    k, app_data = next(iter(app.items()))
    if k == "Lot Size Calculator":
        continue
    for key, value in app_data.items():
        flattened_data[f'Applications_{k}_{key}'] = value

# Create a DataFrame
df = pd.DataFrame(flattened_data)

Tables scraped and stored in JSON format.


In [113]:
# Transpose the single-row frame so every field is displayed on its own line.
df.T

Unnamed: 0,0
IPO Date,"Aug 10, 2023 to Aug 14, 2023"
Listing Date,"Wednesday, 23 August 2023"
Face Value,₹1 per share
Price,₹187 to ₹197 per share
Lot Size,76 Shares
...,...
Applications_S-HNI (Max)_Shares,5016
Applications_S-HNI (Max)_Amount,"₹988,152"
Applications_B-HNI (Min)_Lots,67
Applications_B-HNI (Min)_Shares,5092
