In [33]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [9]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [30]:
# Scrape the listing page and build url_df: one row per company link found in
# the listing table, with the absolute URL ('URLs') and the link text ('Names').

# Starting URL to scrape the 20 URLs
starting_url = 'https://www.mse.mn/en/mse_top_20/266'

# Seconds to wait for the server; without a timeout requests.get can block
# the kernel indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Initialised before any network access so that url_df is always defined
# below, even when the request fails or the expected table is missing.
urls = []
names = []  # Text of each <a> tag, parallel to urls
listing_found = False

# Fetch the HTML content of the webpage
try:
    response = requests.get(starting_url, timeout=REQUEST_TIMEOUT)
except requests.RequestException as exc:
    # Connection errors and timeouts are reported like any other failed fetch
    response = None
    print(f"Failed to retrieve the webpage: {exc}")

if response is not None and response.status_code == 200:
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', class_='table dividend trade_table table-bordered table-striped table-hover table-condensed')
    tbody = table.find('tbody') if table is not None else None

    if table is None:
        print("No table with the specified class found.")
    elif tbody is None:
        print("No tbody found in the table.")
    else:
        listing_found = True
        for row in tbody.find_all('tr'):
            link = row.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # No anchor, or an anchor without href: there is nothing to
                # follow, and formatting None into a URL would be wrong.
                continue

            # urljoin resolves root-relative, relative and absolute hrefs
            # against the page they were found on.
            urls.append(urljoin(starting_url, href))
            names.append(link.get_text(strip=True))
elif response is not None:
    print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")

# Create DataFrame with URLs and Names columns (empty if scraping failed)
url_df = pd.DataFrame({'URLs': urls, 'Names': names})

if listing_found:
    # Print or display the DataFrame
    print(url_df)

    # Save the DataFrame to a CSV file; skipped on failure so that an earlier
    # good export is not overwritten with an empty one.
    url_df.to_csv('url_data_with_names.csv', index=False)


# Visit every company page listed in url_df, pull three cells out of each row
# of its trade-history table, and stack the per-company frames into combined_df.

# Seconds to wait for each company page; without a timeout requests.get can
# block the kernel indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Column layout shared by every per-company frame and by combined_df
HISTORY_COLUMNS = ['Highest Price', 'Volume', 'Date', 'Company']

# List to store DataFrames for each URL
dfs = []

# Loop over each row in the DataFrame and scrape data
for _, company in url_df.iterrows():
    url = company['URLs']
    company_name = company['Names']

    # Fetch the page; a network failure for one company must not abort the
    # remaining ones, so it is reported and skipped.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the webpage for {company_name}: {exc}")
        continue

    # Check if the request was successful (status code 200)
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
        continue

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table with the specified class
    table = soup.find('table', class_='table table-bordered trade_history_result table-striped table-hover table-condensed')
    if table is None:
        print(f"No table with the specified class found for {company_name}")
        continue

    # Find the table body
    tbody = table.find('tbody')
    if tbody is None:
        print(f"No tbody found in the table for {company_name}")
        continue

    # Extract the 2nd, 6th, and 8th 'td' of every sufficiently wide row.
    # The loop variable is deliberately not called `row`/`company`, so the
    # outer iteration variable is never shadowed.
    data = []
    for tr in tbody.find_all('tr'):
        columns = tr.find_all('td')
        if len(columns) >= 8:
            data.append({
                'Highest Price': columns[1].text.strip(),
                'Volume': columns[5].text.strip(),
                'Date': columns[7].text.strip(),
                'Company': company_name,
            })

    # Convert the list of dictionaries to a DataFrame with a fixed column order
    df = pd.DataFrame(data, columns=HISTORY_COLUMNS)

    # Only frames that actually contain rows take part in the final concat;
    # empty ones add nothing to the result.
    if data:
        dfs.append(df)

    # Optionally, you can display the DataFrame
    print(f"Data for {company_name}:\n{df}\n" + "="*40 + "\n")

# Combine all DataFrames into a single DataFrame. pd.concat raises ValueError
# on an empty list, so fall back to an empty frame with the expected columns.
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
else:
    combined_df = pd.DataFrame(columns=HISTORY_COLUMNS)

                                 URLs Names
0    https://www.mse.mn/en/company/90   APU
1   https://www.mse.mn/en/company/562  GLMT
2   https://www.mse.mn/en/company/326  AARD
3   https://www.mse.mn/en/company/546  ERDN
4   https://www.mse.mn/en/company/553   INV
5   https://www.mse.mn/en/company/564   SBM
6   https://www.mse.mn/en/company/557  CUMN
7   https://www.mse.mn/en/company/354   GOV
8   https://www.mse.mn/en/company/484   UID
9   https://www.mse.mn/en/company/458   TTL
10  https://www.mse.mn/en/company/549   TUM
11  https://www.mse.mn/en/company/551   MFC
12  https://www.mse.mn/en/company/547  MNDL
13  https://www.mse.mn/en/company/135   SUU
14  https://www.mse.mn/en/company/558  BOGD
15  https://www.mse.mn/en/company/550   ADB
16  https://www.mse.mn/en/company/548   AIC
17  https://www.mse.mn/en/company/541   MNP
18  https://www.mse.mn/en/company/561  SEND
19  https://www.mse.mn/en/company/554  BODI


In [31]:
url_df

Unnamed: 0,URLs,Names
0,https://www.mse.mn/en/company/90,APU
1,https://www.mse.mn/en/company/562,GLMT
2,https://www.mse.mn/en/company/326,AARD
3,https://www.mse.mn/en/company/546,ERDN
4,https://www.mse.mn/en/company/553,INV
5,https://www.mse.mn/en/company/564,SBM
6,https://www.mse.mn/en/company/557,CUMN
7,https://www.mse.mn/en/company/354,GOV
8,https://www.mse.mn/en/company/484,UID
9,https://www.mse.mn/en/company/458,TTL


Data for APU:
     Highest Price  Volume        Date Company
0            3,700     510  2014-09-16     APU
1            3,700     290  2014-09-17     APU
2            3,660     250  2014-09-18     APU
3            3,750     300  2014-09-19     APU
4            3,700   4,015  2014-09-22     APU
...            ...     ...         ...     ...
2273         1,119   8,203  2023-12-06     APU
2274         1,117  55,971  2023-12-07     APU
2275         1,117   8,843  2023-12-08     APU
2276         1,113  16,861  2023-12-11     APU
2277         1,112   7,658  2023-12-12     APU

[2278 rows x 4 columns]

Data for GLMT:
    Highest Price     Volume        Date Company
0           1,450  1,219,005  2022-12-09    GLMT
1           1,300    733,133  2022-12-12    GLMT
2           1,285    530,583  2022-12-12    GLMT
3           1,230    630,639  2022-12-13    GLMT
4           1,200    184,979  2022-12-14    GLMT
..            ...        ...         ...     ...
251           844      7,989  2023-12-