In [12]:
!pip install 'lxml[html_clean]' requests beautifulsoup4 pandas python-dotenv requests-html


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import time
import asyncio
from dotenv import load_dotenv
import os

# Load the environment variables from the .env file
load_dotenv()

# Access the username and password from the environment variables
username = os.getenv('USERNAME')
password = os.getenv('PASSWORD')

# Create an async function for login and scraping
async def create_session_and_login():
    # Create an AsyncHTMLSession object
    session = AsyncHTMLSession()

    # URL of the login page
    login_url = "https://www.marketscreener.com/login/?location=/"

    # Replace with the actual form data required for login
    payload = {
        'login': username,  # Replace with your actual email
        'password': password
    }

    # Headers (optional, but can help mimic browser behavior)
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Submit login form
    response = await session.post(login_url, data=payload, headers=headers)

    # Render JavaScript (if required by the page)
    await response.html.arender()

    # Check if login was successful
    if "logout" in response.text:
        print("Login successful!")
    else:
        print("Login failed. Check your credentials or login process.")

    return session

# Async function to get recommendation URLs from a page
async def get_recommendation_urls_from_page(session, url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    # Use session.get to maintain the login state
    page = await session.get(url, headers=headers)
    
    # Render JavaScript content (important for dynamically-loaded content)
    await page.html.arender()

    # Use BeautifulSoup to parse the rendered content
    soup = BeautifulSoup(page.html.html, 'html.parser')

    # Print session cookies for debugging
    print(session.cookies.get_dict())

    # Find all recommendation links (adjust the HTML structure if necessary)
    links = soup.find_all('a', href=True)
    recommendation_urls = []
    
    for link in links:
        href = link['href']
        # Only include links with '/quote/stock/' and '/news/' but exclude '/news/hot-news/'
        if '/quote/stock/' in href and '/news/' in href and '/news/hot-news/' not in href:
            full_url = 'https://www.marketscreener.com' + href
            recommendation_urls.append(full_url)

    return recommendation_urls

# Async function to loop through multiple pages and get recommendation URLs
async def get_all_recommendation_urls(base_url, cf_param, session, max_pages=10):
    all_recommendation_urls = []
    
    for p in range(1, max_pages + 1):
        # Correctly formatted URL with both p and cf parameters
        page_url = f"{base_url}?p={p}&cf={cf_param}"
        print(f"Scraping page {p}: {page_url}")
        
        # Get recommendation URLs from the current page
        recommendation_urls = await get_recommendation_urls_from_page(session, page_url)
        
        # If no URLs are found on this page, stop the loop
        if not recommendation_urls:
            print(f"No more recommendations found at page {p}. Stopping.")
            break
        
        all_recommendation_urls.extend(recommendation_urls)
        await asyncio.sleep(2)  # Be polite to the server by adding a small delay

    return all_recommendation_urls


# Run everything in an async event loop
async def main():
    timeframe_cf_param = 'aVQwZTQzL1hkU0JOTloyNTNWTERkK0dzUlJtNWk0VjdHaHhrMk9LOXVSUkRpMXBLa3o2b0xhcHNzT3ZwdnF3VDhEbjhTelpKMlNMcitDNFNORE5aT2REL0NsZWlENytrK2EySElPUTh6U3R6MDhmdm10UkdPSUtmQldWTnhvVnY'

    base_url = 'https://www.marketscreener.com/news/companies/recommandations/'
    cf_param = timeframe_cf_param

    # Create session and log in
    session = await create_session_and_login()

    # Get all recommendation URLs
    all_recommendation_urls = await get_all_recommendation_urls(base_url, cf_param, session, max_pages=10)

    print(f"Total recommendation URLs found: {len(all_recommendation_urls)}")

# Execute the main function in the Jupyter Notebook event loop
await main()


In [14]:

# Run everything in an async event loop
async def main():
    timeframe_cf_param = 'aVQwZTQzL1hkU0JOTloyNTNWTERkK0dzUlJtNWk0VjdHaHhrMk9LOXVSUkRpMXBLa3o2b0xhcHNzT3ZwdnF3VDhEbjhTelpKMlNMcitDNFNORE5aT2REL0NsZWlENytrK2EySElPUTh6U3R6MDhmdm10UkdPSUtmQldWTnhvVnY'

    base_url = 'https://www.marketscreener.com/news/companies/recommandations/'
    cf_param = timeframe_cf_param

    # Create session and log in
    session = await create_session_and_login()

    # Get all recommendation URLs
    all_recommendation_urls = await get_all_recommendation_urls(base_url, cf_param, session, max_pages=10)

    print(f"Total recommendation URLs found: {len(all_recommendation_urls)}")

# Execute the main function in the Jupyter Notebook event loop
await main()

Login successful!
Scraping page 1: https://www.marketscreener.com/news/companies/recommandations/?p=1&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkK0dzUlJtNWk0VjdHaHhrMk9LOXVSUkRpMXBLa3o2b0xhcHNzT3ZwdnF3VDhEbjhTelpKMlNMcitDNFNORE5aT2REL0NsZWlENytrK2EySElPUTh6U3R6MDhmdm10UkdPSUtmQldWTnhvVnY
{'PHPSESSID': 'tt0pbao2c91s416bnhh1uagtne', 'zb_auth': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImFrSlhUVVkyYmtSTGVFeFlhMmRyYXpReVYwc3hRVDA5IiwiaWF0IjoxNzI5NDk4NjY2fQ.A_czE15yar8RSavrMmRtUMpkhQqHGo0Jd7aQpCb4aZ4', 'zb_membre': '1', 'pv_r0': '3', 'pv_r0_date': '2024-10-14', 'hmv': 'a53106eceacf4ae7a1d0de999b4ff8f5dce62aaa', 'pv_r0_rand': '12'}
Scraping page 2: https://www.marketscreener.com/news/companies/recommandations/?p=2&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkK0dzUlJtNWk0VjdHaHhrMk9LOXVSUkRpMXBLa3o2b0xhcHNzT3ZwdnF3VDhEbjhTelpKMlNMcitDNFNORE5aT2REL0NsZWlENytrK2EySElPUTh6U3R6MDhmdm10UkdPSUtmQldWTnhvVnY
{'PHPSESSID': 'tt0pbao2c91s416bnhh1uagtne', 'zb_auth': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6ImFrSlhUVVkyYm

In [55]:
# Assuming you've already set up the session after logging in
# The session object should persist across all requests

async def scrape_recommendation_page(url, session):
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    try:
        # Use session.get instead of requests.get to maintain the login state
        page = await session.get(url, headers=headers)

        # Render JavaScript (if required)
        await page.html.arender()

        # Parse the page with BeautifulSoup
        soup = BeautifulSoup(page.html.html, 'html.parser')
    except Exception as e:
        print(f"Error fetching page {url}: {e}")
        return None
    
    # Extract the published date
    date_div = soup.find('div', class_='c-6 mb-15')
    published_date = date_div.get_text().strip() if date_div else 'No Date'
    
    # Check if the published date is in "September ... 2024"
    if re.search(r'September.*2024', published_date) or re.search(r'\d{2}\.09\.2024', published_date) or re.search(r'\d{2}\.10\.2024', published_date) or re.search(r'October.*2024', published_date):
        print(f"Skipping page {url}: Published in September 2024")
        return None  # Skip this page

    # Extract the title
    title_tag = soup.find('h1', class_='title title__primary mb-15 txt-bold')
    title = title_tag.get_text().strip() if title_tag else 'No Title'

    # Extract the full text
    full_text_div = soup.find('div', class_='txt-s4 article-text')
    full_text = full_text_div.get_text().strip() if full_text_div else 'No Content'

    # Skip if the title or full text is 'No Content'
    if  full_text == 'No Content':
        print(f"Skipping page {url}: Missing content")
        return None

    # Extract the source
    source_div = soup.find('div', class_='c-auto mb-15 txt-align-right txt-s2')
    source = source_div.get_text().strip() if source_div else 'No Source'

    # Extract the company name
    company_name_header = soup.find('h2', class_='m-0 txt-s1 txt-b5')
    company_name = company_name_header.get_text().strip() if company_name_header else 'No Company Name'

    # Extract additional information
    additional_infos = soup.find_all('div', class_='c-auto txt-align-right txt-bold')
    mean_consensus = additional_infos[0].get_text().strip() if len(additional_infos) > 0 else 'No Mean Consensus'
    number_of_analysts = additional_infos[1].get_text().strip() if len(additional_infos) > 1 else 'No Number of Analysts'
    last_closed_price = additional_infos[2].get_text().strip() if len(additional_infos) > 2 else 'No Last Closed Price'
    average_target_price = additional_infos[3].get_text().strip() if len(additional_infos) > 3 else 'No Average Target Price'
    spread = additional_infos[4].get_text().strip() if len(additional_infos) > 4 else 'No Spread'

    # Extract company information (handle case where not found)
    company_information_badges = soup.find_all('h2', class_='m-0 badge txt-b5 txt-s1')
    company_name_short = company_information_badges[0].get_text().strip() if len(company_information_badges) > 0 else 'No Content'
    company_id = company_information_badges[1].get_text().strip() if len(company_information_badges) > 1 else 'No Content'

    # Extract industry information
    industry_badges = soup.find_all('h2', class_='m-0 txt-b5 txt-s1')
    industry_general = industry_badges[0].get_text().strip() if len(industry_badges) > 0 else 'No Industry General'
    industry = industry_badges[1].get_text().strip() if len(industry_badges) > 1 else 'No industry tag'
    
    # Return the results as a dictionary
    return {
        'url': url,
        'title': title,
        'published_date': published_date,
        'full_text': full_text,
        'source': source,
        'company_name': company_name,
        'company_name_short': company_name_short,
        'company_id': company_id,
        'industry_general': industry_general,
        'industry': industry,
        'mean_consensus': mean_consensus,
        'number_of_analysts': number_of_analysts,
        'last_closed_price': last_closed_price,
        'average_target_price': average_target_price,
        'spread': spread
    }

# Modify this function to pass the session as well
async def scrape_all_recommendations(urls, session):
    data = []
    
    for url in urls:
        try:
            recommendation_data = await scrape_recommendation_page(url, session)
            if recommendation_data:  # Only add if not None (i.e., not skipped)
                data.append(recommendation_data)
                print(f"Scraped: {recommendation_data['title']}")
        except Exception as e:
            print(f"Error scraping page {url}: {e}")
            await asyncio.sleep(2)
    
    return data

# Example Usage of Part 3:
# Call this function as part of the async event loop
async def main_scrape(urls, session):
    recommendation_data = await scrape_all_recommendations(urls, session)

    # Convert the data into a DataFrame
    df = pd.DataFrame(recommendation_data)
    return df


In [56]:
# Example Usage of Part 3:
recommendation_data = scrape_all_recommendations(all_recommendation_urls,session)

# Convert the data into a DataFrame
df = pd.DataFrame(recommendation_data)

{'PHPSESSID': '5ulfu1fi9rfvhii8646tmedvob'}
{'PHPSESSID': '5ulfu1fi9rfvhii8646tmedvob'}
Skipping page https://www.marketscreener.com/quote/stock/APOGEE-ENTERPRISES-INC-8400/news/Apogee-Enterprises-Inc-Reports-Earnings-Results-for-the-Second-Quarter-and-Six-Months-Ended-August-48002209/: Published in September 2024
{'PHPSESSID': '5ulfu1fi9rfvhii8646tmedvob', 'pv_r0_rand': '10', 'pv_r0_date': '2024-10-05', 'pv_r0': '1'}
Skipping page https://www.marketscreener.com/quote/stock/SPIRIT-AIRLINES-INC-39143167/news/Spirit-Airlines-shares-plunge-on-report-of-potential-bankruptcy-filing-48001261/: Published in September 2024
{'PHPSESSID': '5ulfu1fi9rfvhii8646tmedvob', 'pv_r0_rand': '10', 'pv_r0_date': '2024-10-05', 'pv_r0': '2'}
Skipping page https://www.marketscreener.com/quote/stock/BARNES-GROUP-INC-11749/news/Exclusive-Apollo-Global-in-talks-to-buy-aerospace-parts-maker-Barnes-Group-sources-say-47997312/: Published in September 2024
{'PHPSESSID': '5ulfu1fi9rfvhii8646tmedvob', 'pv_r0_rand': '1

In [57]:
# Save the DataFrame to a CSV file
df.to_csv('/Users/oskarroeske/Masterthesis/scraped_data_us_market/analyst_recommendations_login_test2.csv', index=False)
print("Data saved")

Data saved
