In [2]:
!pip install 'lxml[html_clean]' requests beautifulsoup4 pandas python-dotenv requests-html

Collecting pyee<12.0.0,>=11.0.0 (from pyppeteer>=0.0.14->requests-html)
  Using cached pyee-11.1.1-py3-none-any.whl.metadata (2.8 kB)
Using cached pyee-11.1.1-py3-none-any.whl (15 kB)
Installing collected packages: pyee
  Attempting uninstall: pyee
    Found existing installation: pyee 12.0.0
    Uninstalling pyee-12.0.0:
      Successfully uninstalled pyee-12.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
playwright 1.47.0 requires pyee==12.0.0, but you have pyee 11.1.1 which is incompatible.[0m[31m
[0mSuccessfully installed pyee-11.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import time
import asyncio
import pandas as pd
from dotenv import load_dotenv
import os
import re

# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): python-dotenv presumably leaves variables that already exist
# in the environment untouched unless override=True is passed, and USERNAME
# is predefined on some systems (e.g. Windows) — confirm that the .env value
# is the one actually picked up on the target machine.
load_dotenv()

# Read the login credentials. os.getenv returns None for an unset variable,
# so either name may be None when the .env file is missing or incomplete.
username = os.getenv('USERNAME')  # Make sure to define your credentials in a .env file
password = os.getenv('PASSWORD')



# Async function to create session and login
async def create_session_and_login():
    """Open an ``AsyncHTMLSession`` and submit the MarketScreener login form.

    The module-level ``username`` and ``password`` values are posted to the
    login URL, the response is rendered, and a success or failure message is
    printed depending on whether the raw response text contains "logout".
    The coroutine then waits 10 seconds before handing the session back.

    Returns:
        The ``AsyncHTMLSession`` that issued the login request. It is
        returned whether or not the login check passed.
    """
    browser_headers = {'User-Agent': 'Mozilla/5.0'}
    # Field names must match the inputs of the site's login form.
    credentials = dict(login=username, password=password)

    http = AsyncHTMLSession()
    login_response = await http.post(
        "https://www.marketscreener.com/login/",
        data=credentials,
        headers=browser_headers,
    )

    # Execute the page's JavaScript.
    # NOTE(review): the check below reads ``.text`` (the raw POST response),
    # so this render presumably has no influence on it — confirm whether the
    # step is needed at all.
    await login_response.html.arender()

    logged_in = "logout" in login_response.text
    print(
        "Login successful!"
        if logged_in
        else "Login failed. Check your credentials or login process."
    )

    # Pause before the first scraping request reaches the server.
    await asyncio.sleep(10)
    return http

# Async function to get recommendation URLs from a page
async def get_recommendation_urls_from_page(session, url):
    """Collect the stock-news article URLs linked from one listing page.

    Args:
        session: Session object used for the request; it must provide an
            awaitable ``get(url, headers=...)`` and a ``cookies.get_dict()``.
        url: Address of the listing page to scan.

    Returns:
        list[str]: Absolute article URLs in page order with duplicates
        removed. A link is kept only when its ``href`` contains both
        ``/quote/stock/`` and ``/news/`` and does not contain
        ``/news/hot-news/``.
    """
    from urllib.parse import urljoin  # local import keeps this block self-contained

    site_root = 'https://www.marketscreener.com'
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Fetch through the shared session so its cookies are sent with the request.
    page = await session.get(url, headers=headers)

    # Execute the page's JavaScript so dynamically inserted links are present.
    # NOTE(review): arender() presumably re-loads the URL inside headless
    # Chromium without the session's cookies — confirm that the rendered
    # markup still reflects the logged-in state.
    await page.html.arender()

    soup = BeautifulSoup(page.html.html, 'html.parser')

    # Debug aid: show only the cookie *names*. The values include the session
    # id and auth token and must not end up in saved notebook output.
    print(f"Session cookie names: {sorted(session.cookies.get_dict())}")

    recommendation_urls = []
    seen = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        is_stock_news = '/quote/stock/' in href and '/news/' in href
        if not is_stock_news or '/news/hot-news/' in href:
            continue
        # urljoin leaves absolute hrefs intact and resolves relative ones
        # against the site root, unlike plain string concatenation.
        full_url = urljoin(site_root, href)
        if full_url not in seen:  # the same article is often linked twice
            seen.add(full_url)
            recommendation_urls.append(full_url)

    return recommendation_urls

# Async function to loop through multiple pages and get recommendation URLs
async def get_all_recommendation_urls(base_url, cf_param, session, max_pages=1):
    """Walk the paginated listing and gather every recommendation URL found.

    Pages ``1..max_pages`` are requested in order as
    ``{base_url}?p=<page>&cf=<cf_param>``. The walk ends early at the first
    page that yields no URLs; otherwise a 2-second pause follows each page.

    Args:
        base_url: Listing URL without a query string.
        cf_param: Value inserted verbatim as the ``cf`` query parameter.
        session: Session handed through to ``get_recommendation_urls_from_page``.
        max_pages: Highest page number to request (default 1).

    Returns:
        list: URLs of all visited pages, concatenated in page order.
    """
    collected = []
    page_number = 1

    while page_number <= max_pages:
        listing_url = f"{base_url}?p={page_number}&cf={cf_param}"
        print(f"Scraping page {page_number}: {listing_url}")

        found = await get_recommendation_urls_from_page(session, listing_url)

        # An empty page marks the end of the listing.
        if not found:
            print(f"No more recommendations found at page {page_number}. Stopping.")
            return collected

        collected += found
        # Throttle consecutive listing requests.
        await asyncio.sleep(2)
        page_number += 1

    return collected

# Async function to scrape individual recommendation pages
async def scrape_recommendation_page(url, session):
    """Fetch one article page and extract its recommendation fields.

    Args:
        url: Address of the article page.
        session: Session object providing an awaitable ``get(url, headers=...)``.

    Returns:
        dict | None: One record with the keys ``url``, ``title``,
        ``published_date``, ``full_text``, ``source``, ``company_name``,
        ``company_name_short``, ``company_id``, ``industry_general``,
        ``industry``, ``mean_consensus``, ``number_of_analysts``,
        ``last_closed_price``, ``average_target_price`` and ``spread``.
        Fields missing from the page carry a ``'No ...'`` placeholder.
        ``None`` is returned when the page cannot be fetched, when its
        published date falls in September or October 2024, or when the
        article body is missing.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # Fetch through the shared session so its cookies are sent.
        page = await session.get(url, headers=headers)

        # Execute the page's JavaScript before parsing.
        await page.html.arender()

        soup = BeautifulSoup(page.html.html, 'html.parser')
    except Exception as e:
        print(f"Error fetching page {url}: {e}")
        return None

    published_date = _first_text(soup, 'div', 'c-6 mb-15', 'No Date')

    # Articles dated September or October 2024 are excluded, in either the
    # "Month ... 2024" or the "dd.mm.2024" notation.
    excluded_date_patterns = (
        r'September.*2024',
        r'\d{2}\.09\.2024',
        r'\d{2}\.10\.2024',
        r'October.*2024',
    )
    if any(re.search(pattern, published_date) for pattern in excluded_date_patterns):
        print(f"Skipping page {url}: Published in September or October 2024")
        return None

    title = _first_text(soup, 'h1', 'title title__primary mb-15 txt-bold', 'No Title')
    full_text = _first_text(soup, 'div', 'txt-s4 article-text', 'No Content')

    # A page without an article body is useless; a missing title is tolerated.
    if full_text == 'No Content':
        print(f"Skipping page {url}: Missing content")
        return None

    source = _first_text(soup, 'div', 'c-auto mb-15 txt-align-right txt-s2', 'No Source')
    company_name = _first_text(soup, 'h2', 'm-0 txt-s1 txt-b5', 'No Company Name')

    # Consensus metrics are read by position. Every field always receives a
    # value, so pages with no (or only some) metric blocks no longer raise
    # UnboundLocalError / IndexError and get dropped by the caller.
    (mean_consensus, number_of_analysts, last_closed_price,
     average_target_price, spread) = _texts_with_defaults(
        soup.find_all('div', class_='c-auto txt-align-right txt-bold'),
        (
            'No Mean Consensus',
            'No Number of Analysts',
            'No Last Closed price',
            'No Average Target Price',
            'No Spread',
        ),
        replace_blank=True,
    )

    company_name_short, company_id = _texts_with_defaults(
        soup.find_all('h2', class_='m-0 badge txt-b5 txt-s1'),
        ('No Content', 'No Content'),
    )

    industry_general, industry = _texts_with_defaults(
        soup.find_all('h2', class_='m-0 txt-b5 txt-s1'),
        ('No Industry General', 'No industry tag'),
    )

    return {
        'url': url,
        'title': title,
        'published_date': published_date,
        'full_text': full_text,
        'source': source,
        'company_name': company_name,
        'company_name_short': company_name_short,
        'company_id': company_id,
        'industry_general': industry_general,
        'industry': industry,
        'mean_consensus': mean_consensus,
        'number_of_analysts': number_of_analysts,
        'last_closed_price': last_closed_price,
        'average_target_price': average_target_price,
        'spread': spread
    }


def _first_text(soup, tag_name, css_class, default):
    """Return the stripped text of the first matching tag, or ``default`` if none matches."""
    tag = soup.find(tag_name, class_=css_class)
    return tag.get_text().strip() if tag else default


def _texts_with_defaults(tags, defaults, replace_blank=False):
    """Return one stripped tag text per entry of ``defaults``, matched by position.

    A position without a tag yields its default; with ``replace_blank`` set,
    a tag whose text is empty yields the default as well.
    """
    texts = [tag.get_text().strip() for tag in tags]
    values = []
    for position, default in enumerate(defaults):
        if position >= len(texts) or (replace_blank and not texts[position]):
            values.append(default)
        else:
            values.append(texts[position])
    return values

# Async function to scrape all recommendations
async def scrape_all_recommendations(urls, session):
    """Scrape each URL in turn and return the records that were produced.

    Pages for which ``scrape_recommendation_page`` returns a falsy value are
    left out. When scraping a page raises, the error is printed, the
    coroutine waits 2 seconds, and processing continues with the next URL.

    Args:
        urls: Iterable of article URLs.
        session: Session handed through to ``scrape_recommendation_page``.

    Returns:
        list: The scraped records, in the order of ``urls``.
    """
    collected = []

    for page_url in urls:
        try:
            record = await scrape_recommendation_page(page_url, session)
            if not record:
                # Skipped page (nothing returned) — nothing to store.
                continue
            collected.append(record)
            print(f"Scraped: {record['title']}")
        except Exception as exc:
            print(f"Error scraping page {page_url}: {exc}")
            # Wait briefly after a failure before moving on.
            await asyncio.sleep(2)

    return collected

# Async function to run the whole process
async def main(
    max_pages=1,
    output_path='/Users/oskarroeske/Masterthesis/scraped_data_us_market/analyst_recommendations_login_test3.csv',
):
    """Run the full pipeline: log in, collect article URLs, scrape them, save a CSV.

    Args:
        max_pages: Number of listing pages to walk (default 1).
        output_path: Destination of the CSV file. The default is the path
            this notebook was written for; pass another path (ideally a
            relative one) when running on a different machine.

    Returns:
        None. The scraped records are written to ``output_path`` and the
        first rows are printed as a preview.
    """
    # Opaque ``cf`` query value copied from the site's listing URL; it is
    # passed through unchanged. NOTE(review): presumably encodes the selected
    # time frame, judging by the variable name — confirm.
    timeframe_cf_param = 'aVQwZTQzL1hkU0JOTloyNTNWTERkK0dzUlJtNWk0VjdHaHhrMk9LOXVSUkRpMXBLa3o2b0xhcHNzT3ZwdnF3VDhEbjhTelpKMlNMcitDNFNORE5aT2REL0NsZWlENytrK2EySElPUTh6U3R6MDhmdm10UkdPSUtmQldWTnhvVnY'
    base_url = 'https://www.marketscreener.com/news/companies/recommandations/'

    # Create session and log in
    session = await create_session_and_login()

    try:
        # Get all recommendation URLs
        all_recommendation_urls = await get_all_recommendation_urls(
            base_url, timeframe_cf_param, session, max_pages=max_pages
        )

        # Scrape all recommendation pages
        recommendation_data = await scrape_all_recommendations(all_recommendation_urls, session)
    finally:
        # Always release the session (and the headless browser that
        # rendering started), even when scraping fails part-way through.
        await session.close()

    # Convert the data into a DataFrame
    df = pd.DataFrame(recommendation_data)
    print(df.head())  # Preview the DataFrame

    # Make sure the target directory exists, so a missing folder cannot
    # throw away the scraped data at the very last step.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the DataFrame to a CSV file
    df.to_csv(output_path, index=False)
    print("Data saved")

# Execute the main function.
# NOTE(review): a bare top-level ``await`` presumably relies on the notebook
# kernel running the cell inside its own event loop (IPython autoawait) — in a
# plain script this would need ``asyncio.run(main())`` instead; confirm.
await main()


Login successful!
Scraping page 1: https://www.marketscreener.com/news/companies/recommandations/?p=1&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkK0dzUlJtNWk0VjdHaHhrMk9LOXVSUkRpMXBLa3o2b0xhcHNzT3ZwdnF3VDhEbjhTelpKMlNMcitDNFNORE5aT2REL0NsZWlENytrK2EySElPUTh6U3R6MDhmdm10UkdPSUtmQldWTnhvVnY
{'PHPSESSID': '<redacted>', 'zb_auth': '<redacted>', 'zb_membre': '1', 'pv_r0': '702', 'pv_r0_date': '2024-10-14', 'hmv': '<redacted>', 'pv_r0_rand': '10'}
Skipping page https://www.marketscreener.com/quote/stock/FASTENAL-COMPANY-4901/news/Fastenal-Company-Reports-Earnings-Results-for-the-Third-Quarter-and-Nine-Months-Ended-September-30-48059209/: Published in September 2024
Skipping page https://www.marketscreener.com/quote/stock/ENNIS-INC-12395/news/Detachement-de-48009410/: Published in September 2024
Skipping page http

Task exception was never retrieved
future: <Task finished name='Task-41865' coro=<Connection._async_send() done, defined at /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyppeteer/connection.py:69> exception=RuntimeError('This event loop is already running')>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyppeteer/connection.py", line 73, in _async_send
    await self.connection.send(msg)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/websockets/legacy/protocol.py", line 647, in send
    await self.write_frame(True, opcode, data)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/websockets/legacy/protocol.py", line 1213, in write_frame
    self.write_frame_sync(fin, opcode, data)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/websockets/legacy/protocol.py", l

RuntimeError: This event loop is already running