In [32]:
!pip install requests beautifulsoup4 pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [33]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

def get_recommendation_urls_from_page(url):
    """Fetch one listing page and return the article URLs it links to.

    A link is kept when its ``href`` contains both ``'/quote/stock/'`` and
    ``'/news/'`` and does not contain ``'/news/hot-news/'``.

    Args:
        url: Address of the listing page to download.

    Returns:
        list[str]: Absolute article URLs in document order (duplicates are
        not removed). An empty list is returned when the server answers with
        an HTTP error status or when the page has no matching links.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
    """
    # Local import so this function does not depend on another cell's imports.
    from urllib.parse import urljoin

    site_root = 'https://www.marketscreener.com'
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Without a timeout a stalled connection would block the crawl forever.
    page = requests.get(url, headers=headers, timeout=30)
    if not page.ok:
        # An error/block page is not a listing; report it instead of parsing it.
        print(f"Request for {url} returned HTTP {page.status_code}; no links collected.")
        return []

    soup = BeautifulSoup(page.content, 'html.parser')

    recommendation_urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/quote/stock/' not in href or '/news/' not in href:
            continue
        if '/news/hot-news/' in href:
            continue
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the site root, unlike plain string concatenation.
        recommendation_urls.append(urljoin(site_root, href))

    return recommendation_urls

# Function to loop through multiple pages with correct URL formatting
def get_all_recommendation_urls(base_url, cf_param, max_pages=200):
    """Walk the paginated listing and collect every article URL once.

    Pages are requested as ``{base_url}?p={p}&cf={cf_param}`` for
    ``p = 1 .. max_pages``. The walk stops early at the first page that
    yields no URLs.

    Args:
        base_url: Listing URL without a query string.
        cf_param: Value sent as the ``cf`` query parameter on every page.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        list[str]: Collected URLs in first-seen order, without duplicates.
    """
    seen_urls = set()
    all_recommendation_urls = []

    for p in range(1, max_pages + 1):
        # Correctly formatted URL with both p and cf parameters
        page_url = f"{base_url}?p={p}&cf={cf_param}"
        print(f"Scraping page {p}: {page_url}")
        recommendation_urls = get_recommendation_urls_from_page(page_url)

        # If no URLs are found on this page, stop the loop
        if not recommendation_urls:
            print(f"No more recommendations found at page {p}. Stopping.")
            break

        # Keep only URLs not collected before, so the same article is not
        # scraped (and stored) more than once later on.
        for found_url in recommendation_urls:
            if found_url not in seen_urls:
                seen_urls.add(found_url)
                all_recommendation_urls.append(found_url)

        time.sleep(2)  # Be polite to the server by adding a small delay

    return all_recommendation_urls



In [34]:
# Example Usage of Part 1: collect the article URLs from the paginated listing.
# Opaque value sent as the `cf` query parameter on every page request.
# NOTE(review): presumably encodes the selected timeframe filter (going by the
# variable name) — confirm against the site.
timeframe_cf_param = 'aVQwZTQzL1hkU0JOTloyNTNWTERkNXRGd3RJV1VHQWQxOTlLZ09KcndJQ25IRVlDZDVLWFI4SVNaeThFc0JuVHBjNUMySmFEMG10d2pUSjY5UkdEV04ydlVWaEJSdDFsQThScHJzOG1FVjA9'

# Listing page that is walked page by page (?p=1, ?p=2, ...).
base_url = 'https://www.marketscreener.com/news/companies/recommandations/'
cf_param = timeframe_cf_param
# Requests at most 200 listing pages; stops earlier at the first page without links.
all_recommendation_urls = get_all_recommendation_urls(base_url, cf_param, max_pages=200)

print(f"Total recommendation URLs found: {len(all_recommendation_urls)}")


Scraping page 1: https://www.marketscreener.com/news/companies/recommandations/?p=1&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkNXRGd3RJV1VHQWQxOTlLZ09KcndJQ25IRVlDZDVLWFI4SVNaeThFc0JuVHBjNUMySmFEMG10d2pUSjY5UkdEV04ydlVWaEJSdDFsQThScHJzOG1FVjA9
Scraping page 2: https://www.marketscreener.com/news/companies/recommandations/?p=2&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkNXRGd3RJV1VHQWQxOTlLZ09KcndJQ25IRVlDZDVLWFI4SVNaeThFc0JuVHBjNUMySmFEMG10d2pUSjY5UkdEV04ydlVWaEJSdDFsQThScHJzOG1FVjA9
Scraping page 3: https://www.marketscreener.com/news/companies/recommandations/?p=3&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkNXRGd3RJV1VHQWQxOTlLZ09KcndJQ25IRVlDZDVLWFI4SVNaeThFc0JuVHBjNUMySmFEMG10d2pUSjY5UkdEV04ydlVWaEJSdDFsQThScHJzOG1FVjA9
Scraping page 4: https://www.marketscreener.com/news/companies/recommandations/?p=4&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkNXRGd3RJV1VHQWQxOTlLZ09KcndJQ25IRVlDZDVLWFI4SVNaeThFc0JuVHBjNUMySmFEMG10d2pUSjY5UkdEV04ydlVWaEJSdDFsQThScHJzOG1FVjA9
Scraping page 5: https://www.marketscreener.com/news/companies/recommand

In [35]:

# Function to scrape the title, published date, and other details

def _text_or_default(node, default):
    """Return the stripped text of a parsed HTML node, or `default` if it is None."""
    return node.get_text().strip() if node is not None else default


def _item_or_default(items, index, default):
    """Return ``items[index]`` when that position exists, otherwise `default`."""
    return items[index] if len(items) > index else default


def scrape_recommendation_page(url):
    """Download one article page and extract its fields.

    Args:
        url: Address of the article page.

    Returns:
        dict | None: A dict with the keys ``url``, ``title``,
        ``published_date``, ``full_text``, ``source``, ``company_name``,
        ``company_name_short``, ``company_id``, ``industry_general`` and
        ``industry``. Fields whose HTML element is missing carry a
        placeholder string (``'No Title'``, ``'No Date'``, ...).
        ``None`` is returned when the page cannot be fetched, answers with an
        HTTP error status, is dated September 2024, or has no article text.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Without a timeout a stalled connection would block the whole run.
        page = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller move on.
        print(f"Error fetching page {url}: {e}")
        return None

    if not page.ok:
        # Error/block pages carry no article; say so explicitly instead of
        # letting them surface later as "Missing content".
        print(f"Skipping page {url}: HTTP {page.status_code}")
        return None

    # Extract the published date
    published_date = _text_or_default(soup.find('div', class_='c-6 mb-15'), 'No Date')

    # Pages dated September 2024 are skipped; both the spelled-out month and
    # the dd.09.2024 format are recognised.
    if re.search(r'September.*2024', published_date) or re.search(r'\d{2}\.09\.2024', published_date):
        print(f"Skipping page {url}: Published in September 2024")
        return None  # Skip this page

    # Extract the title
    title = _text_or_default(
        soup.find('h1', class_='title title__primary mb-15 txt-bold'), 'No Title'
    )

    # Extract the full text
    full_text = _text_or_default(
        soup.find('div', class_='txt-s4 article-text'), 'No Content'
    )

    # Skip pages without article text (a missing title alone does not skip).
    if full_text == 'No Content':
        print(f"Skipping page {url}: Missing content")
        return None

    # Extract the source
    source = _text_or_default(
        soup.find('div', class_='c-auto mb-15 txt-align-right txt-s2'), 'No Source'
    )

    # Extract the company name
    company_name = _text_or_default(
        soup.find('h2', class_='m-0 txt-s1 txt-b5'), 'No Company Name'
    )

    # Company badges: first badge -> short name, second badge -> company id.
    company_information = [
        badge.get_text().strip()
        for badge in soup.find_all('h2', class_='m-0 badge txt-b5 txt-s1')
    ]
    company_name_short = _item_or_default(company_information, 0, 'No Content')
    company_id = _item_or_default(company_information, 1, 'No Content')

    # Industry headers: first -> general industry, second -> specific industry.
    industry_information = [
        badge.get_text().strip()
        for badge in soup.find_all('h2', class_='m-0 txt-b5 txt-s1')
    ]
    industry_general = _item_or_default(industry_information, 0, 'No Industry General')
    industry = _item_or_default(industry_information, 1, 'No industry tag')

    # Return the results as a dictionary
    return {
        'url': url,
        'title': title,
        'published_date': published_date,
        'full_text': full_text,
        'source': source,
        'company_name': company_name,
        'company_name_short': company_name_short,
        'company_id': company_id,
        'industry_general': industry_general,
        'industry': industry
    }


def scrape_all_recommendations(urls):
    """Scrape every URL in `urls` and collect the successfully scraped records.

    Args:
        urls: Iterable of article URLs.

    Returns:
        list[dict]: One record per URL for which the page scraper returned
        data, in input order. URLs that were skipped (scraper returned
        ``None``) or that raised an exception are left out.
    """
    data = []

    for url in urls:
        try:
            recommendation_data = scrape_recommendation_page(url)
            if recommendation_data:  # Only add if not None (i.e., not skipped)
                data.append(recommendation_data)
                print(f"Scraped: {recommendation_data['title']}")
        except Exception as e:
            # Report which URL failed and why instead of dropping it silently,
            # then pause briefly before continuing with the next URL.
            print(f"Error scraping {url}: {e}")
            time.sleep(2)

    return data



In [36]:
# Example Usage of Part 3: scrape every collected article URL.
# The result is a list of dicts, one per page that was not skipped.
recommendation_data = scrape_all_recommendations(all_recommendation_urls)

# Convert the data into a DataFrame (one row per scraped page)
df = pd.DataFrame(recommendation_data)

Skipping page https://www.marketscreener.com/quote/stock/SELECTQUOTE-INC-107310073/news/SelectQuote-Inc-Reports-Earnings-Results-for-the-Fourth-Quarter-and-Full-Year-Ended-June-30-2024-47867834/: Published in September 2024
Skipping page https://www.marketscreener.com/quote/stock/PRELUDE-THERAPEUTICS-INCO-112877919/news/Prelude-Therapeutics-SMARCA2-Degrader-PRT3789-Demonstrated-Promising-Initial-Clinical-Activity-and-47868818/: Published in September 2024
Skipping page https://www.marketscreener.com/quote/stock/ADOBE-INC-4844/news/Stocks-to-Watch-Adobe-RH-Singular-Genomics-Systems-Immuneering-47863207/: Published in September 2024
Skipping page https://www.marketscreener.com/quote/stock/NOVAVAX-INC-58256108/news/Novavax-Announces-Availability-of-Updated-Covid-19-Vaccine-with-Emergency-Use-Authorization-from-FDA-47869280/: Published in September 2024
Skipping page https://www.marketscreener.com/quote/stock/UPWORK-INC-46415520/news/Engine-Capital-Sends-Letter-to-Upwork-s-Board-of-Directo

In [37]:
# Save the DataFrame to a CSV file (index=False leaves the row index out).
# NOTE(review): the absolute path is specific to one machine; consider a
# configurable data directory so the notebook runs elsewhere.
# NOTE(review): the file name says "sept_oct_2023" — confirm it matches the
# period actually scraped.
df.to_csv('/Users/oskarroeske/Masterthesis/scraped_data/analyst_recommendations_sept_oct_2023.csv', index=False)
print("Data saved")

Data saved
