In [75]:
!pip install requests beautifulsoup4 pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [87]:
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_recommendation_urls_from_page(url):
    """Collect the stock-news article URLs linked from one listing page.

    Downloads ``url``, parses it with BeautifulSoup and keeps every
    ``<a href>`` whose target contains both ``/quote/stock/`` and ``/news/``
    but not ``/news/hot-news/``.

    The HTTP status is intentionally not checked: a page that yields no
    matching links simply produces an empty list, which callers can use as an
    "end of pagination" signal.

    Args:
        url: Address of the listing page to download.

    Returns:
        list[str]: Absolute article URLs in document order, each listed once.
        Empty when the page contains no matching links.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    site_root = 'https://www.marketscreener.com'
    headers = {'User-Agent': 'Mozilla/5.0'}
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find all recommendation links (adjust the HTML structure if necessary).
    # A dict is used as an insertion-ordered set so that an article linked
    # several times on the page is only returned once.
    recommendation_urls = {}
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Only include links with '/quote/stock/' and '/news/' but exclude '/news/hot-news/'
        if '/quote/stock/' in href and '/news/' in href and '/news/hot-news/' not in href:
            # urljoin resolves site-relative hrefs against the site root and
            # leaves already-absolute hrefs untouched (plain concatenation
            # would glue the root in front of a full URL).
            recommendation_urls[urljoin(site_root, href)] = None

    return list(recommendation_urls)

def get_all_recommendation_urls(base_url, cf_param, max_pages=100, delay=2):
    """Walk the paginated listing and gather every recommendation URL.

    Requests ``{base_url}?p=<n>&cf=<cf_param>`` for n = 1 .. ``max_pages`` and
    stops early at the first page that yields no links.

    Args:
        base_url: Listing URL without a query string.
        cf_param: Value sent as the ``cf`` query parameter. It is inserted
            verbatim (not URL-encoded).
        max_pages: Highest page number to request.
        delay: Seconds to pause between two consecutive page requests.

    Returns:
        list[str]: Article URLs in first-seen order. A URL that shows up on
        more than one page is only returned once.
    """
    # dict used as an insertion-ordered set: listing pages can overlap, and
    # every duplicate would otherwise be scraped again later.
    all_recommendation_urls = {}

    for p in range(1, max_pages + 1):
        if p > 1:
            # Be polite to the server: pause between requests, but do not
            # waste time sleeping after the final one.
            time.sleep(delay)

        # Correctly formatted URL with both p and cf parameters
        page_url = f"{base_url}?p={p}&cf={cf_param}"
        print(f"Scraping page {p}: {page_url}")
        recommendation_urls = get_recommendation_urls_from_page(page_url)

        # If no URLs are found on this page, stop the loop
        if not recommendation_urls:
            print(f"No more recommendations found at page {p}. Stopping.")
            break

        all_recommendation_urls.update(dict.fromkeys(recommendation_urls))

    return list(all_recommendation_urls)



In [88]:
# Part 1 demo: crawl the first two listing pages of the recommendations feed.
base_url = 'https://www.marketscreener.com/news/companies/recommandations/'
# NOTE(review): opaque value copied from the site's URL; presumably it encodes
# the listing filter. Confirm that it does not expire between sessions.
cf_param = (
    'aVQwZTQzL1hkU0JOTloyNTNWTERkd2VZaU85L21hdjAyd2o1UFlhNnF1UEJHZkdOcjlqZG50WllEL25yUGZkd3g3Q3FYY1lIeE4vVXlDNkJUbExWRzVtNmtMV09wdkc3eW51R2VYY3FmaDg9'
)
all_recommendation_urls = get_all_recommendation_urls(base_url, cf_param, max_pages=2)

total_urls_found = len(all_recommendation_urls)
print(f"Total recommendation URLs found: {total_urls_found}")


Scraping page 1: https://www.marketscreener.com/news/companies/recommandations/?p=1&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkd2VZaU85L21hdjAyd2o1UFlhNnF1UEJHZkdOcjlqZG50WllEL25yUGZkd3g3Q3FYY1lIeE4vVXlDNkJUbExWRzVtNmtMV09wdkc3eW51R2VYY3FmaDg9
Scraping page 2: https://www.marketscreener.com/news/companies/recommandations/?p=2&cf=aVQwZTQzL1hkU0JOTloyNTNWTERkd2VZaU85L21hdjAyd2o1UFlhNnF1UEJHZkdOcjlqZG50WllEL25yUGZkd3g3Q3FYY1lIeE4vVXlDNkJUbExWRzVtNmtMV09wdkc3eW51R2VYY3FmaDg9
Total recommendation URLs found: 172


In [82]:
def _text_or_default(tag, default):
    """Return the tag's whitespace-stripped text, or ``default`` if the tag was not found."""
    return tag.get_text().strip() if tag else default


def scrape_recommendation_page(url):
    """Download one article page and extract its headline data.

    Every field falls back to a placeholder string when the corresponding
    element is missing from the page, so the returned dict always has the
    same keys.

    Args:
        url: Address of the article page.

    Returns:
        dict: Keys ``url``, ``title``, ``published_date``, ``full_text``,
        ``source``, ``company_name``, ``company_name_short``, ``company_id``,
        ``industry_general`` and ``industry``; all values are strings.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into a row that
    # consists only of placeholders.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # NOTE: a multi-class string passed as class_ is compared against the
    # exact value of the class attribute, so the order of the class names in
    # the selectors below is significant.

    # Company badges: first badge is the short name, second the identifier.
    company_information = [
        badge.get_text().strip()
        for badge in soup.find_all('h2', class_='m-0 badge txt-b5 txt-s1')
    ]
    # Industry badges: first is the general sector, second the specific industry.
    # The list is left empty when nothing is found so that the field-specific
    # placeholders below are actually used.
    industry_information = [
        badge.get_text().strip()
        for badge in soup.find_all('h2', class_='m-0 txt-b5 txt-s1')
    ]

    # Return the results as a dictionary
    return {
        'url': url,
        'title': _text_or_default(
            soup.find('h1', class_='title title__primary mb-15 txt-bold'), 'No Title'),
        'published_date': _text_or_default(
            soup.find('div', class_='c-6 mb-15'), 'No Date'),
        'full_text': _text_or_default(
            soup.find('div', class_='txt-s4 article-text'), 'No Content'),
        'source': _text_or_default(
            soup.find('div', class_='c-auto mb-15 txt-align-right txt-s2'), 'No Source'),
        'company_name': _text_or_default(
            soup.find('h2', class_='m-0 txt-s1 txt-b5'), 'No Company Name'),
        'company_name_short': company_information[0] if len(company_information) > 0 else 'No Content',
        'company_id': company_information[1] if len(company_information) > 1 else 'No Content',
        'industry_general': industry_information[0] if len(industry_information) > 0 else 'No Industry General',
        'industry': industry_information[1] if len(industry_information) > 1 else 'No industry tag',
    }

def scrape_all_recommendations(urls, delay=2):
    """Scrape every article in ``urls`` and collect the results.

    The crawl is best-effort: a URL whose scrape raises is reported on stdout
    and skipped, and the remaining URLs are still processed.

    Args:
        urls: Iterable of article URLs.
        delay: Seconds to pause between two consecutive requests.

    Returns:
        list[dict]: One dict per successfully scraped URL, in input order.
    """
    data = []

    for index, url in enumerate(urls):
        if index:
            # Be polite to the server: pause between requests, but do not
            # waste time sleeping after the final one.
            time.sleep(delay)
        try:
            recommendation_data = scrape_recommendation_page(url)
            data.append(recommendation_data)
            print(f"Scraped: {recommendation_data['title']}")
        except Exception as e:
            # Deliberately broad: one broken page must not abort the whole run.
            print(f"Failed to scrape {url}: {e}")

    return data


In [85]:
# Example usage of Part 3: fetch and parse every article URL held in
# `all_recommendation_urls`. One HTTP request is made per URL with a pause
# between requests, so this cell is slow for long lists.
recommendation_data = scrape_all_recommendations(all_recommendation_urls)

# Convert the list of per-article dicts into a DataFrame
# (one row per scraped article, one column per dict key).
df = pd.DataFrame(recommendation_data)

Scraped: The Children's Place, Inc. Reports Earnings Results for the Second Quarter and Six Months Ended August 03, 2024
Scraped: Meme stock GameStop slumps as revenue drop fans turnaround doubts
Scraped: Designer Brands Inc. Revises Earnings Guidance for the Full Year 2024
Scraped: Viva Leisure Limited signed a letter of intent to acquire 34% stake in Boutique Fitness Studio from Xponential Fitness, Inc. for AUD 2 million.
Scraped: Manchester United plc Provides Earnings Guidance for the Fiscal Year 2025
Scraped: MultiPlan Corporation Announces Resignation of Glenn R. August as Member of the Board of Directors
Scraped: Ex-dividend day
Scraped: Ex-dividend day
Scraped: Hybar LLC announced that it has received funding from Quanta Services, Inc.
Scraped: Vera Bradley, Inc. Provides Consolidated Earnings Guidance for the Fiscal Year Ending February 1, 2025
Scraped: Femasys, Inc. Secures Strategic Distribution Partnerships for Commercialization of FemaSeed for over $1.3 Million in Spanish 

In [86]:
# Save the DataFrame to a CSV file. The destination is named once so that the
# confirmation message cannot drift from the file that is actually written.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
output_csv = Path('/Users/oskarroeske/Masterthesis/scraped_data/analyst_recommendations_test3.csv')
# to_csv does not create missing folders, so make sure the target directory exists.
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)
print(f"Data saved to '{output_csv.name}'.")

Data saved to 'analyst_recommendations_test3.csv'.
