In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time
import re

def extract_date(title):
    """Return the ``(month, year)`` named in a report title.

    Args:
        title: Text of the report heading, e.g.
            ``"Matrix Office National Report - July 2024"``.

    Returns:
        A ``(month, year)`` tuple of strings, where ``month`` is the full
        English month name and ``year`` is the first standalone four-digit
        number in the title. Returns ``(None, None)`` when the title is
        empty, or when either a month or a year cannot be found.
    """
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    if not title:
        return None, None

    # A year is exactly four digits; the lookarounds stop a longer digit run
    # (such as a document ID like "59234") from being sliced into "5923".
    year = re.search(r'(?<!\d)\d{4}(?!\d)', title)
    if not year:
        return None, None

    for month in months:
        # Reject words that merely start with a month name ("Mayor",
        # "Marching") while still accepting run-together forms like "July2024".
        if re.search(month + r'(?![a-z])', title):
            return month, year.group()
    return None, None

# Scrape the Yardi Matrix downloads page for office reports: open each
# matching link in its own tab, collect the report's date, headline, blurb and
# PDF link, and export everything to a CSV file.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")

# Setup the Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# URL of the main page
main_url = "https://www.yardimatrix.com/Media/Downloads"

data = []

# The try/finally guarantees the headless browser is shut down even when
# navigation or scraping raises, so no orphaned Chrome process is left behind.
try:
    # Navigate to the main page
    driver.get(main_url)

    # Remember the listing tab explicitly; the order of window_handles is not
    # guaranteed, so index 0 is not a reliable way to find it again.
    main_window = driver.current_window_handle

    # Find all relevant links
    relevant_links = driver.find_elements(By.XPATH, "//a[contains(translate(text(), 'OFFICE', 'office'), 'office') and (contains(text(), 'National Report') or contains(text(), 'Office Market') or contains(text(), 'Office Report'))]")
    total_links = len(relevant_links)

    # Read every href up front so the loop works on plain strings rather than
    # on live elements of a page we keep switching away from.
    hrefs = [link.get_attribute('href') for link in relevant_links]

    # Process each link
    for index, href in enumerate(hrefs, 1):
        if not href:
            # An anchor without an href has nothing to open.
            print(f"Skipping link {index}/{total_links} - No href")
            continue

        print(f"Processing link {index}/{total_links}: {href}")

        try:
            # Open new tab. The URL is passed as a script argument instead of
            # being spliced into the JavaScript source, so quotes or other
            # special characters in it cannot break the script.
            driver.execute_script("window.open(arguments[0]);", href)

            # Switch to the new tab
            driver.switch_to.window(driver.window_handles[-1])

            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

            # The publication title only exists on report pages; its absence
            # means the link pointed somewhere else (e.g. an in-page anchor).
            try:
                title_element = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, "//h3[contains(@class, 'publication-downloadtitle')]"))
                )
                title = title_element.text
            except (NoSuchElementException, TimeoutException):
                print(f"Skipping {href} - Not a report page")
                continue

            month, year = extract_date(title)

            try:
                headline_element = driver.find_element(By.XPATH, "//span[@id='dnn_ctr905_View_genUC_lbl_Description']")
                headline = headline_element.text.strip()
            except NoSuchElementException:
                headline = "N/A"

            try:
                blurb_element = driver.find_element(By.XPATH, "//meta[@name='DESCRIPTION']")
                blurb = blurb_element.get_attribute('content').strip()
            except NoSuchElementException:
                blurb = "N/A"

            # Find the download link
            try:
                download_link = driver.find_element(By.XPATH, "//a[contains(@class, 'btn-active') and contains(text(), 'Download')]").get_attribute('href')
            except NoSuchElementException:
                download_link = "N/A"

            data.append({
                'Month': month,
                'Year': year,
                'Headline': headline,
                'Blurb': blurb,
                'PDF_Link': download_link
            })

            # Get the first four words of the blurb
            blurb_preview = ' '.join(blurb.split()[:4]) + '...' if blurb != "N/A" else "N/A"

            print(f"Completed {index}/{total_links}")
            print(f"Headline: {headline}")
            print(f"Blurb preview: {blurb_preview}")
            print("-" * 50)

        except Exception as e:
            # Best effort: one broken page must not abort the whole run.
            print(f"Error processing {href}: {e}")

        finally:
            # Close every tab except the listing page, then return to it, so
            # the next iteration always starts from a single known window.
            for handle in driver.window_handles:
                if handle != main_window:
                    driver.switch_to.window(handle)
                    driver.close()
            driver.switch_to.window(main_window)

        time.sleep(1)  # Be respectful to the server
finally:
    # Close the browser
    driver.quit()

# Create DataFrame
df = pd.DataFrame(data)

# Export to CSV
csv_filename = 'yardi_matrix_office_reports2.csv'
df.to_csv(csv_filename, index=False)

print(f"Data exported to {csv_filename}")

Processing link 1/75: https://www.yardimatrix.com/Media/Downloads#gp_12
Skipping https://www.yardimatrix.com/Media/Downloads#gp_12 - Not a report page
Processing link 2/75: https://www.yardimatrix.com/Publications/Download/File/5923-MatrixOfficeNationalReport-July2024
Completed 2/75
Headline: Loans Mature Into Shaky Office Market
Blurb preview: <b>Loans Mature Into Shaky...
--------------------------------------------------
Processing link 3/75: https://www.yardimatrix.com/Publications/Download/File/5779-MatrixOfficeNationalReport-June2024
Completed 3/75
Headline: Office Distress Slowly Plays Out
Blurb preview: <b>Office Distress Slowly Plays...
--------------------------------------------------
Processing link 4/75: https://www.yardimatrix.com/Publications/Download/File/5649-MatrixOfficeNationalReport-May2024
Completed 4/75
Headline: Office Struggles to Cover Debt Obligations
Blurb preview: <b>Office Struggles to Cover...
--------------------------------------------------
Processing l

In [17]:
# Read the CSV file
df = pd.read_csv('yardi_matrix_office_reports2.csv')

# Extract the headlines and convert them to a list.
# By default read_csv parses the "N/A" placeholder written by the scraper (and
# any blank cell) as NaN, which would put floats in the list and print "nan".
# Restoring the placeholder keeps every entry a string.
headlines = df['Headline'].fillna("N/A").tolist()

# Print the headlines, numbered from 1
for i, headline in enumerate(headlines, 1):
    print(f"{i}. {headline}")


1. Loans Mature Into Shaky Office Market
2. Office Distress Slowly Plays Out
3. Office Struggles to Cover Debt Obligations
4. Coworking Grows in the Suburbs
5. How Office Utilization Varies Among Markets
6. Office Valuations Slide Amid Turmoil
7. Office Outlook Remains Turbulent
8. Tricky Time for Office Loan Maturities
9. Conversion Activity Increases, but Roadblocks Remain
10. Placemaking Drives Suburban Success
11. Large Corporations Announce In-Person Work Mandates
12. Lab Space Grows, but Pace Slows
13. Medical office maintains stability
14. Tech Markets See Highest Share of Remote Work
15. Coworking Demand Continues to Rise
16. Office Conversions Move Forward
17. Maturing Loans Face Headwinds
18. Office Sector to See More Distress in 2023
19. Office Continues to Transform in 2023
20. Tech Slowdown to Hinder Office Recovery
21. Interest Rate Hikes Pose Another Challenge for Office Sector
22. Coworking Demand Driving Operator Expansion
23. Life Science Growth Continues in 2022
24. 