In [2]:
""" !pip install beautifulsoup4
!pip install numpy 
!pip install pandas
!pip install requests """



In [8]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests

In [9]:
# File Paths
input_csv_file = "KnowledgeURL.csv"  # Path to CSV with URLs
output_txt_file = "../data/cleaned_data/combined_web_content.txt"  # Output text file
input_file = output_txt_file  # For cleaning step, use the previous output file
cleaned_output_file = "../data/cleaned_data/cleaned_combined_web_content.txt"  # Cleaned output text file

In [11]:
# Function to extract main content from a webpage, prioritizing the <main> tag
def extract_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        
        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # First try to extract content from the <main> tag
        main = soup.find('main')
        if main:
            text_content = main.get_text(separator="\n")
            return text_content
        
        # If no <main> tag, try to find <article> tag (common for blogs)
        article = soup.find('article')
        if article:
            text_content = article.get_text(separator="\n")
            return text_content
        
        # Try to find a common main content div by class or id
        main_content = soup.find('div', {'class': 'main-content'}) or soup.find('div', {'id': 'main-content'})
        if main_content:
            text_content = main_content.get_text(separator="\n")
            return text_content

        # Fallback: Extract text from all <p> tags if no specific containers found
        paragraphs = soup.find_all('p')
        text_content = '\n'.join([p.get_text() for p in paragraphs])
        
        return text_content
    except Exception as e:
        print(f"Failed to fetch or parse {url}: {e}")
        return ""


In [12]:
# Function to read URLs from CSV and extract text content
def extract_content_from_urls(csv_file):
    df = pd.read_csv(csv_file, header=None)  # Read CSV without headers initially
    if df.shape[1] == 1:  # If there's only one column, assume it's the URL
        df.columns = ['URL']  # Add header dynamically
    urls = df['URL']
    
    all_text_content = ""
    for url in urls:
        print(f"Extracting content from: {url}")
        text_content = extract_text_from_url(url)
        all_text_content += text_content + "\n\n"
    
    return all_text_content


In [13]:
# Save the combined text into a .txt file
def save_text_to_file(text, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)

# Extract content from URLs and save to output file
combined_text = extract_content_from_urls(input_csv_file)
save_text_to_file(combined_text, output_txt_file)

print(f"Content extracted and saved to {output_txt_file}")


Extracting content from: URL
Failed to fetch or parse URL: Invalid URL 'URL': No scheme supplied. Perhaps you meant https://URL?
Extracting content from: https://www.savills.co.uk/blog/article/363940/residential-property/top-tips-for-landlords-and-international-students-in-london-s-rental-market.aspx
Extracting content from: https://www.timeout.com/london/property/how-to-navigate-the-london-student-housing-market
Extracting content from: https://www.zoopla.co.uk/discover/renting/complete-guide-to-student-renting-in-london/
Failed to fetch or parse https://www.zoopla.co.uk/discover/renting/complete-guide-to-student-renting-in-london/: 403 Client Error: Forbidden for url: https://www.zoopla.co.uk/discover/renting/complete-guide-to-student-renting-in-london/
Extracting content from: https://www.londonnest.com/blog/tips-and-trick-to-find-your-student-accommodation-in-london/
Extracting content from: https://rib.co.uk/management-professional/property-advice/the-students-guide-to-finding-acc

In [14]:
# Clean empty lines from a file and save the cleaned version
def clean_empty_lines(input_file, output_file):
    # Open the input file and read its lines
    with open(input_file, 'r') as file:
        lines = file.readlines()

    # Filter out empty lines (strip removes any surrounding whitespace)
    cleaned_lines = [line for line in lines if line.strip()]

    # Write the cleaned lines to the output file
    with open(output_file, 'w') as file:
        file.writelines(cleaned_lines)

    print(f"Cleaned content written to {output_file}")

# Clean the combined text file
clean_empty_lines(input_file, cleaned_output_file)


Cleaned content written to data/cleaned_combined_web_content.txt


In [15]:
# Fetch article content from a local file
def fetch_article_content(file_path):
    try:
        # Open and read the file content
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Process the content similar to how you processed HTML content
        # Split into lines, remove extra spaces
        lines = (line.strip() for line in content.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text
    except Exception as e:
        print(f"Failed to read or process the file {file_path}: {e}")
        return ""

# Fetch and process the cleaned content
file_content = fetch_article_content(cleaned_output_file)
print(f"Fetched {len(file_content)} characters from the file.")
print("First 500 characters:")
print(file_content[:500])


Fetched 146548 characters from the file.
First 500 characters:
The Savills Blog
Top tips for landlords and international students in London’s rental market
09 July 2024
Blog Article
Contacts & Related Articles
At the start of summer, international students in their thousands begin searching for a property to rent in London. Many have never lived away from home before and navigating the process of finding somewhere isn’t always easy.
Meanwhile, landlords who’ve previously let to corporate or family tenants might see themselves letting their property to a stu


In [16]:
# Split the file content into data chunks and create a DataFrame
data = []
lines = file_content.split("\n")
for line in lines:
    if line:  # Check if the line is not empty
        data.append({"content": line})

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the DataFrame
df.head(3)

# Print the total number of rows in the DataFrame
print(f"Total number of rows in the DataFrame: {len(df)}")


Total number of rows in the DataFrame: 2893
