# SEOTieto Automation

## Credentials & Imports

In [29]:
import io
import json
import logging
import os

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

from drive_ops import get_drive_service, list_files_in_drive_folder, download_file_from_drive
# Load environment variables (DRIVE_FOLDER_ID etc.) from the local .env file.
load_dotenv('/Users/tis/foam/cdp/code/cfg/.env')


# Set up logging
logging.basicConfig(level=logging.INFO)

# Provide the path to your service account key file
key_path = "/Users/tis/foam/cdp/function/seo/Seotieto.json"
PROJECT_ID = 'seotieto'
DATASET_ID = 'SEOTieto'
# os.getenv returns None when the variable is unset; nothing here checks for that.
DRIVE_FOLDER_ID = os.getenv('DRIVE_FOLDER_ID')

# Initialize the Sheets and Drive API
SCOPES = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']
credentials = Credentials.from_service_account_file(key_path, scopes=SCOPES)
sheets_service = build('sheets', 'v4', credentials=credentials)
drive_service = build('drive', 'v3', credentials=credentials)

# Create credentials using the key file
# NOTE: this rebinds `credentials` to an instance created without `scopes`; the Sheets
# and Drive services above were already built with the scoped instance.
credentials = Credentials.from_service_account_file(
    key_path,
)

# Construct a BigQuery client object
# The project is taken from the key file (credentials.project_id), not from PROJECT_ID.
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# You can now use the `client` object to interact with BigQuery

INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0


## Loading CSVs from Drive to BigQuery

In [30]:
# Configure logging
# NOTE(review): logging.basicConfig() does nothing once the root logger already has
# handlers, and the setup cell has already called it. This format therefore only takes
# effect if that earlier call did not run (the recorded output of this cell still shows
# the default "INFO:root:" prefix).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to sanitize column names

def sanitize_column_names(columns):
    """Normalise column labels into lowercase, underscore-separated identifiers.

    Parameters:
    - columns: a pandas Index or Series of strings (anything exposing the `.str` accessor).

    Returns an object of the same pandas kind with each label transformed, in order:
    lowercased; spaces turned into underscores; every character that is neither a word
    character nor whitespace turned into an underscore; runs of underscores collapsed to
    one; leading/trailing underscores stripped; a leading run of digits prefixed with '_'.
    """
    lowered = columns.str.lower()
    spaced = lowered.str.replace(' ', '_')
    depunctuated = spaced.str.replace(r'[^\w\s]', '_', regex=True)
    collapsed = depunctuated.str.replace(r'__+', '_', regex=True)
    trimmed = collapsed.str.strip('_')
    # Done after stripping so the protective underscore before digits is not removed.
    return trimmed.str.replace(r'^(\d+)', r'_\1', regex=True)
# Function to generate schema from pandas DataFrame
def generate_schema(dataframe):
    """Derive a BigQuery schema from a DataFrame's column names and dtypes.

    Parameters:
    - dataframe: pandas DataFrame whose `dtypes` are inspected.

    Returns a list of `bigquery.SchemaField`, one per column in column order. Each field
    name is passed through `sanitize_column_names`; the type comes from the dtype's
    string form and falls back to 'STRING' for any dtype not in the table below.
    """
    bq_type_by_dtype = {
        'object': 'STRING',
        'int64': 'INTEGER',
        'float64': 'FLOAT',
        'bool': 'BOOLEAN',
        'datetime64[ns]': 'TIMESTAMP'
    }

    def _to_field(column_name, dtype):
        # sanitize_column_names works on pandas string containers, so wrap the single
        # label in a Series and unwrap the result.
        clean_name = sanitize_column_names(pd.Series(column_name)).iloc[0]
        return bigquery.SchemaField(clean_name, bq_type_by_dtype.get(str(dtype), 'STRING'))

    return [_to_field(name, dtype) for name, dtype in dataframe.dtypes.items()]

# Function to load data into BigQuery
def load_data_into_bigquery(bigquery_client, dataframe, dataset_id, table_id, schema):
    """Load a DataFrame into a BigQuery table, replacing the table's existing contents.

    Parameters:
    - bigquery_client: BigQuery client; its `project` is used as the table's project.
    - dataframe: pandas DataFrame to upload.
    - dataset_id: ID of the destination dataset.
    - table_id: ID of the destination table within that dataset.
    - schema: list of schema fields passed to the load job configuration.

    Returns None. Blocks until the load job finishes.
    """
    # Client.dataset() is deprecated; build the reference explicitly. The client's own
    # project is used, which is what Client.dataset() defaulted to.
    table_ref = bigquery.DatasetReference(bigquery_client.project, dataset_id).table(table_id)
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        write_disposition='WRITE_TRUNCATE'
    )
    job = bigquery_client.load_table_from_dataframe(dataframe, table_ref, job_config=job_config)
    # Wait for completion so the success message is only logged after the load is done.
    job.result()
    logging.info(f"Data loaded into BigQuery table {dataset_id}.{table_id}")

# Main function
def main():
    """Load every file in the configured Drive folder into its own BigQuery table.

    For each file: download it, parse it as CSV, sanitise the column names, derive a
    schema from the resulting dtypes and load it into DATASET_ID. The table name is the
    part of the file name before its first '.'.
    """
    bq_credentials = service_account.Credentials.from_service_account_file(key_path)
    bigquery_client = bigquery.Client(credentials=bq_credentials, project=PROJECT_ID)
    drive_service = get_drive_service(key_path)

    for drive_file in list_files_in_drive_folder(drive_service, DRIVE_FOLDER_ID):
        file_name, file_id = drive_file['name'], drive_file['id']
        logging.info(f"Processing file: {file_name}")

        csv_stream = download_file_from_drive(drive_service, file_id, file_name)
        csv_stream.seek(0)  # rewind before handing the stream to pandas
        frame = pd.read_csv(csv_stream)

        # Column names are sanitised on the frame itself so the uploaded data and the
        # generated schema agree.
        frame.columns = sanitize_column_names(frame.columns)

        load_data_into_bigquery(
            bigquery_client,
            frame,
            DATASET_ID,
            file_name.split('.')[0],
            generate_schema(frame),
        )

    logging.info("All files loaded to BigQuery successfully.")

# The recorded output of this cell shows main() running when the cell is executed,
# i.e. `__name__` is "__main__" in the notebook kernel.
if __name__ == "__main__":
    main()

INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:root:Processing file: eezy_internal_all.csv


Download eezy_internal_all.csv: 100%.


INFO:root:Data loaded into BigQuery table SEOTieto.eezy_internal_all
INFO:root:Processing file: divadiaz_issues_overview_report.csv


Download divadiaz_issues_overview_report.csv: 100%.


INFO:root:Data loaded into BigQuery table SEOTieto.divadiaz_issues_overview_report
INFO:root:Processing file: divadiaz_internal_html.csv


Download divadiaz_internal_html.csv: 100%.


INFO:root:Data loaded into BigQuery table SEOTieto.divadiaz_internal_html
INFO:root:Processing file: grmservices_issues_overview_report.csv


Download grmservices_issues_overview_report.csv: 100%.


INFO:root:Data loaded into BigQuery table SEOTieto.grmservices_issues_overview_report
INFO:root:Processing file: grmservices_internal_html.csv


Download grmservices_internal_html.csv: 100%.


INFO:root:Data loaded into BigQuery table SEOTieto.grmservices_internal_html
INFO:root:Processing file: eezy_internal_html.csv


Download eezy_internal_html.csv: 100%.


INFO:root:Data loaded into BigQuery table SEOTieto.eezy_internal_html
INFO:root:Processing file: neonaudit_issues_overview_report.csv


Download neonaudit_issues_overview_report.csv: 100%.


INFO:root:Data loaded into BigQuery table SEOTieto.neonaudit_issues_overview_report
INFO:root:Processing file: neonaudit_internal_all.csv


Download neonaudit_internal_all.csv: 100%.


INFO:root:Data loaded into BigQuery table SEOTieto.neonaudit_internal_all
INFO:root:All files loaded to BigQuery successfully.


## Querying the data from BigQuery

In [31]:
# Querying all the brand names and saving to a Set
# The brand is the part of the table name before its first underscore, taken from every
# table in the dataset whose name ends in _internal_all or _internal_html.
query = """
    SELECT 
        SUBSTR(table_name, 1, STRPOS(table_name, '_') - 1) AS brand_name
    FROM 
        `seotieto.SEOTieto.INFORMATION_SCHEMA.TABLES`
    WHERE 
        REGEXP_CONTAINS(table_name, r'_internal_(all|html)$')
"""

# Run the query
query_job = client.query(query)

# Collect the results into a set
# A set also de-duplicates brands that have both an _internal_all and an _internal_html table.
brand_names = {row["brand_name"] for row in query_job}

def table_exists(client, table_id):
    """Return True if `client.get_table(table_id)` succeeds, False if it raises NotFound.

    Any other exception raised by the lookup propagates to the caller.
    """
    try:
        client.get_table(table_id)
    except NotFound:
        return False
    return True

def get_data_for_brand(brand):
    """Fetch the page-level and issue-level extracts for one brand from BigQuery.

    Parameters:
    - brand: brand prefix used to build the table names.

    Returns a tuple `(pages_df, issues_df)` of pandas DataFrames:
    - pages_df comes from `{brand}_internal_html` if that table exists, otherwise from
      `{brand}_internal_all`, restricted to HTML content types, status code 200 and
      indexable rows.
    - issues_df comes from `{brand}_issues_overview_report`, minus a fixed list of issue
      names, ordered by issue name and then by URL count descending.

    Raises ValueError when neither internal table exists. Uses the module-level `client`.
    """
    # The html table is preferred; the generator stops at the first one that exists.
    candidate_tables = (
        f"seotieto.SEOTieto.{brand}{suffix}"
        for suffix in ('_internal_html', '_internal_all')
    )
    table_id = next(
        (candidate for candidate in candidate_tables if table_exists(client, candidate)),
        None,
    )
    if table_id is None:
        raise ValueError(f"No valid table found for brand {brand}")

    # Page-level extract
    pages_sql = f"""
    SELECT
        address,
        word_count AS words,
        title_1_length,
        meta_description_1_length AS meta_length,
        title_1,
        meta_description_1 AS meta
    FROM `{table_id}`
    WHERE content_type IN (
        'text/html; charset=utf-8',
        'text/html;charset=UTF-8',
        'text/html; charset=iso-8859-1',
        'text/html;charset=utf-8',
        'text/html',
        'text/html; charset=UTF-8'
    )
    AND status_code = 200
    AND indexability = "Indexable"
    """

    # Issue-level extract
    # NOTE(review): 'Meta Description: Below 400 Pixels' appears twice in the exclusion
    # list. That is harmless to the query; confirm whether another name was intended.
    issues_sql = f"""
    SELECT
        issue_name AS name,
        issue_type AS type,
        issue_priority AS priority,
        urls,
        description,
        how_to_fix AS fix
    FROM `seotieto.SEOTieto.{brand}_issues_overview_report`
    WHERE issue_name NOT IN (
        'Security: Missing X-Content-Type-Options Header',
        'Security: Missing X-Frame-Options Header',
        'Security: Missing HSTS Header',
        'Security: Missing Content-Security-Policy Header',
        'Security: Missing Secure Referrer-Policy Header',
        'Security: Protocol-Relative Resource Links',
        'Security: Unsafe Cross-Origin Links',
        'Links: Pages With High External Outlinks',
        'Links: Internal Outlinks With No Anchor Text',
        'Content: Readability Very Difficult',
        'Security: Bad Content Type',
        'Page Titles: Over 561 Pixels',
        'Page Titles: Below 200 Pixels',
        'Response Codes: Internal Redirection (3xx)',
        'Directives: Noindex',
        'URL: Underscores',
        'URL: Parameters',
        'Directives: Nofollow',
        'Canonicals: Canonicalised',
        'Canonicals: Missing',
        'Security: HTTP URLs',
        'Pagination: Sequence Error',
        'Response Codes: External Client Error (4xx)',
        'Content: Readability Difficult',
        'Meta Description: Below 400 Pixels',
        'Meta Description: Below 400 Pixels'
    )
    ORDER BY issue_name ASC, urls DESC
    """

    # Submit both queries before waiting on either result.
    pages_job = client.query(pages_sql)
    issues_job = client.query(issues_sql)

    pages_df = pages_job.result().to_dataframe()
    issues_df = issues_job.result().to_dataframe()
    return pages_df, issues_df

# Map each brand to the (pages DataFrame, issues DataFrame) tuple returned by
# get_data_for_brand; a ValueError from any brand aborts the whole comprehension.
brand_dataframes = {brand: get_data_for_brand(brand) for brand in brand_names}

## Generating Sheets

In [32]:
from dotenv import load_dotenv
load_dotenv('/Users/tis/foam/cdp/code/cfg/.env')

# Template spreadsheet ID and sheet names for conditional formatting
# os.getenv returns None when TEMPLATE_SPREADSHEET_ID is unset; it is not validated here.
TEMPLATE_SPREADSHEET_ID = os.getenv('TEMPLATE_SPREADSHEET_ID')
TEMPLATE_SHEET_NAMES = ['internal_extract', 'issues_extract']

# Fetch conditional formatting rules from the template
# One spreadsheets().get call per template sheet, keyed by sheet name. `ranges` limits
# the response to that sheet and `fields` limits it to the conditional formats.
# The `[0]` raises IndexError if the response contains no sheets.
template_conditional_formats = {
    sheet_name: sheets_service.spreadsheets().get(
        spreadsheetId=TEMPLATE_SPREADSHEET_ID, 
        ranges=sheet_name, 
        fields='sheets(conditionalFormats)'
    ).execute().get('sheets', [])[0].get('conditionalFormats', [])
    for sheet_name in TEMPLATE_SHEET_NAMES
}

# Function to create a new Google Sheet
def create_sheet(service, title, folder_id):
    """Create an empty Google Sheets file inside a Drive folder.

    Parameters:
    - service: Drive API service object (its `files().create` is called).
    - title: name of the new spreadsheet file.
    - folder_id: ID of the parent Drive folder.

    Returns the 'id' of the created file, or None when the API call raises HttpError
    (the error is logged).
    """
    metadata = dict(
        name=title,
        mimeType='application/vnd.google-apps.spreadsheet',
        parents=[folder_id],
    )
    try:
        created = service.files().create(body=metadata).execute()
    except HttpError as error:
        logging.error(f'An error occurred: {error}')
        return None
    return created.get('id')

# Function to get the IDs of the sheets in a spreadsheet
def get_sheet_ids(service, spreadsheet_id):
    """Return a dict mapping each sheet title in the spreadsheet to its sheetId.

    Parameters:
    - service: Sheets API service object.
    - spreadsheet_id: ID of the spreadsheet to inspect.

    Returns an empty dict when the response has no 'sheets' entry.
    """
    response = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()
    ids_by_title = {}
    for sheet in response.get('sheets', ''):
        properties = sheet['properties']
        ids_by_title[properties['title']] = properties['sheetId']
    return ids_by_title

# Function to create new sheets within a Google Sheet in batch
def create_sheets_in_batch(service, spreadsheet_id, sheet_names):
    """Add one sheet (tab) per name to a spreadsheet using a single batchUpdate call.

    Parameters:
    - service: Sheets API service object.
    - spreadsheet_id: ID of the spreadsheet to modify.
    - sheet_names: titles of the sheets to add.

    Returns None. An HttpError from the API is logged rather than raised.
    """
    add_requests = []
    for title in sheet_names:
        add_requests.append({'addSheet': {'properties': {'title': title}}})

    try:
        service.spreadsheets().batchUpdate(
            spreadsheetId=spreadsheet_id,
            body={'requests': add_requests},
        ).execute()
    except HttpError as error:
        logging.error(f'An error occurred: {error}')
    else:
        logging.info(f'Sheets {sheet_names} created successfully in spreadsheet {spreadsheet_id}.')

# Function to write data to multiple Google Sheets in batch
def write_data_to_sheets_in_batch(service, spreadsheet_id, sheet_dataframes):
    """Write several DataFrames to named sheets with one values().batchUpdate call.

    Parameters:
    - service: Sheets API service object.
    - spreadsheet_id: ID of the spreadsheet to write to.
    - sheet_dataframes: iterable of `(sheet_name, DataFrame)` pairs. Each frame is
      written starting at A1 of its sheet, header row first.

    Returns None. The input frames are not modified. An HttpError from the API is
    logged rather than raised.
    """
    data = []
    for sheet_name, df in sheet_dataframes:
        # Missing cells must become '' before the payload is serialised. Casting to
        # object lets '' live in numeric and nullable columns, and the notna() mask
        # covers NaN, None, NaT and pd.NA without relying on replace() matching them.
        cells = df.astype(object).where(df.notna(), '')
        data.append({
            'range': f'{sheet_name}!A1',
            'values': [cells.columns.tolist()] + cells.values.tolist()
        })
    body = {
        'valueInputOption': 'RAW',
        'data': data
    }
    try:
        service.spreadsheets().values().batchUpdate(
            spreadsheetId=spreadsheet_id,
            body=body).execute()
    except HttpError as error:
        logging.error(f'An error occurred: {error}')

# Function to apply conditional formatting rules in batch
def apply_conditional_rules_in_batch(service, spreadsheet_id, sheet_formats):
    """Copy conditional-format rules onto sheets of a spreadsheet in one batchUpdate.

    Parameters:
    - service: Sheets API service object.
    - spreadsheet_id: ID of the spreadsheet to modify.
    - sheet_formats: iterable of `(sheet_name, rules)` pairs, where `rules` is a list of
      conditional-format rule dicts. Rules without a list-valued 'ranges' key are
      skipped.

    Each copied rule has the 'sheetId' of every range rewritten to the target sheet.
    The input rule dicts are not modified. Returns None. An HttpError from the API is
    logged rather than raised, and no request is sent when there is nothing to apply.
    """
    requests = []
    sheet_ids = get_sheet_ids(service, spreadsheet_id)
    for sheet_name, conditional_formats in sheet_formats:
        target_sheet_id = sheet_ids.get(sheet_name)
        if target_sheet_id is None:
            # A request with sheetId=None would make the API reject the whole batch,
            # including the rules for sheets that do exist.
            logging.warning(
                f"Sheet '{sheet_name}' not found in spreadsheet {spreadsheet_id}; "
                f"skipping its conditional formatting."
            )
            continue

        # Requests run in order, so inserting every rule at index 0 would leave the
        # rules in reverse order. Insert at increasing positions to keep the input
        # order (and therefore rule priority) intact.
        insert_at = 0
        for rule in conditional_formats:
            if 'ranges' in rule and isinstance(rule['ranges'], list):
                updated_rule = {
                    **rule,
                    'ranges': [
                        {**range_info, 'sheetId': target_sheet_id}
                        for range_info in rule['ranges']
                    ],
                }
                requests.append({"addConditionalFormatRule": {"rule": updated_rule, "index": insert_at}})
                insert_at += 1
    if requests:
        body = {'requests': requests}
        try:
            service.spreadsheets().batchUpdate(spreadsheetId=spreadsheet_id, body=body).execute()
        except HttpError as error:
            logging.error(f'An error occurred while applying conditional formatting: {error}')

# Main logic to create sheets for each brand and apply conditional formatting
# Relies on `brand_names` and `brand_dataframes` from the BigQuery cell and on
# `template_conditional_formats` fetched above. Each run creates a new spreadsheet per
# brand; nothing here checks whether one with the same name already exists.
for brand in brand_names:
    logging.info(f'Processing brand: {brand}')
    # Create a new Google Sheet in the specified folder
    # NOTE(review): the destination folder ID is hardcoded here rather than read from
    # configuration like DRIVE_FOLDER_ID — confirm whether that is intentional.
    spreadsheet_id = create_sheet(drive_service, brand, '12xrF81B81UORxXtKGul6GT19gpkZmt1l')
    # create_sheet returns None on failure, in which case the brand is skipped.
    if spreadsheet_id:
        # Create the 'internal' and 'issues' sheets in batch
        create_sheets_in_batch(sheets_service, spreadsheet_id, ['internal', 'issues'])
        # Write the data to the 'internal' and 'issues' sheets in batch
        write_data_to_sheets_in_batch(sheets_service, spreadsheet_id, [
            ('internal', brand_dataframes[brand][0]),
            ('issues', brand_dataframes[brand][1])
        ])
        # Apply the conditional formatting to the 'internal' and 'issues' sheets in batch
        apply_conditional_rules_in_batch(sheets_service, spreadsheet_id, [
            ('internal', template_conditional_formats['internal_extract']),
            ('issues', template_conditional_formats['issues_extract'])
        ])

INFO:root:Processing brand: jkaksi
INFO:root:Sheets ['internal', 'issues'] created successfully in spreadsheet 1UuMm753I9IcfHA-E0mB0LHj4nHDz9N8XrS0bGMGoYT4.
INFO:root:Processing brand: avoinsystems
INFO:root:Sheets ['internal', 'issues'] created successfully in spreadsheet 1PkeQVXm54Ty2vuHwr3N3fYZibx80DGPo-4JaCKGXnCI.
INFO:root:Processing brand: rsult
INFO:root:Sheets ['internal', 'issues'] created successfully in spreadsheet 1zkWbR82plyiwQ1k5iIYToCKDiOE8sT5pLLprd5Z5RzI.
INFO:root:Processing brand: kletta
INFO:root:Sheets ['internal', 'issues'] created successfully in spreadsheet 1A80oo0uoTtjLuDHl8Vg_sO9jKR6DnA0qXH0V4Gq4OHU.
INFO:root:Processing brand: tentrio
INFO:root:Sheets ['internal', 'issues'] created successfully in spreadsheet 1ndBKmsVDAXAPBER4SWJByartNlL_NaT0QjHNnthjldI.
INFO:root:Processing brand: topaasia
INFO:root:Sheets ['internal', 'issues'] created successfully in spreadsheet 1d95I7VEyDJfHEtVWXNTJEsj0jh9akC378tjBkwUg5p8.
INFO:root:Processing brand: upsyshopping
INFO:root

# Individual Page Auditing

In [6]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Load the credentials from the service account key file
# `key_path` is not defined in this cell; it comes from the setup cell at the top.
credentials = service_account.Credentials.from_service_account_file(key_path)

# Initialize a BigQuery client with the loaded credentials
# This rebinds the module-level `credentials` and `client` names used by other cells.
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Function to get links for a brand
def get_links_for_brand(brand):
    """Collect the addresses with status code 200 from a brand's internal crawl tables.

    Parameters:
    - brand: brand prefix; matched against the part of each table name before its first
      underscore.

    Returns a list of `address` values, concatenated across every matching
    `_internal_all` / `_internal_html` table. Prints a progress line per table and uses
    the module-level `client`.
    """
    # Find the brand's tables via the dataset's INFORMATION_SCHEMA.
    query_tables = f"""
    SELECT 
        table_name
    FROM 
        `seotieto.SEOTieto.INFORMATION_SCHEMA.TABLES`
    WHERE 
        REGEXP_CONTAINS(table_name, r'_internal_(all|html)$') AND
        SUBSTR(table_name, 1, STRPOS(table_name, '_') - 1) = '{brand}'
    """

    tables = client.query(query_tables).result().to_dataframe()['table_name'].tolist()

    print(f"Found {len(tables)} tables for brand '{brand}'.")

    links = []
    # Pull the addresses out of each table in turn.
    for table in tables:
        query_links = f"""
        SELECT
            address
        FROM `seotieto.SEOTieto.{table}` 
        WHERE status_code = 200
        """

        addresses = client.query(query_links).result().to_dataframe()['address'].tolist()
        links += addresses

        print(f"Added {len(addresses)} links from table '{table}' to the list. Total links: {len(links)}.")

    return links

# Specify the brand
# NOTE: `brand` is also the loop variable of the sheet-generation cell; this rebinds it.
brand = "eezy"
# `links` is a module-level name that get_next_chunk() reads in a later cell.
links = get_links_for_brand(brand)

# Print out the links
for link in links:
    print(link)

Found 1 tables for brand 'eezy'.
Added 188 links from table 'eezy_internal_html' to the list. Total links: 188.
https://eezy.fi/
https://tyopaikat.eezy.fi/fi
https://tyopaikat.eezy.fi/fi/kirjaudu
https://flow.eezy.fi/palvelumme/
https://flow.eezy.fi/
https://flow.eezy.fi/palvelumme/johtaminen/
https://flow.eezy.fi/palvelumme/tutkimukset/
https://kevytyrittajat.eezy.fi/tyon-tilaajalle/yrityskumppanuus/
https://flow.eezy.fi/palvelumme/strategiat-ja-konseptit/
https://flow.eezy.fi/palvelumme/muutosjohtaminen/
https://farenta.eezy.fi/
https://kevytyrittajat.eezy.fi/
https://tyollisyyspalvelut.eezy.fi/
https://tyollisyyspalvelut.eezy.fi/yrityksille/
https://kevytyrittajat.eezy.fi/kevytyrittajyys/
https://tyollisyyspalvelut.eezy.fi/yrityksille/tyopaikkasuomi-koulutus/
https://tyollisyyspalvelut.eezy.fi/yrityksille/eezympaa-tyokykya/
https://tyollisyyspalvelut.eezy.fi/yrityksille/rekrytoivat-koulutukset/
https://eezy.fi/en/kampanjat/work-in-finland/
https://eezy.fi/en/
https://eezy.fi/sijoitt

In [None]:
# Prints the whole `links` list on one line; this cell has no execution count.
print(links)

In [33]:
import subprocess
import time

# Function to open links
def open_links(chunk):
    """Open each URL in `chunk` with the `open -a Arc` command, pausing 0.2 s after each.

    Parameters:
    - chunk: iterable of URL strings.

    Raises subprocess.CalledProcessError if a launch command exits non-zero
    (`check=True`), which stops the remaining URLs from being opened.
    """
    for link in chunk:
        command = ["open", "-a", "Arc", link]
        subprocess.run(command, check=True)
        time.sleep(0.2)

# Function to get the next chunk of links
def get_next_chunk(chunk_size=20, index_path='index.txt'):
    """Open the next batch of URLs from the global `links` list and record the progress.

    Parameters:
    - chunk_size: maximum number of links to open per call (default 20).
    - index_path: text file holding the index of the next link to open
      (default 'index.txt' in the working directory). It must already exist.

    Reads the current index from `index_path`, opens up to `chunk_size` links starting
    there via `open_links`, then writes the new index back. The index is only written
    after the links were opened, so a failed batch is retried on the next call.

    Raises ValueError if `chunk_size` is less than 1 or the index file does not contain
    an integer.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Read the current index from a file
    with open(index_path, 'r') as f:
        current_index = int(f.read())

    if current_index < len(links):
        chunk = links[current_index:current_index + chunk_size]
        open_links(chunk)
        # Advance by what was actually opened: the last batch can be shorter than
        # chunk_size, and the saved index must never run past the end of the list.
        current_index += len(chunk)
        print(f"Progress: {current_index}/{len(links)} links opened.")

        # Write the updated index back to the file
        with open(index_path, 'w') as f:
            f.write(str(current_index))
    else:
        print("No more links to open.")

# Initialize the index file with 0 if it doesn't exist
# Mode 'x' fails with FileExistsError when the file is already there, so an existing
# index (progress from earlier runs of this cell) is kept rather than reset.
try:
    with open('index.txt', 'x') as f:
        f.write('0')
except FileExistsError:
    pass

# Each execution of this cell opens the next batch of links.
get_next_chunk()

Progress: 160/188 links opened.


# All URLs of domain from sitemaps

In [21]:
import csv
import concurrent.futures
import requests
from bs4 import BeautifulSoup
from requests.sessions import Session

def extract_links_from_sitemap(url, session, timeout=30):
    """Fetch one sitemap and return the text of every <loc> element in it.

    Parameters:
    - url: URL of the sitemap XML document.
    - session: object with a requests-style `get(url, timeout=...)` method.
    - timeout: seconds passed to `session.get` so a stalled server cannot block the
      calling worker indefinitely (default 30).

    Returns a list of strings. A non-200 response or any exception (including a
    timeout) is reported with print() and yields an empty list, so one bad sitemap
    does not abort the whole run.
    """
    try:
        response = session.get(url, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')
            return [loc.text for loc in soup.find_all('loc')]
        print(f"Failed to fetch sitemap from {url}")
        return []
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch.
        print(f"An error occurred while processing {url}: {e}")
        return []

def extract_all_links(sitemap_urls):
    """Fetch several sitemaps concurrently and return all of their links in one list.

    Parameters:
    - sitemap_urls: iterable of sitemap URLs.

    One Session is shared by up to 10 worker threads. Results are appended as each
    fetch completes, so the order of the returned list follows completion order rather
    than the order of `sitemap_urls`.
    """
    collected = []
    with Session() as session, concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        pending = []
        for sitemap_url in sitemap_urls:
            pending.append(pool.submit(extract_links_from_sitemap, sitemap_url, session))
        for finished in concurrent.futures.as_completed(pending):
            collected += finished.result()
    return collected

def save_to_csv(links):
    """Write `links` to 'eezy.csv' in the working directory, one URL per row.

    Parameters:
    - links: iterable of URL strings.

    The file is overwritten if it exists and starts with a single 'URL' header row.
    """
    with open('eezy.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['URL'])
        writer.writerows([link] for link in links)

# Define sitemap_urls with the list of URLs you provided
# Hand-maintained list of individual sitemap files across the eezy.fi subdomains;
# sitemap index files are not expanded by the code below.
sitemap_urls = [
    "https://eezy.fi/post-sitemap.xml",
    "https://eezy.fi/page-sitemap.xml",
    "https://eezy.fi/campaign-sitemap.xml",
    "https://eezy.fi/investor_page-sitemap.xml",
    "https://eezy.fi/office-sitemap.xml",
    "https://eezy.fi/category-sitemap.xml",
    "https://eezy.fi/post_tag-sitemap.xml",
    "https://eezy.fi/author-sitemap.xml",
    "https://flow.eezy.fi/post-sitemap.xml",
    "https://flow.eezy.fi/page-sitemap.xml",
    "https://flow.eezy.fi/course-sitemap.xml",
    "https://flow.eezy.fi/case-sitemap.xml",
    "https://flow.eezy.fi/subject-sitemap.xml",
    "https://kevytyrittajat.eezy.fi/post-sitemap.xml",
    "https://kevytyrittajat.eezy.fi/page-sitemap.xml",
    "https://kevytyrittajat.eezy.fi/wp_quiz-sitemap.xml",
    "https://kevytyrittajat.eezy.fi/category-sitemap.xml",
    "https://kevytyrittajat.eezy.fi/post_tag-sitemap.xml",
    "https://tyollisyyspalvelut.eezy.fi/post-sitemap.xml",
    "https://tyollisyyspalvelut.eezy.fi/page-sitemap.xml",
    "https://tyollisyyspalvelut.eezy.fi/service-sitemap.xml",
    "https://personnel.eezy.fi/post-sitemap.xml",
    "https://personnel.eezy.fi/page-sitemap.xml",
    "https://personnel.eezy.fi/category-sitemap.xml",
    "https://personnel.eezy.fi/post_tag-sitemap.xml",
    "https://personnel.eezy.fi/job_category-sitemap.xml",
    "https://personnel.eezy.fi/job_subcategory-sitemap.xml",
    "https://personnel.eezy.fi/author-sitemap.xml"
]

# Extracting all links from sitemaps
# `all_links` is a module-level name that the CSV-splitting cell below also uses.
all_links = extract_all_links(sitemap_urls)

# Save links to CSV
save_to_csv(all_links)

print("URLs saved to eezy.csv")


URLs saved to eezy.csv


In [24]:
import csv
import os

def split_list_to_csv(all_links, output_dir='CSV exports', base_filename='eazy', max_urls_per_file=500):
    """
    Splits a list of URLs into multiple CSV files, each with a maximum number of URLs.

    Parameters:
    - all_links: Iterable of URLs to be written to CSV files.
    - output_dir: Directory where the output CSV files will be saved (created if missing).
    - base_filename: Base name for output files, which will be suffixed with 1, 2, 3, ...
    - max_urls_per_file: Maximum number of URLs per output file; must be at least 1.

    Every file starts with a 'URL' header row. No file is created for an empty input.
    Prints a summary line with the number of files written and returns None.

    Raises ValueError if max_urls_per_file is less than 1.

    NOTE(review): the default base name is 'eazy' while other outputs in this notebook
    use 'eezy' — confirm which spelling is intended.
    """
    if max_urls_per_file < 1:
        raise ValueError("max_urls_per_file must be at least 1")

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Materialise once so any iterable can be sliced into fixed-size chunks.
    urls = list(all_links)

    file_count = 0
    for start in range(0, len(urls), max_urls_per_file):
        file_count += 1
        current_file_path = os.path.join(output_dir, f"{base_filename}{file_count}.csv")
        # `with` guarantees the handle is closed even if a write fails part-way.
        with open(current_file_path, mode='w', newline='', encoding='utf-8') as current_file:
            current_writer = csv.writer(current_file)
            current_writer.writerow(['URL'])  # Write the header to each new file
            current_writer.writerows([url] for url in urls[start:start + max_urls_per_file])

    print(f"Split completed. {file_count} file(s) created in the '{output_dir}' directory.")

# Example usage:
# `all_links` is the list built by extract_all_links() in the sitemap cell above.
# Do not re-assign it to a `[...]` placeholder here: in Python that literal is the
# one-element list [Ellipsis], which replaces the real URLs and exports a single CSV
# whose only data row is "Ellipsis".
split_list_to_csv(all_links)


Split completed. 1 file(s) created in the 'CSV exports' directory.


In [2]:
import csv
import os
import concurrent.futures
import requests
from bs4 import BeautifulSoup
from requests.sessions import Session

def extract_links_from_sitemap(url, session, timeout=30):
    """Fetch one sitemap and return the text of every <loc> element in it.

    Parameters:
    - url: URL of the sitemap XML document.
    - session: object with a requests-style `get(url, timeout=...)` method.
    - timeout: seconds passed to `session.get` so a stalled server cannot block the
      calling worker indefinitely (default 30).

    Returns a list of strings. A non-200 response or any exception (including a
    timeout) is reported with print() and yields an empty list, so one bad sitemap
    does not abort the whole run.
    """
    try:
        response = session.get(url, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')
            return [loc.text for loc in soup.find_all('loc')]
        print(f"Failed to fetch sitemap from {url}")
        return []
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch.
        print(f"An error occurred while processing {url}: {e}")
        return []

def extract_all_links(sitemap_urls):
    """Fetch several sitemaps concurrently and return all of their links in one list.

    Parameters:
    - sitemap_urls: iterable of sitemap URLs.

    One Session is shared by up to 10 worker threads. Results are appended as each
    fetch completes, so the order of the returned list follows completion order rather
    than the order of `sitemap_urls`.
    """
    collected = []
    with Session() as session, concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        pending = []
        for sitemap_url in sitemap_urls:
            pending.append(pool.submit(extract_links_from_sitemap, sitemap_url, session))
        for finished in concurrent.futures.as_completed(pending):
            collected += finished.result()
    return collected

def save_to_csv_in_parts(links, output_dir='CSV exports', base_filename='eazy', max_urls_per_file=200000):
    """Write ``links`` to numbered single-column CSV files of bounded size.

    Files are named ``{base_filename}1.csv``, ``{base_filename}2.csv``, ...
    inside ``output_dir``. Each starts with a ``URL`` header row followed by
    at most ``max_urls_per_file`` links, one per row. A summary line is
    printed at the end.

    Args:
        links: Iterable of URLs to write (consumed once, in order).
        output_dir: Directory for the output files; created if missing.
        base_filename: Prefix placed before the running file number.
        max_urls_per_file: Maximum number of data rows per file.

    Raises:
        ValueError: If ``max_urls_per_file`` is smaller than 1.
    """
    if max_urls_per_file < 1:
        # Fail early with a clear message instead of a ZeroDivisionError below.
        raise ValueError(f"max_urls_per_file must be >= 1, got {max_urls_per_file}")

    os.makedirs(output_dir, exist_ok=True)

    files_written = 0
    current_file = None
    current_writer = None
    try:
        for position, url in enumerate(links):
            # Every max_urls_per_file-th link rolls over to a fresh file.
            if position % max_urls_per_file == 0:
                if current_file is not None:
                    current_file.close()
                files_written += 1
                current_file_path = os.path.join(output_dir, f"{base_filename}{files_written}.csv")
                current_file = open(current_file_path, mode='w', newline='', encoding='utf-8')
                current_writer = csv.writer(current_file)
                current_writer.writerow(['URL'])  # Write the header to each new file
            current_writer.writerow([url])
    finally:
        # Close the last handle even when a write fails part-way through.
        if current_file is not None:
            current_file.close()

    print(f"Split completed. {files_written} file(s) created in the '{output_dir}' directory.")

# Sitemap URLs for www.yrittajat.fi, generated from a compact table instead of
# being spelled out one by one. Each table row is
#     (sitemap section, (pages under "/", pages under "/en/", pages under "/sv/"))
# and expands to https://www.yrittajat.fi/{prefix}wp-sitemap-{section}-{page}.xml
# for page = 1..count, with the root-path pages first, then "en/", then "sv/".
# To cover a new sitemap page, bump the matching count; a count of 0 means the
# section has no sitemap under that language prefix.
sitemap_urls = [
    f"https://www.yrittajat.fi/{lang_prefix}wp-sitemap-{section}-{page}.xml"
    for section, pages_per_lang in (
        ("posts-page", (1, 1, 1)),
        ("posts-association", (3, 1, 1)),
        ("posts-blog", (1, 1, 1)),
        ("posts-class", (1, 1, 0)),
        ("posts-contact", (5, 1, 1)),
        ("posts-event", (5, 1, 1)),
        ("posts-guide", (1, 1, 1)),
        ("posts-material", (1, 1, 1)),
        ("posts-membership-benefit", (1, 1, 1)),
        ("posts-membership-service", (1, 1, 1)),
        ("posts-news", (23, 1, 1)),
        ("posts-organization", (1, 1, 1)),
        ("posts-podcast", (1, 0, 0)),
        ("posts-private", (1, 0, 0)),
        ("posts-release", (2, 1, 0)),
        ("posts-research", (1, 1, 1)),
        ("posts-statement", (4, 0, 0)),
        ("posts-company", (61, 0, 0)),
        ("posts-person", (115, 0, 0)),
        ("taxonomies-media_category", (1, 1, 1)),
        ("taxonomies-tax-association", (1, 1, 1)),
        ("taxonomies-benefit-category", (1, 1, 1)),
        ("taxonomies-tax-blog", (1, 1, 1)),
        ("taxonomies-contact-group", (1, 0, 1)),
        ("taxonomies-event-type", (1, 1, 1)),
        ("taxonomies-guide-topic", (1, 0, 0)),
        ("taxonomies-keyword", (1, 1, 1)),
        ("taxonomies-material-type", (1, 1, 1)),
        ("taxonomies-tax-organization", (1, 1, 1)),
        ("taxonomies-podcast-name", (1, 0, 0)),
        ("taxonomies-region", (1, 1, 1)),
        ("taxonomies-research-topic", (1, 0, 0)),
        ("taxonomies-service-category", (1, 1, 0)),
        ("taxonomies-statement-topic", (1, 0, 0)),
        ("taxonomies-theme", (1, 1, 1)),
    )
    for lang_prefix, page_count in zip(("", "en/", "sv/"), pages_per_lang)
    for page in range(1, page_count + 1)
]

# Fetch every sitemap listed in sitemap_urls and flatten the <loc> entries
# into one list. Sitemaps that fail to download contribute no links; the
# failure is only printed, so check the cell output before trusting the
# result as complete.
all_links = extract_all_links(sitemap_urls)

# Write the collected links to numbered CSV files using the function's
# default output directory, file-name prefix and per-file row limit.
save_to_csv_in_parts(all_links)


Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-2.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-4.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-5.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-6.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-10.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-7.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-3.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-8.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-1.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-9.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-person-11.xml
Failed to fetch sitemap from https://www.yrittajat.fi/wp-sitemap-posts-per

In [5]:
import pandas as pd

# Read the first exported URL chunk back from disk
df = pd.read_csv('/Users/tis/Dendron/notes/SEOTieto/CSV exports/eazy1.csv')

# Keep only the rows whose first-column value already occurred on an earlier row
duplicates = df.loc[df.duplicated(subset=[df.columns[0]], keep='first')]

# Show the repeated rows; an empty frame means every value is unique
print(duplicates)

Empty DataFrame
Columns: [URL]
Index: []


In [3]:
import pandas as pd
import glob

def merge_and_sum_csv_files(input_pattern, output_file):
    """Sum the ``URLs`` column per ``Issue Name`` across all CSVs matching a glob.

    Args:
        input_pattern: Glob pattern selecting the CSV files to read.
        output_file: Path of the CSV that receives one row per issue name
            together with its summed ``URLs`` value.

    Files that are empty or lack an ``Issue Name`` or ``URLs`` column are
    skipped. Non-numeric ``URLs`` values are coerced to NaN, which the sum
    ignores. When no file contributes data, nothing is written and a notice
    is printed instead.
    """
    # Sorted so processing order does not depend on directory listing order.
    csv_files = sorted(glob.glob(input_pattern))

    # Collect the frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and appending inside the loop re-copied the data every time.
    frames = []
    for file in csv_files:
        # Read the current CSV file, assuming the first row holds the headers
        df = pd.read_csv(file)

        if not df.empty and "Issue Name" in df.columns and "URLs" in df.columns:
            df["URLs"] = pd.to_numeric(df["URLs"], errors='coerce')
            frames.append(df)

    if not frames:
        print("No data was found to combine and sum.")
        return

    combined_df = pd.concat(frames, ignore_index=True)

    # Group the combined data by 'Issue Name' and sum the 'URLs' values
    result_df = combined_df.groupby("Issue Name", as_index=False)["URLs"].sum()

    result_df.to_csv(output_file, index=False)
    print(f"Combined and summed CSV has been saved to {output_file}")

# Example usage
# input_pattern is a glob; every CSV it matches is read.
input_pattern = '/Users/tis/Dendron/notes/SEOTieto/CSV exports/overview/*.csv'  # Update this path to match your CSV files location
# Relative path: the summary is written to the kernel's current working directory.
output_file = 'combined_output.csv'
merge_and_sum_csv_files(input_pattern, output_file)


No data was found to combine and sum.


In [27]:
import pandas as pd
import glob

def merge_and_sum_csv_files(input_pattern, output_file):
    """Aggregate ``URLs`` per ``Issue Name`` over every CSV matched by a glob.

    Args:
        input_pattern: Glob pattern selecting the CSV files to read.
        output_file: Destination CSV for the per-issue totals.

    A file takes part only if it has rows and carries both an ``Issue Name``
    and a ``URLs`` column; ``URLs`` entries that are not numbers become NaN
    before summing. If no file qualifies, a notice is printed and no output
    file is produced.
    """
    usable_frames = []
    for path in glob.glob(input_pattern):
        frame = pd.read_csv(path)
        # Skip files that have no rows or miss one of the required columns
        if frame.empty or "Issue Name" not in frame.columns or "URLs" not in frame.columns:
            continue
        frame["URLs"] = pd.to_numeric(frame["URLs"], errors='coerce')
        usable_frames.append(frame)

    if not usable_frames:
        print("No data was found to combine and sum.")
        return

    # Stack all qualifying frames, then total 'URLs' for each 'Issue Name'
    totals = (
        pd.concat(usable_frames, ignore_index=True)
        .groupby("Issue Name", as_index=False)["URLs"]
        .sum()
    )

    totals.to_csv(output_file, index=False)
    print(f"Combined and summed CSV has been saved to {output_file}")

# Example usage
# input_pattern is a glob; every CSV it matches is read.
input_pattern = '/Users/tis/Dendron/notes/SEOTieto/CSV exports/overview/*.csv'  # Update this path
# Relative path: the summary is written to the kernel's current working directory.
output_file = 'combined_output.csv'
merge_and_sum_csv_files(input_pattern, output_file)


Combined and summed CSV has been saved to combined_output.csv


In [28]:
import pandas as pd
import glob

def merge_csv_files(input_folder, output_file):
    """Concatenate every ``*.csv`` file directly inside a folder into one CSV.

    Args:
        input_folder: Directory whose ``*.csv`` files are merged.
        output_file: Path of the merged CSV to write.

    Files are read in sorted name order so the merged row order is the same
    on every run. When the folder contains no CSV files, nothing is written
    and a notice is printed (``pd.concat`` raises on an empty list).
    """
    # Construct the pattern to match all CSV files in the folder
    input_pattern = f"{input_folder}/*.csv"
    csv_files = sorted(glob.glob(input_pattern))

    if not csv_files:
        print(f"No CSV files were found in {input_folder}")
        return

    # Read everything first, then concatenate once
    all_dfs = [pd.read_csv(file) for file in csv_files]
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Write the combined DataFrame to a new CSV file
    combined_df.to_csv(output_file, index=False)
    print(f"All CSV files have been merged into {output_file}")

# Example usage
input_folder = '/Users/tis/Dendron/notes/SEOTieto/CSV exports/internal all'  # Folder containing the CSV files
# The output lives in the parent of input_folder, so it does not match the
# folder's *.csv pattern and is not read back in when this cell is re-run.
output_file = '/Users/tis/Dendron/notes/SEOTieto/CSV exports/merged_internal_all.csv'  # Desired output file path
merge_csv_files(input_folder, output_file)


All CSV files have been merged into /Users/tis/Dendron/notes/SEOTieto/CSV exports/merged_internal_all.csv
