# Lost and Found

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Scrape the paginated lost-and-found list, collect every table row into
# `all_data`, and write the combined result to lost_and_found.csv.
LAST_PAGE = 130  # pages are numbered 1..LAST_PAGE in the URL
URL_TEMPLATE = "https://mpk.krakow.pl/pl/biuro-rzeczy-znalezionych/lista/index,{page}.html"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request blocks forever

all_data = []
# Column names are taken once, from the first page that yields a usable table,
# so a later page with missing/odd headers cannot change the final columns.
lost_and_found_table_titles = None

with requests.Session() as session:  # one session reuses the connection across pages
    for i in range(1, LAST_PAGE + 1):
        base_url = URL_TEMPLATE.format(page=i)

        try:
            page = session.get(base_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A single network failure should not abort the whole run.
            print(f"Failed to fetch page {i}: {exc}")
            continue

        if page.status_code != 200:
            print(f"Failed to fetch page {i}. Status code: {page.status_code}")
            continue

        soup = BeautifulSoup(page.text, "html.parser")

        # Locate the table
        table = soup.find("table", class_="przetarg-tab")
        if not table:
            print(f"No table found on page {i}. Skipping.")
            continue

        # Extract table headers (prefer the target table; fall back to the
        # page-wide <th> search that was used previously).
        if lost_and_found_table_titles is None:
            header_cells = table.find_all("th") or soup.find_all("th")
            titles = [cell.text.strip() for cell in header_cells]
            if not titles:
                print(f"No table headers found on page {i}. Skipping.")
                continue
            lost_and_found_table_titles = titles

        # Extract rows; the first <tr> is the header row.
        for row in table.find_all("tr")[1:]:
            individual_row_data = [cell.text.strip() for cell in row.find_all("td")]
            if not individual_row_data:
                # Rows without <td> cells carry no data.
                continue
            if len(individual_row_data) != len(lost_and_found_table_titles):
                # A ragged row would break or silently pad the final DataFrame.
                print(
                    f"Page {i}: skipping row with {len(individual_row_data)} cells "
                    f"(expected {len(lost_and_found_table_titles)})."
                )
                continue
            all_data.append(individual_row_data)

if not all_data:
    # Fail with a clear message instead of a NameError / empty CSV.
    raise RuntimeError("No rows were scraped; lost_and_found.csv was not written.")

# Combine all data into a single DataFrame (built once, not row by row)
final_df = pd.DataFrame(all_data, columns=lost_and_found_table_titles)

# Save to CSV
final_df.to_csv("lost_and_found.csv", index=False, encoding="utf-8")
print("All data saved to lost_and_found.csv.")

In [5]:
# Sanity check: print the index range, column names, non-null counts and dtypes
# of the combined frame.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6479 entries, 0 to 6478
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Id                               6479 non-null   object
 1   Nazwa                            6479 non-null   object
 2   Numer linii/Miejsce znalezienia  6479 non-null   object
 3   Data znalezienia                 6479 non-null   object
 4   Miejsce odbioru                  6479 non-null   object
dtypes: object(5)
memory usage: 253.2+ KB


In [6]:
# Preview the first rows of the combined frame.
final_df.head()

Unnamed: 0,Id,Nazwa,Numer linii/Miejsce znalezienia,Data znalezienia,Miejsce odbioru
0,1.15557.2024,Latarka,69,2024-12-01,MPK
1,1.15558.2024,Laska inwalidzka,17,2024-12-01,MPK
2,1.15559.2024,Reklamówka z zawartością,21,2024-12-01,MPK
3,1.15560.2024,Teczka z zawartością,18,2024-12-01,MPK
4,1.15561.2024,Siatka z zawartością,194,2024-12-01,MPK
