In [10]:
import pandas as pd
import os
import re

# Load the raw scrape from its directory.
# NOTE(review): the path is absolute and machine-specific; the working directory is
# changed on purpose here because later cells write files relative to it.
os.chdir(r"C:\Users\phili\Projects\immo-eliza-scraping-FireFlies\Data\Raw") # raw strings for syntax / input box for external user?
data = pd.read_csv("immoweb_raw.csv")

# Goal: no blank spaces, no empty values, no wrongly encoded values
# (e.g. a text value in the price column).

# Trim surrounding whitespace in text columns only.
# read_csv stores text as dtype `object` by default, so comparing the dtype with
# 'string' never matched and nothing was stripped. Check the dtype kind instead.
types = data.dtypes
for col in data.columns:
    if isinstance(types[col], pd.StringDtype):
        data[col] = data[col].str.strip()
    elif pd.api.types.is_object_dtype(types[col]):
        # Object columns may mix text with NaN/numbers: strip only real strings
        # and leave every other value untouched.
        data[col] = data[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

# Fill missing values with the "None" placeholder
data = data.fillna("None")
# Filter out rows where the 'Price' column contains 'None' (2.744)
data = data[data['Price'] != 'None']

# Save the cleaned data (this directory stays the working directory afterwards)
os.chdir(r"C:\Users\phili\Projects\immo-eliza-scraping-FireFlies\Data\Raw\Clean")
data.to_csv("immoweb_cleaned.csv", index=False)

In [None]:
# Exclude the first column from the duplicate check
columns_to_check = data.columns[1:]

# Boolean flag per row: True for every repeat of an earlier row
# (compared on all columns except the first); the first occurrence stays False.
duplicates = data.duplicated(subset=columns_to_check, keep='first')

# Keep the duplicated rows around for inspection
duplicated_rows = data[duplicates]
# DataFrame.count() returns per-column non-null counts, not a row count;
# the number of duplicated rows is the number of True flags.
duplicated_rows_count = int(duplicates.sum())
print("Number of duplicated rows found:", duplicated_rows_count)

# Remove duplicate rows based on the same columns
data = data.drop_duplicates(subset=columns_to_check, keep='first')

# Display the non-null count per column of the de-duplicated dataset
print("Cleaned Dataset (excluding duplicate rows):")
data.count()

In [None]:
# Rows that have a real value in every column.
# Missing values were replaced with the string "None" when the data was loaded
# (fillna("None")), so notna() alone matches every row; the placeholder has to be
# treated as missing as well.
is_missing = data.isna() | data.isin(["None"])
filled_rows = data[~is_missing.any(axis=1)]

# Print the rows
print(filled_rows)
filled_rows.count()

In [None]:
# Remove the 'Closed Box Parking Spaces' column from the working frame
data = data.drop(columns=['Closed Box Parking Spaces'])

Save the Cleaned Dataset


In [None]:

# Write the current frame to CSV (path is relative to the working directory,
# row index omitted), then show the frame as the cell output
cleaned_csv_name = "immoweb_cleaned1.csv"
data.to_csv(cleaned_csv_name, index=False)
data

Data Analysis


In [None]:
print(data.describe())  # Summary statistics
# info() prints its report itself and returns None, so wrapping it in print()
# appended a stray "None" line to the output.
data.info()             # Information about the dataset

Convert columns to numeric types (int / float)

In [None]:
columns_to_convert = [
    'Price', 'Construction Year', 'Bedroom Count', 'Terrace Surface',
    'Garden Surface', 'Land Surface', 'Facade Count',
]

# Convert the listed columns to numeric; anything that cannot be parsed
# (for example the "None" placeholder string) becomes NaN.
data[columns_to_convert] = data[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# Missing facade counts are replaced by a 0 placeholder
placeholder_value = 0
data['Facade Count'] = data['Facade Count'].fillna(placeholder_value)

# Construction year is kept as float, which can represent NaN
data['Construction Year'] = data['Construction Year'].astype(float)
# The coercion above can leave NaN in 'Bedroom Count', and astype(int) raises on
# NaN. The nullable 'Int64' dtype stores whole numbers and keeps missing values as <NA>.
data['Bedroom Count'] = data['Bedroom Count'].astype('Int64')



In [None]:
# Show the 'Facade Count' column as the cell output
data.loc[:, 'Facade Count']

In [None]:
# info() prints its report itself and returns None, so wrapping it in print()
# appended a stray "None" line to the output.
data.info()      # Information about the dataset


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise relationships between price and the main size features.
# Column names are written with spaces, as everywhere else in this notebook
# ('Bedroom Count', 'Land Surface'); the underscore variants raise a KeyError.
# NOTE(review): 'Habitable Surface' is assumed to follow the same naming —
# confirm against data.columns.
pairplot_columns = ['Price', 'Bedroom Count', 'Land Surface', 'Habitable Surface']
sns.pairplot(data[pairplot_columns])
plt.show()

# Example: Boxplot to visualize the distribution of 'Price' for different 'Type' of properties
plt.figure(figsize=(10, 6))
sns.boxplot(x='Type', y='Price', data=data)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Keep the numeric columns only and compute their pairwise correlations
numeric_data = data.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()

# Draw the annotated heatmap on an explicitly created 12x10 figure
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    ax=ax,
)
ax.set_title('Correlation Heatmap of Numeric Features')
plt.show()