## Reading and Understanding the Data ##

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Snake_case labels applied to the columns of the apartments_rent_pl_2024_06 file
col_names = [
    "id", "city", "type", "square_meters", "rooms", "floor", "floor_count",
    "build_year", "latitude", "longitude", "centre_dist", "poi_count",
    "school_dist", "clinic_dist", "post_o_dist", "kindergarten_dist",
    "restaurant_dist", "college_dist", "pharmacy_dist", "ownership",
    "building_material", "condition", "has_parking", "has_balcony",
    "has_elevator", "has_security", "has_storage", "price",
]

# Read the CSV; header=0 consumes the file's own header row, which is then
# replaced by col_names
data = pd.read_csv("apartments_rent_pl_2024_06.csv", names=col_names, header=0)

 Let’s take a quick look at the dataframe description.

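Data Fields (listed under their original CSV names; the loading cell above renames them to snake_case, e.g. `squareMeters` → `square_meters`)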

•  city - the name of the city where the property is located

•  type - type of the building

•  squareMeters - the size of the apartment in square meters

•  rooms - number of rooms in the apartment

•  floor / floorCount - the floor where the apartment is located and the total number of floors in the building

•  buildYear - the year when the building was built

•  latitude, longitude - geo coordinate of the property

•  centreDistance - distance from the city centre in km

• poiCount - number of points of interest in 500m range from the apartment (schools, clinics, post offices, kindergartens, restaurants, colleges,
   pharmacies)

• [poiName]Distance - distance to the nearest point of interest (schools, clinics, post offices, kindergartens, restaurants, colleges, pharmacies)

•  ownership - the type of property ownership

•  condition - the condition of the apartment

• has[features] - whether the property has key features such as assigned parking space, balcony, elevator, security, storage room

•  price - offer price in Polish Zloty

In [None]:
# Preview the first rows of the freshly loaded dataset (before any filtering)
data.head()

In [None]:
# Restrict the analysis to Warsaw listings. .copy() gives data_new its own
# storage, so the column drops, imputations and encodings applied to it later
# modify an independent frame rather than a slice of `data`.
data_new = data[data["city"] == "warszawa"].copy()

## Data Inspection ##

In [None]:
# (rows, columns) of the Warsaw-only subset
data_new.shape

In [None]:
# Column dtypes and non-null counts for the Warsaw subset
data_new.info()

The Warsaw subset has 8849 rows and 28 columns.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
data_new.describe()

One thing worth noticing is that 75% of the listings are priced at or below 4000 PLN, while the maximum is 19500 PLN. This gap suggests there are some outliers in the dataset.

In [None]:
# Summary (count, unique, top, freq) of the object-typed columns
data_new.describe(include=object)

In [None]:
#data= data.rename(columns={"squareMeters": "square_meters"})

In [None]:
#data= data.rename(columns={"floorCount": "floor_count"})

In [None]:
#data= data.rename(columns={"buildYear": "build_year"})

In [None]:
#data= data.rename(columns={"centreDistance": "centre_dist"})

In [None]:
#data= data.rename(columns={"centreDistance": "centre_distance"})

In [None]:
#data= data.rename(columns={"poiCount": "poi_count"})

In [None]:
#data= data.rename(columns={"schoolDistance": "school_dist"})

In [None]:
#data= data.rename(columns={"clinicDistance": "clinic_dist"})

In [None]:
#data= data.rename(columns={"postOfficeDistance": "post_office_dist"})

In [None]:
#data= data.rename(columns={"kindergartenDistance": "kindergarten_dist"})

In [None]:
#data= data.rename(columns={"restaurantDistance": "restaurant_dist"})

In [None]:
#data= data.rename(columns={"collegeDistance": "college_dist"})

In [None]:
#data= data.rename(columns={"pharmacyDistance": "pharmacy_dist"})

In [None]:
#data= data.rename(columns={"buildingMaterial": "building_material"})

In [None]:
#data= data.rename(columns={"hasParkingSpace": "has_parking_space"})

In [None]:
#data= data.rename(columns={"hasBalcony": "has_balcony"})

In [None]:
#data= data.rename(columns={"hasElevator": "has_elevator"})

In [None]:
#data= data.rename(columns={"hasSecurity": "has_security"})

In [None]:
#data= data.rename(columns={"hasStorageRoom": "has_storage_room"})

## Data Cleaning ##

In [None]:
# Count the missing entries in every column of the Warsaw subset
missing_counts = data_new.isna().sum()
print("Missing values in each column:", missing_counts, sep="\n")

Remove variables that contain more than 20% missing values.

In [None]:
# Drop the columns the notebook discards for having too many missing values
# (its stated rule is more than 20% missing). The result is reassigned rather
# than dropped in place: data_new originates from a boolean filter of `data`,
# and in-place mutation of such a slice is what pandas warns about.
data_new = data_new.drop(columns=["type", "building_material", "condition"])

In [None]:
# Replace missing categorical values with each column's mode.
# The filled Series is assigned back to the column: calling
# data_new[col].fillna(..., inplace=True) acts on a temporary Series and,
# under pandas Copy-on-Write, never updates data_new.
categorical_cols = data_new.select_dtypes(include=[object]).columns
for col in categorical_cols:
    modes = data_new[col].mode()
    if modes.empty:
        # Column is entirely missing, so there is no mode to impute with
        continue
    data_new[col] = data_new[col].fillna(modes.iloc[0])

In [None]:
# Replace missing numeric values with each column's median.
# The filled Series is assigned back to the column: calling
# data_new[col].fillna(..., inplace=True) acts on a temporary Series and,
# under pandas Copy-on-Write, never updates data_new.
# NOTE(review): medians are computed on the full frame before the train/test
# split, and numeric_cols presumably includes the target `price` - confirm
# both are acceptable for this analysis.
numeric_cols = data_new.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    data_new[col] = data_new[col].fillna(data_new[col].median())

In [None]:
# Recount the missing entries per column to verify the imputation
missing_counts = data_new.isna().sum()
print("Missing values in each column:", missing_counts, sep="\n")

In [None]:
# "id" is only a row identifier and "city" is constant after the Warsaw
# filter, so neither carries predictive information. Reassign instead of
# dropping in place, because data_new originates from a filtered slice.
data_new = data_new.drop(columns=["id", "city"])

I removed the "id" and "city" columns from the dataset since they don't contribute valuable information for building machine learning models. "id" is a unique identifier that does not help in predicting outcomes or understanding patterns in the data, and "city" has a single value ("warszawa") after the filtering step above.

## Handling categorical data ##

There are categorical variables, so I need to encode these as numbers.

In [None]:
# Encoding categorical variables
# LabelEncoder maps each distinct value to an integer code in 0..n_classes-1.
le = LabelEncoder()

In [None]:
# Integer-encode every remaining object column. Each column gets its own
# fitted encoder, stored in label_encoders, so the codes can be mapped back
# later with label_encoders[column].inverse_transform(...). Refitting a single
# shared encoder would keep only the last column's mapping.
label_encoders = {}
for column in data_new.select_dtypes(include=object).columns.tolist():
    encoder = LabelEncoder()
    data_new[column] = encoder.fit_transform(data_new[column])
    label_encoders[column] = encoder

In [None]:
# Confirm the dtypes once the object columns have been label-encoded
data_new.info()

## Visualising Variables ##

In [None]:
# Rank every column by its correlation with price (strongest positive first)
# and draw the ranking as a single-column annotated heatmap
price_corr = data_new.corr()[['price']]
top_features = price_corr.sort_values(by=['price'], ascending=False).head(30)
plt.figure(figsize=(5, 10))
sns.heatmap(
    top_features,
    vmin=-1,
    annot=True,
    annot_kws={"size": 16},
    cmap='rainbow',
)

In [None]:
# Plot the cleaned Warsaw subset (data_new) so the chart describes the same
# rows as the correlation heatmap and the models, not the unfiltered `data`.
sns.scatterplot(x='square_meters', y='price', data=data_new)

In [None]:
# Plot the cleaned Warsaw subset (data_new) so the chart describes the same
# rows as the correlation heatmap and the models, not the unfiltered `data`.
sns.scatterplot(x='rooms', y='price', data=data_new)

In [None]:
#data_new.drop(columns=["longitude", "floor_count", "build_year", "centre_dist", "clinic_dist", "post_o_dist", "restaurant_dist", "college_dist", "has_storage", "ownership"], inplace = True)

In [None]:
# Preview the cleaned, encoded frame that the features and target are taken from
data_new.head()

## Target variable ##

I’m going to train models to predict the apartment rental price based on the above features.

In [None]:
# Separate the predictors from the target variable
target_col = "price"
X = data_new.drop(columns=[target_col])  # every remaining column is a feature
y = data_new[target_col]

In [None]:
# Split into train and test sets: 10% of the rows train the models and the
# remaining 90% are held out for evaluation.
# NOTE(review): confirm train_size (rather than test_size) is intended here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.1,
    random_state=42,
)

In [None]:
# Report the dimensions of each split
for split_label, split in (
    ("x_train", X_train),
    ("x_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_label} shape: ", split.shape)

In [None]:
# Learn each feature's min/max from the training split only, then apply that
# same scaling to both splits
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

In [None]:
# Candidate regressors, keyed by the label shown in the results table
lin_reg = LinearRegression()
rand_forest = RandomForestRegressor(random_state=42, n_estimators=100)
models = dict([("Linear Regression", lin_reg), ("Random Forest", rand_forest)])

In [None]:
# Fit every candidate on the training split, score it on the held-out split,
# and collect one row of metrics per model
results_1 = []
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)

    abs_err = mean_absolute_error(y_test, predicted)
    sq_err = mean_squared_error(y_test, predicted)
    root_sq_err = np.sqrt(sq_err)  # RMSE is in the same unit as the price
    fit_quality = r2_score(y_test, predicted)

    results_1.append([model_name, abs_err, sq_err, root_sq_err, fit_quality])

# Tabulate the metrics for display
metric_columns = ["Model", "MAE", "MSE", "RMSE", "R2 Score"]
results_1_data = pd.DataFrame(results_1, columns=metric_columns)
print(results_1_data)

In [None]:
# Split into train and test sets: 15% of the rows train the models and the
# remaining 85% are held out for evaluation.
# NOTE(review): confirm train_size (rather than test_size) is intended here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.15,
    random_state=42,
)

In [None]:
# Report the dimensions of each split
for split_label, split in (
    ("x_train", X_train),
    ("x_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_label} shape: ", split.shape)

In [None]:
# Learn each feature's min/max from the training split only, then apply that
# same scaling to both splits
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fresh, unfitted candidate regressors, keyed by the label shown in the results table
lin_reg = LinearRegression()
rand_forest = RandomForestRegressor(random_state=42, n_estimators=100)
models = dict([("Linear Regression", lin_reg), ("Random Forest", rand_forest)])

In [None]:
# Fit every candidate on the training split, score it on the held-out split,
# and collect one row of metrics per model
results_2 = []
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)

    abs_err = mean_absolute_error(y_test, predicted)
    sq_err = mean_squared_error(y_test, predicted)
    root_sq_err = np.sqrt(sq_err)  # RMSE is in the same unit as the price
    fit_quality = r2_score(y_test, predicted)

    results_2.append([model_name, abs_err, sq_err, root_sq_err, fit_quality])

# Tabulate the metrics for display
metric_columns = ["Model", "MAE", "MSE", "RMSE", "R2 Score"]
results_2_data = pd.DataFrame(results_2, columns=metric_columns)
print(results_2_data)

In [None]:
# Split into train and test sets: 25% of the rows train the models and the
# remaining 75% are held out for evaluation.
# NOTE(review): confirm train_size (rather than test_size) is intended here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.25,
    random_state=42,
)

In [None]:
# Report the dimensions of each split
for split_label, split in (
    ("x_train", X_train),
    ("x_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_label} shape: ", split.shape)

In [None]:
# Learn each feature's min/max from the training split only, then apply that
# same scaling to both splits
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fresh, unfitted candidate regressors, keyed by the label shown in the results table
lin_reg = LinearRegression()
rand_forest = RandomForestRegressor(random_state=42, n_estimators=100)
models = dict([("Linear Regression", lin_reg), ("Random Forest", rand_forest)])

In [None]:
# Fit every candidate on the training split, score it on the held-out split,
# and collect one row of metrics per model
results_3 = []
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)

    abs_err = mean_absolute_error(y_test, predicted)
    sq_err = mean_squared_error(y_test, predicted)
    root_sq_err = np.sqrt(sq_err)  # RMSE is in the same unit as the price
    fit_quality = r2_score(y_test, predicted)

    results_3.append([model_name, abs_err, sq_err, root_sq_err, fit_quality])

# Tabulate the metrics for display
metric_columns = ["Model", "MAE", "MSE", "RMSE", "R2 Score"]
results_3_data = pd.DataFrame(results_3, columns=metric_columns)
print(results_3_data)

In [None]:
# Collect the cross-validated score of each model.
# Note: this reuses the names results_3 / results_3_data, replacing the
# metrics from the 25% training split, and scores on X as built from
# data_new (the MinMax scaling was applied to X_train / X_test only).
results_3 = []

for model_name, estimator in models.items():
    # 5-fold cross-validation on the whole dataset, scored with R²
    fold_scores = cross_val_score(estimator, X, y, scoring='r2', cv=5)
    average_r2 = fold_scores.mean()

    # Leave the estimator fitted on the full dataset
    estimator.fit(X, y)

    results_3.append([model_name, average_r2])

# Tabulate the scores for display
results_3_data = pd.DataFrame(results_3, columns=["Model", "Mean CV R² Score"])
print("\nModel Performance Metrics using Cross-Validation for R² Score:")
print(results_3_data)

In [None]:
# Create a DataFrame to display the results
results_3_data = pd.DataFrame(results_3, columns=["Model", "Mean CV R² Score"])
print("\nModel Performance Metrics using Cross-Validation for R² Score:")
print(results_3_data)

# To demonstrate predictions and plot results, let's still split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Refit each model on the training split and keep its test-set predictions
predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions[name] = model.predict(X_test)

# Actual vs. predicted price, one panel per model stacked vertically.
# The grid is sized from the number of models, so no empty panel is left over;
# squeeze=False keeps `axes` two-dimensional even for a single model.
price_min, price_max = y_test.min(), y_test.max()
fig, axes = plt.subplots(len(predictions), 1, figsize=(6, 10), squeeze=False)

for ax, (name, y_pred) in zip(axes.ravel(), predictions.items()):
    ax.scatter(y_test, y_pred, alpha=0.7)
    # Dashed diagonal marks a perfect prediction (predicted == actual)
    ax.plot([price_min, price_max], [price_min, price_max], 'k--', lw=2)
    ax.set_title(f'{name} Predictions')
    ax.set_xlabel('Actual price')
    ax.set_ylabel('Predicted price')
    # Both axes share the range of the actual prices
    ax.set_xlim(price_min, price_max)
    ax.set_ylim(price_min, price_max)

fig.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import Lasso  # Lasso for regularization
from sklearn.model_selection import GridSearchCV

# Define features and target variable
X = data_new.drop(columns='price')  # Features: every remaining apartment attribute
y = data_new['price']  # Target: offer price in Polish Zloty

# Models
# The "Linear Regression" entry is an L1-regularised Lasso. Its penalty depends
# on the scale of each feature, so it is wrapped in a Pipeline that min-max
# scales the features first. GridSearchCV refits the whole pipeline per fold,
# so the scaler only ever sees that fold's training rows.
models = {
    "Linear Regression": Pipeline([("scaler", MinMaxScaler()), ("lasso", Lasso())]),
    "Random Forest": RandomForestRegressor(random_state=42)
}

# Set up parameter grids for each model
# (pipeline parameters are addressed as <step name>__<parameter>)
param_grid = {
    'Linear Regression': {'lasso__alpha': [0.01, 0.1, 1.0, 10.0]},  # Lasso regularization strength
    'Random Forest': {'n_estimators': [50, 100, 200]}  # Number of trees in the forest
}

# Store results
results = []

# Grid Search for hyperparameter tuning
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='r2')  # 5-fold CV for R²
    grid_search.fit(X, y)

    # Best parameters and best mean cross-validated R² for this model
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Store results
    results.append([name, best_params, best_score])

# Create a DataFrame to display the results
results_df = pd.DataFrame(results, columns=["Model", "Best Parameters", "Best CV R² Score"])
print("\nBest Hyperparameters and Scores from Grid Search:")
print(results_df)