In [20]:
## Importing libraries
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [21]:
## Data Cleaning
# Read the UTF-16 encoded CSV and wrap the result again under the working name `df`.
sample_data = pd.read_csv('../SODAS2023-2/EXAM/data/cleaned_data.csv', encoding='utf-16')
df = pd.DataFrame(sample_data)

# Express the construction year as an age relative to 2023 and add its square,
# so that OLS can pick up a non-linear age effect.
construction_age = 2023 - df['year']
df['age'] = construction_age
df['age_squared'] = construction_age ** 2

In [27]:
# Encode the leading digit of "area_code" as one indicator column per digit
# (area_digit_<d>) and replace "area_code" by those indicators.
#
# The guard keeps the cell re-runnable: once "area_code" has been replaced,
# executing the cell again is a no-op instead of raising a KeyError.
#
# NOTE(review): every digit level gets its own column, so the indicators sum
# to one in each row and are perfectly collinear with a model intercept.
# Consider drop_first=True if that matters for the models fitted on this frame.
if 'area_code' in df.columns:
    # Step 1: Leading character of the code, read as text
    first_digit = df['area_code'].astype(str).str[0]

    # Step 2: One indicator column per distinct leading digit
    dummies = pd.get_dummies(first_digit, prefix='area_digit')

    # Step 3: Swap "area_code" for the indicators without mutating the
    # existing frame in place (no inplace=True, no temporary column)
    df = pd.concat([df.drop(columns=['area_code']), dummies], axis=1)



In [28]:
# Show the working frame (the rendered output is truncated to the first and last rows/columns)
df


Unnamed: 0,price,address,city,type,saledays,energy,living_space,ground_space,rooms,owner_expenses,...,age_squared,area_digit_1,area_digit_2,area_digit_3,area_digit_4,area_digit_5,area_digit_6,area_digit_7,area_digit_8,area_digit_9
0,1275000,Strandparken 46,Vest- og Sydsjælland,Holiday home,202.0,Energimærke D,170.0,1019.0,2.0,2.598,...,2025.0,0,0,0,1,0,0,0,0,0
1,5095000,"Kalkbrænderihavnsgade 4A, 1. tv.",Byen København,Apartment,0.0,Energimærke A,94.0,0.0,3.0,2.389,...,36.0,0,1,0,0,0,0,0,0,0
2,14750000,Niels Andersens Vej 56,Københavns omegn,Villa,255.0,Energimærke E,248.0,984.0,8.0,8.643,...,6724.0,0,1,0,0,0,0,0,0,0
3,4498000,"Helga Pedersens Gade 1, 2. 3.",Østjylland,Apartment,7.0,Energimærke B,107.0,0.0,3.0,3.849,...,81.0,0,0,0,0,0,0,0,1,0
4,2850000,Nøddevænget 20,Sydjylland,Villa,0.0,Energimærke C,163.0,858.0,5.0,3.343,...,3364.0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3645,1750000,"Prins Haralds Allé 101, 1.",Fyn,Villa,48.0,Energimærke G,109.0,506.0,3.0,1.922,...,7744.0,0,0,0,0,1,0,0,0,0
3646,990000,Nissumvej 4,Vestjylland,Holiday home,48.0,Energimærke C,117.0,800.0,3.0,1.566,...,6561.0,0,0,0,0,0,0,1,0,0
3647,1695000,Baunetoften 12,Nordsjælland,Terraced house,240.0,Energimærke C,86.0,103.0,4.0,2.184,...,1089.0,0,0,1,0,0,0,0,0,0
3648,650000,Aalevej 40,Østjylland,Villa,48.0,Energimærke D,79.0,829.0,3.0,1.464,...,4624.0,0,0,0,0,0,0,1,0,0


In [29]:
## Dataset split
# Target y is the price; features X are all remaining columns except the ones listed here.
columns_to_drop = ['price','address','city','type','energy', 'year','area_name']
y = df['price']
X = df.drop(columns=columns_to_drop)
test_size = 0.30 # We have a relatively small dataset
seed = 17082023

# Hold out 30% of all rows as TEST data; the other 70% is DEVELOPMENT data.
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)

# Split DEVELOPMENT data with the same fraction: 70% TRAIN / 30% VALIDATION,
# i.e. roughly 49% / 21% of all rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=test_size, random_state=seed
)



In [30]:
# Compare polynomial OLS specifications by their RMSE on the validation split.
degrees = [1, 2, 3]

# degree -> {'pipeline': fitted pipeline, 'rmse_val': validation RMSE}
pipeline_results = {}

for degree in degrees:
    # Polynomial expansion -> standardisation -> ordinary least squares
    pipe_ols = make_pipeline(
        PolynomialFeatures(degree=degree, include_bias=False),
        StandardScaler(),
        LinearRegression()
    )

    # Fit on the training split only
    pipe_ols.fit(X_train, y_train)

    # Predict the validation split
    y_val_pred = pipe_ols.predict(X_val)

    # RMSE = sqrt(MSE). Taking the root explicitly instead of passing
    # squared=False keeps this working on scikit-learn releases where that
    # keyword is deprecated or no longer accepted.
    rmse_val = np.sqrt(mse(y_val, y_val_pred))

    pipeline_results[degree] = {'pipeline': pipe_ols, 'rmse_val': rmse_val}

# NOTE(review): the recorded output shows the RMSE jumping by many orders of
# magnitude from degree 1 to degrees 2 and 3. Presumably unregularised OLS on
# the expanded feature set is numerically unstable here; a regularised
# model may be worth trying (Lasso is already imported) -- TODO confirm.
for degree, result in pipeline_results.items():
    print(f"Degree {degree}: RMSE on validation set = {result['rmse_val']}")

Degree 1: RMSE on validation set = 2368467.6377264014
Degree 2: RMSE on validation set = 3.8195598451406003e+18
Degree 3: RMSE on validation set = 6.082050077611906e+20


In [31]:
# Polynomial degrees to compare
degrees = [1, 2, 3]

# degree -> {'pipeline': pipeline passed to cross_val_score, 'avg_rmse': mean RMSE over the folds}
cross_val_results = {}

for degree in degrees:
    # Polynomial expansion -> standardisation -> ordinary least squares
    steps = [
        PolynomialFeatures(degree=degree, include_bias=False),
        StandardScaler(),
        LinearRegression(),
    ]
    pipe_ols = make_pipeline(*steps)

    # 5-fold cross-validation on the training split; the scorer reports negated MSE per fold
    scores = cross_val_score(
        pipe_ols, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )

    # Flip the sign and take the root per fold, then average over the folds
    rmse_scores = np.sqrt(np.negative(scores))
    avg_rmse = rmse_scores.mean()

    cross_val_results[degree] = dict(pipeline=pipe_ols, avg_rmse=avg_rmse)

# Report the average RMSE for each degree
for degree in cross_val_results:
    result = cross_val_results[degree]
    print(f"Degree {degree}: Average RMSE from cross-validation = {result['avg_rmse']}")

Degree 1: Average RMSE from cross-validation = 2753874.0461314013
Degree 2: Average RMSE from cross-validation = 2.0041895389166072e+18
Degree 3: Average RMSE from cross-validation = 8.04866680598042e+20


In [25]:
# Polynomial degrees to compare
degrees = [1, 2, 3]

# degree -> {'pipeline': pipeline passed to cross_val_score, 'avg_rmse': mean RMSE over the folds}
cross_val_results = {}

# Five shuffled folds; random_state=seed fixes the fold assignment
kf = KFold(n_splits=5, shuffle=True, random_state=seed)  # You can adjust the number of splits


def _polynomial_ols(degree):
    """Build a PolynomialFeatures -> StandardScaler -> LinearRegression pipeline."""
    return make_pipeline(
        PolynomialFeatures(degree=degree, include_bias=False),
        StandardScaler(),
        LinearRegression(),
    )


for degree in degrees:
    pipe_ols = _polynomial_ols(degree)

    # The scorer reports negated MSE for each fold of `kf`
    scores = cross_val_score(
        pipe_ols, X_train, y_train, cv=kf, scoring='neg_mean_squared_error'
    )

    # Per-fold RMSE, then the mean over the folds
    rmse_scores = np.sqrt(-scores)
    avg_rmse = rmse_scores.mean()

    cross_val_results[degree] = {'pipeline': pipe_ols, 'avg_rmse': avg_rmse}

# Report the average RMSE for each degree
for degree in cross_val_results:
    result = cross_val_results[degree]
    print(f"Degree {degree}: Average RMSE from cross-validation = {result['avg_rmse']}")

Degree 1: Average RMSE from cross-validation = 2969123.5413702903
Degree 2: Average RMSE from cross-validation = 3.6996692657417736e+16
Degree 3: Average RMSE from cross-validation = 1.8950484719779638e+21
