In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Load the dataset
url = 'https://raw.githubusercontent.com/datasets/house-prices-uk/master/data/data.csv'
df = pd.read_csv(url)
df.head(10)

Unnamed: 0,Date,Price (All),Change (All),Price (New),Change (New),Price (Modern),Change (Modern),Price (Older),Change (Older)
0,1952-11-01,1891,0.0,2107,0.0,2020,0.0,1524,0.0
1,1953-02-01,1891,0.0,2107,0.0,2002,0.0,1542,0.0
2,1953-05-01,1891,0.0,2107,0.0,2002,0.0,1542,0.0
3,1953-08-01,1881,0.0,2117,0.0,2002,0.0,1524,0.0
4,1953-11-01,1872,-1.0,2117,0.5,1975,-2.2,1542,1.2
5,1954-02-01,1863,-1.5,2117,0.5,1957,-2.2,1524,-1.2
6,1954-05-01,1872,-1.0,2117,0.5,1984,-0.9,1515,-1.7
7,1954-08-01,1863,-1.0,2127,0.5,1948,-2.7,1524,0.0
8,1954-11-01,1853,-1.0,2127,0.5,1939,-1.8,1515,-1.7
9,1955-02-01,1900,2.0,2167,2.4,1984,1.4,1569,2.9


In [12]:
df.columns

Index(['Date', 'Price (All)', 'Change (All)', 'Price (New)', 'Change (New)',
       'Price (Modern)', 'Change (Modern)', 'Price (Older)', 'Change (Older)'],
      dtype='object')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             261 non-null    object 
 1   Price (All)      261 non-null    int64  
 2   Change (All)     261 non-null    float64
 3   Price (New)      261 non-null    int64  
 4   Change (New)     261 non-null    float64
 5   Price (Modern)   261 non-null    int64  
 6   Change (Modern)  261 non-null    float64
 7   Price (Older)    261 non-null    int64  
 8   Change (Older)   261 non-null    float64
dtypes: float64(4), int64(4), object(1)
memory usage: 18.5+ KB


In [15]:
# Feature engineering
# Extract year and month from the 'Date' column
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month

In [16]:
# Feature scaling
scaler = StandardScaler()
df[['Price (All)', 'Change (All)', 'Price (New)', 'Change (New)',
    'Price (Modern)', 'Change (Modern)', 'Price (Older)', 'Change (Older)',
    'year', 'month']] = scaler.fit_transform(df[['Price (All)', 'Change (All)', 'Price (New)', 'Change (New)',
                                                 'Price (Modern)', 'Change (Modern)', 'Price (Older)', 'Change (Older)',
                                                 'year', 'month']])


In [17]:
# Dimensionality reduction
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df[['Price (All)', 'Change (All)', 'Price (New)', 'Change (New)',
                                   'Price (Modern)', 'Change (Modern)', 'Price (Older)', 'Change (Older)',
                                   'year', 'month']])


In [18]:
# Add PCA components to the dataframe
df['pca1'] = pca_result[:, 0]
df['pca2'] = pca_result[:, 1]


In [19]:
# Data splitting
# Split the data into training and testing sets
X = df.drop(['Date', 'Price (All)', 'Change (All)', 'Price (New)', 'Change (New)',
             'Price (Modern)', 'Change (Modern)', 'Price (Older)', 'Change (Older)',
             'year', 'month'], axis=1)
y = df['Price (All)']  # Assuming 'Price (All)' is the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
# Model training
# Train a random forest regressor to predict house prices
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [21]:
# Model evaluation
# Predict house prices on the test set
y_pred = rf_model.predict(X_test)


In [22]:
# Calculate mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 0.011726343732217056
R-squared: 0.985976452955882
