In [1]:
import copy
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from plotly.offline import iplot
from plotly.subplots import make_subplots
from datetime import datetime
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from scipy.stats import pearsonr, spearmanr
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [2]:
# Load the raw price-paid export. dtype=object disables type inference, so
# every column (including PRICE) starts out as strings and is converted
# explicitly in later cells.
df = pd.read_csv(
    'LeedsHousePrices1996-2015 copy.csv',
    dtype=object,
    encoding='latin-1',
)

In [3]:
# Number of missing entries in each column.
df.isna().sum()

ID                       0
PRICE                    0
DATE OF TRANSFER         0
POSTCODE               220
PROPERTY TYPE            0
OLD/NEW                  0
DURATION                 0
PAON                   483
SAON                254392
STREET                 137
LOCALITY             59279
TOWN/CITY                0
DISTRICT                 0
COUNTY                   0
STATUS                   0
dtype: int64

In [4]:
# Remove the identifier and address-level columns in a single pass.
# One non-mutating drop replaces seven in-place calls (each of which also
# passed a redundant axis=1 next to columns=).
df = df.drop(
    columns=["ID", "PAON", "SAON", "STATUS", "STREET", "POSTCODE", "LOCALITY"]
)

In [5]:
# Parse the day/month/year strings once, then derive the calendar features
# from the parsed values.
transfer_dates = pd.to_datetime(df['DATE OF TRANSFER'], format='%d/%m/%Y')

df['DATE OF TRANSFER'] = transfer_dates
df['Year'] = transfer_dates.dt.year
df['Month'] = transfer_dates.dt.month

In [6]:
# Number of non-missing entries in each remaining column.
df.notna().sum()

PRICE               273692
DATE OF TRANSFER    273692
PROPERTY TYPE       273692
OLD/NEW             273692
DURATION            273692
TOWN/CITY           273692
DISTRICT            273692
COUNTY              273692
Year                273692
Month               273692
dtype: int64

In [7]:
# The raw date is no longer needed once Year and Month have been extracted.
df = df.drop(columns=['DATE OF TRANSFER'])

In [8]:
# Property-type codes (kept for reference; not used by the encoding below).
PROPERTY_TYPE = ('D','S','F','T')

# One-hot encode PROPERTY TYPE. get_dummies already drops the source column
# and appends the indicator columns, so no separate join/drop step is needed.
# The prefix keeps its trailing underscore so the resulting column names stay
# "Property_Type__D", "Property_Type__F", ... as before.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (newer pandas would otherwise default to bool).
df = pd.get_dummies(
    df,
    columns=["PROPERTY TYPE"],
    prefix=["Property_Type_"],
    dtype=np.uint8,
)

df

Unnamed: 0,PRICE,OLD/NEW,DURATION,TOWN/CITY,DISTRICT,COUNTY,Year,Month,Property_Type__D,Property_Type__F,Property_Type__S,Property_Type__T
0,"£48,950",N,F,BRADFORD,LEEDS,WEST YORKSHIRE,1995,1,0,0,1,0
1,"£95,950",Y,F,LEEDS,LEEDS,WEST YORKSHIRE,1995,1,1,0,0,0
2,"£58,000",N,F,WETHERBY,LEEDS,WEST YORKSHIRE,1995,1,0,0,1,0
3,"£83,950",Y,F,LEEDS,LEEDS,WEST YORKSHIRE,1995,1,1,0,0,0
4,"£332,500",N,F,LEEDS,LEEDS,WEST YORKSHIRE,1995,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
273687,90000,N,F,LEEDS,LEEDS,WEST YORKSHIRE,2015,8,0,0,0,1
273688,130000,N,F,LEEDS,LEEDS,WEST YORKSHIRE,2015,8,0,0,1,0
273689,220000,N,F,WETHERBY,LEEDS,WEST YORKSHIRE,2015,8,0,0,1,0
273690,174000,N,F,LEEDS,LEEDS,WEST YORKSHIRE,2015,8,0,0,0,1


In [9]:
# Map the OLD/NEW flag to integer codes (classes are numbered in sorted order).
encoder = LabelEncoder()
df['OLD/NEW'] = encoder.fit_transform(df['OLD/NEW'])

In [10]:
# Map the DURATION codes to integers with a fresh encoder; `encoder` now
# refers to the DURATION mapping, not the OLD/NEW one.
encoder = LabelEncoder()
df['DURATION'] = encoder.fit_transform(df['DURATION'])

In [11]:
# Replace each town/city name with an integer code assigned in order of first
# appearance, stored as float32.
town_codes, _ = pd.factorize(df['TOWN/CITY'])
df['TOWN/CITY'] = town_codes.astype('float32')

In [12]:
# PRICE was read as text and mixes "£48,950"-style values with plain digits.
# Strip the currency symbol and thousands separators, then convert to numbers.
# The dtype guard makes re-running this cell a no-op once PRICE is numeric.
price_is_numeric = pd.api.types.is_numeric_dtype(df['PRICE'])
if not price_is_numeric:
    without_symbol = df['PRICE'].str.replace('£', '')
    df['PRICE'] = without_symbol.str.replace(',', '')
    df['PRICE'] = pd.to_numeric(df['PRICE'])

In [13]:
# Drop DISTRICT and COUNTY in one non-mutating call (replaces two in-place
# drops that each passed a redundant axis=1 next to columns=).
df = df.drop(columns=["DISTRICT", "COUNTY"])

In [14]:
# Replace PRICE with its natural logarithm. From here on the column holds
# log-prices, so re-running this cell would apply the log a second time.
log_transform_price = df["PRICE"].pipe(np.log)
df['PRICE'] = log_transform_price

In [15]:
# Preview the first five rows of the fully encoded frame.
df.head(5)

Unnamed: 0,PRICE,OLD/NEW,DURATION,TOWN/CITY,Year,Month,Property_Type__D,Property_Type__F,Property_Type__S,Property_Type__T
0,10.798555,0,0,0.0,1995,1,0,0,1,0
1,11.471583,1,0,1.0,1995,1,1,0,0,0
2,10.968198,0,0,2.0,1995,1,0,0,1,0
3,11.337977,1,0,1.0,1995,1,1,0,0,0
4,12.714395,0,0,1.0,1995,1,1,0,0,0


In [16]:
# Split complete rows first, then separate features from the target.
# Previously X was the whole frame, so PRICE (the target) was also present in
# X_train / X_test -- any model fitted on them would see the answer.
# Splitting `df` with the same test_size and random_state selects the same
# rows as splitting (X, y) did, so the train/test membership is unchanged.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

X = df.drop(columns='PRICE')     # Features only
y = df['PRICE']                  # Target (log-price)

X_train, y_train = train_df.drop(columns='PRICE'), train_df['PRICE']
X_test, y_test = test_df.drop(columns='PRICE'), test_df['PRICE']

# Save the training data (features + target, same rows and columns as before)
train_df.to_csv('trainn_data.csv', index=False)

# Save the test data (features + target, same rows and columns as before)
test_df.to_csv('testt_data.csv', index=False)

print("Training and test data saved successfully!")

Training and test data saved successfully!
