# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
from keras.losses import MeanAbsoluteError
from keras.callbacks import EarlyStopping
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error

%matplotlib inline




In [None]:
# Load the raw housing listings from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='nigeria_houses_data.csv')

# Data Cleaning and Preprocessing

In [None]:
# Summary statistics of the dataset. Only the last expression of a cell is
# rendered automatically, so the describe() table is displayed explicitly.
display(data.describe())

# Column dtypes and non-null counts (info() prints its report itself)
data.info()

In [None]:
# Count the missing values in every column

data.isna().sum()

In [None]:
# Preview the first rows of the loaded dataset
data.head()

In [None]:
# Cast the count-like columns and the price to 'int64'

int_cols = ['bedrooms', 'bathrooms', 'toilets', 'parking_space', 'price']
data[int_cols] = data[int_cols].astype('int64')

In [None]:
# Preview the first rows again after the dtype conversion above
data.head()

In [None]:
# List the distinct values found in the 'title' column

data['title'].unique()

In [None]:
# Count the distinct values found in the 'town' column

len(data['town'].unique())

In [None]:
# (rows, columns) of the full dataset
data.shape

In [None]:
# Keep only the listings whose state is Lagos
is_lagos = data['state'] == 'Lagos'
df = data[is_lagos]

# Names of the categorical (object-dtype) columns
cat_cols = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [None]:
# Show which columns were detected as categorical
cat_cols

In [None]:
# One-hot encode the categorical columns; the other columns are carried over unchanged
df_1 = pd.get_dummies(data=df, columns=cat_cols)

In [None]:
# (rows, columns) after one-hot encoding
df_1.shape

In [None]:
# Preview the encoded frame
df_1.head()

In [None]:
# Separate the target (price) from the model inputs

target = df_1['price']

# 'state_Lagos' is dropped together with the target column
features = df_1.drop(columns=['price', 'state_Lagos'])
features.shape

In [None]:
# Splitting the dataset into training, validation and testing sets

# Hold out 20% of the rows as the final test set
X_train_full, X_test, y_train_full, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0)

# Carve the validation set (20% of the remaining rows) out of the training
# data, so that no validation row is also used for fitting. Splitting from
# the *_full variables keeps this cell idempotent when it is re-run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=0)

In [None]:

# Standardise the input features: the scaler is fitted on the training split
# and then applied to the training, testing and validation splits
scaler_X = StandardScaler()
scaler_X.fit(X_train)

X_train_scaled, X_test_scaled, X_val_scaled = (
    scaler_X.transform(split) for split in (X_train, X_test, X_val)
)

# Standardise the target in the same way: fitted on the training target only.
# StandardScaler expects 2-D input, hence the reshape to a single column and
# the ravel back to 1-D afterwards.
scaler_y = StandardScaler()
scaler_y.fit(y_train.values.reshape(-1, 1))

y_train_scaled, y_val_scaled, y_test_scaled = (
    scaler_y.transform(part.values.reshape(-1, 1)).ravel()
    for part in (y_train, y_val, y_test)
)

# Visualization

In [None]:
# Sort the Lagos subset (`df`) by 'price' in descending order and keep the top 10,
# so the chart matches its "in Lagos" title
data_sorted = df.sort_values(by='price', ascending=False).head(10)

# Extracting 'town' and 'price' columns
town = data_sorted['town']
price = data_sorted['price']

# One bar per listing: plot against row positions and label the ticks with the
# town afterwards, so listings that share a town name are not drawn on top of
# each other on a single categorical tick
positions = np.arange(len(data_sorted))

# Horizontal bar chart of the most expensive houses in Lagos and their locations
plt.barh(positions, price)
plt.yticks(positions, town)
plt.xlabel('Price')
plt.ylabel('Town')
plt.title('Most Expensive Houses in Lagos')
plt.gca().invert_yaxis()  # most expensive listing at the top
plt.show()

In [None]:
# Use the Seaborn 'whitegrid' style
sns.set(style='whitegrid')

# Price against house type, with the number of bedrooms shown as colour
plt.figure(figsize=(18, 8))
ax = plt.gca()
sns.scatterplot(data=data, x='title', y='price', hue='bedrooms',
                palette='inferno', s=100, ax=ax)
ax.set_title('The relationship between type of house, Nr. of bedrooms and the price')
plt.show()

In [None]:
# Price against the number of bedrooms (left) and bathrooms (right).
# No `palette` is passed: without a `hue` variable seaborn ignores it and
# emits a warning.
# NOTE(review): these plots use the full `data` frame, not the Lagos subset `df`.
sns.set(style='whitegrid')
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.scatterplot(x='bedrooms', y='price', data=data, s=100)
plt.title('The relationship between Nr. of bedrooms and the price')

plt.subplot(1, 2, 2)
sns.scatterplot(x='bathrooms', y='price', data=data, s=100)
plt.title('The relationship between Nr. of bathrooms and the price')
plt.show()

In [None]:
# Price against the number of toilets (left) and parking spaces (right).
# No `palette` is passed: without a `hue` variable seaborn ignores it and
# emits a warning.
# NOTE(review): these plots use the full `data` frame, not the Lagos subset `df`.
sns.set(style='whitegrid')
plt.figure(figsize=(18, 8))

plt.subplot(1, 2, 1)
sns.scatterplot(x='toilets', y='price', data=data, s=100)
plt.title('The relationship between Nr. of toilets and the price')

plt.subplot(1, 2, 2)
sns.scatterplot(x='parking_space', y='price', data=data, s=100)
plt.title('The relationship between Nr. of parking spaces and the price')
plt.show()

# Summary of the Visualization

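Looking at the plots, the price of houses in Lagos State does not seem to be affected by the number of bedrooms, bathrooms, toilets, or parking spaces, but mostly by the location. Note that the scatter plots above are drawn from the full dataset (`data`), so this reading should be confirmed on the Lagos subset (`df`).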

# Training Model 1

In [None]:
# Feed-forward regression network: three ReLU hidden layers (128 -> 64 -> 16)
# followed by a single linear output unit
n_inputs = X_train_scaled.shape[1]

model = Sequential()
model.add(Dense(units=128, input_shape=(n_inputs,), activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(1, activation='linear'))

In [None]:
# Compile with the Adam optimiser and mean absolute error as the loss
optimizer = Adam(learning_rate=0.001)
mae_loss = MeanAbsoluteError()
model.compile(optimizer=optimizer, loss=mae_loss)

In [None]:
# Train on the scaled training data; the validation loss is recorded each epoch
model_history = model.fit(
    x=X_train_scaled,
    y=y_train_scaled,
    batch_size=56,
    epochs=250,
    verbose=0,
    validation_data=(X_val_scaled, y_val_scaled),
)

In [None]:
# Plot the training and validation loss recorded at every epoch

model_history_df = pd.DataFrame(model_history.history)

plt.figure(figsize=(12, 6))

# One curve per recorded loss series
for column, label in (('loss', 'Train'), ('val_loss', 'Validation')):
    plt.plot(model_history_df[column], label=label)

plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Predict the (scaled) target for the scaled test features
y_pred = model.predict(x=X_test_scaled)

In [None]:
# Map the predictions back through scaler_y's inverse transform, then flatten to 1-D
y_pred_2d = scaler_y.inverse_transform(y_pred)
y_pred_unscaled = y_pred_2d.flatten()

In [None]:
# Display the unscaled predictions cast to int64 (the cast result is not stored)
y_pred_unscaled.astype('int64')

In [None]:
# First three true test prices, for a quick comparison with the predictions
y_test.iloc[:3]

# Model Evaluation

In [None]:
# Coefficient of determination (R^2) of the unscaled predictions on the test set
eval_r2 = r2_score(y_true=y_test, y_pred=y_pred_unscaled)
eval_r2

# Model 2

In [None]:
# Model 2 starts from the un-encoded Lagos frame: inputs are all columns
# except the target and 'state'
X = df.drop(columns=['price', 'state'])
y = df['price']

In [None]:
# 80% of the rows go to training; the held-out 20% is split again below

X_train_2, X_test_all, y_train_2, y_test_all = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Halve the held-out rows into a test part and a validation part

X_test_2, X_val_2, y_test_2, y_val_2 = train_test_split(
    X_test_all, y_test_all, test_size=0.5, random_state=42
)

In [None]:

# Inspect the column dtypes and preview the training inputs.
# (The original `X_train_2[]` was a syntax error.)
print(X.dtypes)
X_train_2.head()

In [None]:
# Column names of the numerical (int64) and categorical (object) features

num_features = X.select_dtypes(include='int64').columns
cat_features = X.select_dtypes(include='object').columns

In [None]:
# Preprocessing steps for each feature type

# Numerical columns: scale to unit variance without centring
numerical_transformer = StandardScaler(with_mean=False)

# Categorical columns: one-hot encode (categories unseen during fitting are
# ignored), then scale the encoded columns without centring
cat_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('scaler', StandardScaler(with_mean=False)),
]
cat_transformer = Pipeline(steps=cat_steps)

In [None]:
# Route each group of columns to its transformer and combine the results

column_steps = [
    ('num', numerical_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Define the model as a pipeline: preprocessing followed by a random forest regressor

model_2 = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=250,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        # Must be the float 1.0 (consider all features at each split); the
        # string '1.0' is not an accepted value for max_features
        max_features=1.0,
        random_state=42))
])

In [None]:
# Fit the whole pipeline (preprocessing + forest) on the raw training split

model_2.fit(X=X_train_2, y=y_train_2)

In [None]:
# Scale the validation target to unit variance (no centring); the scaler is
# fitted on y_val_2 itself.
# NOTE(review): model_2 is fitted on the unscaled y_train_2, so values scaled
# here are not on the same scale as model_2's predictions.
scale_y_val = StandardScaler(with_mean=False)
y_val_column = y_val_2.values.reshape(-1, 1)
y_val_scl = scale_y_val.fit_transform(y_val_column).ravel()



In [None]:
def predict_prices(inputs):
    """Predict house prices for `inputs` with the fitted `model_2` pipeline.

    `model_2` is fitted on the raw (unscaled) `y_train_2`, so its predictions
    are already in price units. No inverse scaling is applied: doing so would
    multiply every prediction by the scaler's standard deviation.

    Parameters
    ----------
    inputs : rows holding the same feature columns `model_2` was fitted on.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1) with the predicted prices.
    """
    result = model_2.predict(inputs)
    # Keep the column shape that callers of this function already receive
    return result.reshape(-1, 1)

#making predictions
model_2_preds = predict_prices(X_val_2)
model_2_preds

In [None]:
# True validation prices, for comparison with the predictions
y_val_2

# Evaluating Model 2

In [None]:
# Mean squared error on the validation set, in squared price units.
# model_2 is fitted on the raw y_train_2, so its predictions are compared
# with the raw y_val_2 rather than with a rescaled copy of the target.
mse_val = mean_squared_error(y_val_2, model_2.predict(X_val_2))

In [None]:
# Display the validation mean squared error
mse_val