### Part 1

In [None]:
# 3.	Building a Linear Regression Model to Predict Real Estate Prices

# a.	Import Pandas and NumPy for data manipulation and numerical operations.
import pandas as pd
import numpy as np

# b.	Import Matplotlib and Seaborn for creating statistical and graphical visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from altair import X2Datum
from matplotlib.pyplot import ylabel
from rich.jupyter import display

# c. Read the Real_estate dataset provided into a DataFrame using Pandas. This dataset contains records of real estate price predictions based on various factors. Use realEstate as the variable name of the dataset
# d. Adjust the house price to current prices by multiplying the price column by 2
#
# Both steps are done in a single chain: load the CSV, then replace the price
# column with twice its value (assign overwrites the column in its existing position).
# NOTE(review): the task asks for the name `realEstate`; every cell in this
# notebook uses `real_state`, so that name is kept here.
real_state = pd.read_csv('data/Real_estate.csv').assign(
    **{'Y house price of unit area': lambda frame: frame['Y house price of unit area'] * 2}
)

In [None]:
# e. Display the first few records of the dataset to inspect its contents.
# The row count is spelled out; 5 is also what head() uses by default.
real_state.head(n=5)

In [None]:
# f. Display the information about the realEstate dataset, including data types, non-null counts, and memory usage, to understand its structure and contents.
# info() prints its report directly and returns None, so the cell shows only
# the printed summary (no Out[] value).
real_state.info()

In [None]:
# g. Find the minimum and maximum values for each column in the realEstate dataset to identify the range of data across different variables.
# Column-wise maximum (one value per column); the matching minimum is shown in the next cell.
real_state.max(axis=0)

In [None]:
# Column-wise minimum (one value per column), the counterpart of the maximum above.
real_state.min(axis=0)

In [None]:
# 4. Preparing the Data. In this section we will be identifying and removing irrelevant columns from the Real Estate Dataset

# a. Identify and remove the irrelevant column No from the dataset.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# once the column is already gone.
real_state = real_state.drop(columns='No', errors='ignore')

# b. Display the updated information about the dataset to confirm the removal of the column.
# info has to be *called*; the bare attribute only echoes the bound-method repr
# and never prints the column summary.
real_state.info()

### Part 2

In [None]:
# 5. Extract the X1 transaction date column as x1 and the Y house price of unit area column as y from the dataset.
x1 = real_state['X1 transaction date']
y = real_state['Y house price of unit area']

# 6. Create a scatter plot to visualize the relationship between x1 and y, with a line width of 1.
# The marker edge width required by the task is passed explicitly. `linewidth`
# is the keyword seaborn itself looks for before applying its own default edge width.
sns.scatterplot(x=x1, y=y, linewidth=1)

# 7.	Copy and paste visualization and describe what conclusions could be drawn from this visualization.


In [None]:
# 8. Extract the X2 house age column as x2 and the Y house price of unit area column as y from the dataset.
x2 = real_state['X2 house age']
# y is extracted in this cell as the step asks, so the plot does not depend on
# whichever cell last assigned `y` (Part 3 rebinds it to a NumPy array).
y = real_state['Y house price of unit area']

# 9. Create a scatter plot to visualize the relationship between x2 (house age) and y (house price of unit area), with a line width of 1.
# The marker edge width required by the task is passed explicitly. `linewidth`
# is the keyword seaborn itself looks for before applying its own default edge width.
sns.scatterplot(x=x2, y=y, linewidth=1)

# 10.	Copy and paste visualization and describe what conclusions could be drawn from this visualization.


In [None]:
# 11. Extract the X3 distance to the nearest MRT station column as x3 and the Y house price of the unit area column as y from the dataset.
x3 = real_state['X3 distance to the nearest MRT station']
# y is extracted in this cell as the step asks, so the plot does not depend on
# whichever cell last assigned `y` (Part 3 rebinds it to a NumPy array).
y = real_state['Y house price of unit area']

# 12. Create a scatter plot to visualize the relationship between x3 (distance to the nearest MRT station) and y (house price of unit area), with a line width of 1.
# `linewidth` (rather than the `lw` alias) is the keyword seaborn itself looks
# for before applying its own default edge width.
sns.scatterplot(x=x3, y=y, linewidth=1)

# 13.	Copy and paste visualization and describe what conclusions could be drawn from this visualization.


In [None]:
# 14. Extract the X4 number of convenience stores column as x4 and the Y house price of unit area column as y from the dataset.
x4 = real_state['X4 number of convenience stores']
# y is extracted in this cell as the step asks, so the plot does not depend on
# whichever cell last assigned `y` (Part 3 rebinds it to a NumPy array).
y = real_state['Y house price of unit area']

# 15. Create a scatter plot to visualize the relationship between x4 (number of convenience stores) and y (house price of unit area), with a line width of 1.
# `linewidth` (rather than the `lw` alias) is the keyword seaborn itself looks
# for before applying its own default edge width.
sns.scatterplot(x=x4, y=y, linewidth=1)

# 16.	Copy and paste visualization and describe what conclusions could be drawn from this visualization.


In [None]:
# 17. Extract the X5 latitude column as x5 and the Y house price of the unit area column as y from the dataset.
x5 = real_state['X5 latitude']
# y is extracted in this cell as the step asks, so the plot does not depend on
# whichever cell last assigned `y` (Part 3 rebinds it to a NumPy array).
y = real_state['Y house price of unit area']

# 18. Create a scatter plot to visualize the relationship between x5 (latitude) and y (house price of unit area), with a line width of 1.
# `linewidth` (rather than the `lw` alias) is the keyword seaborn itself looks
# for before applying its own default edge width.
sns.scatterplot(x=x5, y=y, linewidth=1)

# 19.	Copy and paste visualization and describe what conclusions could be drawn from this visualization.


In [None]:
# 20. Extract the X6 longitude column as x6 and the Y house price of unit area column as y from the dataset.
x6 = real_state['X6 longitude']
# y is extracted in this cell as the step asks, so the plot does not depend on
# whichever cell last assigned `y` (Part 3 rebinds it to a NumPy array).
y = real_state['Y house price of unit area']

# 21. Create a scatter plot to visualize the relationship between x6 (longitude) and y (house price of unit area), with a line width of 1.
# `linewidth` (rather than the `lw` alias) is the keyword seaborn itself looks
# for before applying its own default edge width.
sns.scatterplot(x=x6, y=y, linewidth=1)

# 22.	Copy and paste visualization and describe what conclusions could be drawn from this visualization.


### Part 3 - Training a model

In [None]:
# 23. Splitting Data into Training and Testing Sets for Model Training
# a. Create a tuple with the dependent (y) and independent (X) variables. Remove the dependent data (our y) from our independent data (our x values) in the dataset.
# X: every column except the price, still a DataFrame.
X = real_state.drop(columns='Y house price of unit area')
# y: the price column as a plain NumPy array.
y = real_state['Y house price of unit area'].values

# b. Display the values of the X (matrix) and y (vector) variables.
X


In [None]:
# Second half of step 23b: show the target vector y built in the previous cell.
y

In [None]:
# 24. Prepare the training datasets using the sklearn.model_selection module (import train_test_split function).
from sklearn.model_selection import train_test_split

# a. Split the data into training and testing data with the train_test_split function.  Use a 30% test size for this split and a random_state of 40.
# random_state now matches the value the step specifies (it was 87), so the
# split and every score that follows are reproducible against the task's
# expected results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

In [None]:
# a. Train a model by creating a linear regression object using the LinearRegression class and using its fit method with the training data sets.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
lm = LinearRegression().fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
lm

In [None]:
# a. Generate predictions for the test data using the trained model (predict method, and the X_test dataset).
predictions = lm.predict(X_test)

# b. Evaluate the model's accuracy by scoring its performance on the test set, using the score method. What is the value returned? How can this score be interpreted?
# For a regressor, score() reports the coefficient of determination (R^2) on the data passed in.
test_r2 = lm.score(X_test, y_test)
test_r2

In [None]:
# c. Compare the predicted values (predictions) against the values used for testing the model (y_test). You may create 2 separate plots using the X3 distance to the nearest MRT station as the independent variable.
# Plot 1 of 2: model predictions for the test rows against MRT distance.
# `predictions` carries no name, so the y-axis label and a title are set by
# hand; otherwise this figure cannot be told apart from the actual-price plot.
ax = sns.scatterplot(x=X_test['X3 distance to the nearest MRT station'], y=predictions)
ax.set_ylabel('Predicted house price of unit area')
ax.set_title('Predicted price vs. distance to the nearest MRT station (test set)')
plt.show()

# d.	Copy and paste visualization and describe what conclusions could be drawn from this visualization and the score obtained in the previous step.


In [None]:
# Plot 2 of 2: actual test-set prices against MRT distance, for comparison
# with the predicted-price plot. `y_test` carries no name, so the y-axis label
# and a title are set by hand.
ax = sns.scatterplot(x=X_test['X3 distance to the nearest MRT station'], y=y_test)
ax.set_ylabel('Actual house price of unit area')
ax.set_title('Actual price vs. distance to the nearest MRT station (test set)')
plt.show()

In [None]:
# 27. Using the Model to Make Price Predictions by creating a dictionary myTest with the input values for the model's features:
# a. Use the following values: transaction=2013, house age=16, distance=2000, stores=4, latitude=24.94 and longitude=121.50
# Keys use the same column names, in the same order, as the X frame the model
# was fitted on.
myTest = {
    'X1 transaction date': [2013],
    'X2 house age': [16],
    'X3 distance to the nearest MRT station': [2000],
    'X4 number of convenience stores': [4],
    'X5 latitude': [24.94],
    'X6 longitude': [121.50],
}

# Additional scenario kept from the earlier version of this cell (older house,
# far from the MRT, few stores). It is not part of the task's required input.
myTestMin = {
    'X1 transaction date': [2012],
    'X2 house age': [25],
    'X3 distance to the nearest MRT station': [5500],
    'X4 number of convenience stores': [2],
    'X5 latitude': [24.94],
    'X6 longitude': [121.44],
}

# b. Convert the dictionary into a DataFrame named df.
df = pd.DataFrame(myTest)

# c. Use the trained model to predict the price based on the values in df and print the result.
print('myTest prediction:', lm.predict(df))
print('myTestMin prediction:', lm.predict(pd.DataFrame(myTestMin)))


In [None]:
# Second what-if scenario: house age 5, 500 from the nearest MRT station and
# 9 convenience stores, with a 2013 transaction date.
myTestMax = {
    'X1 transaction date': [2013],
    'X2 house age': [5],
    'X3 distance to the nearest MRT station': [500],
    'X4 number of convenience stores': [9],
    'X5 latitude': [24.98],
    'X6 longitude': [121.55],
}

# One-row frame whose columns are the dictionary keys, in the order written above.
df = pd.DataFrame(data=myTestMax)

# The predicted price is the cell's output.
lm.predict(df)