In [135]:
# Import the modules
import pandas as pd
from pathlib import Path
import hvplot.pandas

# Import the K-means algorithm
from sklearn.cluster import KMeans
from sklearn import linear_model
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


## Split the Data into Training and Testing Sets

### Step 1: Read the `neighborhood.csv` data into a Pandas DataFrame.

In [136]:
# Read in the CSV file as a Pandas DataFrame
home_prediction = pd.read_csv(Path("neighborhoods.csv"))

# Review the DataFrame
home_prediction.head()


Unnamed: 0,RegionID,Neighborhood,State,City,Metro,County,2019_01_31,2019_02_28,2019_03_31,2019_04_30,...,2022_05_31,2022_06_30,2022_07_31,2022_08_31,2022_09_30,2022_10_31,2022_11_30,2022_12_31,2023_01_31,2023_02_28
0,118208,South Los Angeles,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,506642.0,502524.0,497331.0,493370.0,...,671146.0,674242.0,677691.0,678758.0,679208.0,677960.0,677351.0,674928.0,667856.0,658327.0
1,268496,Southeast Los Angeles,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,425679.0,422895.0,418987.0,416306.0,...,569715.0,572643.0,576533.0,579171.0,580886.0,580609.0,580315.0,578471.0,573768.0,566632.0
2,273565,East San Jose,CA,San Jose,"San Jose-Sunnyvale-Santa Clara, CA",Santa Clara County,793552.0,786114.0,777802.0,770834.0,...,978476.0,979334.0,974156.0,965418.0,955752.0,946936.0,941145.0,937052.0,930206.0,919784.0
3,273088,Bullard,CA,Fresno,"Fresno, CA",Fresno County,323923.5,326059.5,328003.5,329031.5,...,456946.5,461154.0,462796.0,461944.0,459337.5,456939.5,454922.5,452773.0,449136.5,446411.0
4,276652,West San Jose,CA,San Jose,"San Jose-Sunnyvale-Santa Clara, CA",Santa Clara County,1457490.0,1434139.0,1408370.0,1385831.0,...,1803243.0,1804447.0,1792097.0,1773101.0,1753691.0,1741227.0,1739328.0,1741457.0,1735428.0,1714749.0


In [137]:
# Remove the RegionID and State columns
home_prediction_df = home_prediction.drop(columns = ['RegionID','State'])
home_prediction_df.head(5)


Unnamed: 0,Neighborhood,City,Metro,County,2019_01_31,2019_02_28,2019_03_31,2019_04_30,2019_05_31,2019_06_30,...,2022_05_31,2022_06_30,2022_07_31,2022_08_31,2022_09_30,2022_10_31,2022_11_30,2022_12_31,2023_01_31,2023_02_28
0,South Los Angeles,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,506642.0,502524.0,497331.0,493370.0,490663.0,488695.0,...,671146.0,674242.0,677691.0,678758.0,679208.0,677960.0,677351.0,674928.0,667856.0,658327.0
1,Southeast Los Angeles,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,425679.0,422895.0,418987.0,416306.0,414891.0,413916.0,...,569715.0,572643.0,576533.0,579171.0,580886.0,580609.0,580315.0,578471.0,573768.0,566632.0
2,East San Jose,San Jose,"San Jose-Sunnyvale-Santa Clara, CA",Santa Clara County,793552.0,786114.0,777802.0,770834.0,765129.0,759231.0,...,978476.0,979334.0,974156.0,965418.0,955752.0,946936.0,941145.0,937052.0,930206.0,919784.0
3,Bullard,Fresno,"Fresno, CA",Fresno County,323923.5,326059.5,328003.5,329031.5,329221.5,329165.0,...,456946.5,461154.0,462796.0,461944.0,459337.5,456939.5,454922.5,452773.0,449136.5,446411.0
4,West San Jose,San Jose,"San Jose-Sunnyvale-Santa Clara, CA",Santa Clara County,1457490.0,1434139.0,1408370.0,1385831.0,1368964.0,1353640.0,...,1803243.0,1804447.0,1792097.0,1773101.0,1753691.0,1741227.0,1739328.0,1741457.0,1735428.0,1714749.0


## Prepare the Data to Fit the Linear Regression Model

In [138]:
# Create the X set by using the `reshape` function to format the historical prices data as a single column array.
X = home_prediction_df.drop(columns=[
    'Neighborhood', 'City', 'Metro', 'County', '2022_03_31', '2022_04_30', '2022_05_31', '2022_06_30', '2022_07_31', '2022_08_31', '2022_09_30', '2022_10_31', '2022_11_30', '2022_12_31', '2023_01_31', '2023_02_28'])



In [139]:
# Create an array for the dependent variable y with the 2023_02_28 data
y = home_prediction_df['2023_02_28']


## Build the Linear Regression Model

In [140]:
# From the sklearn module, use the LinearRegression() to create a linear regression object
regr = linear_model.LinearRegression()

# Fit the model, taking the independent and dependent values as parameters
# filling the regression object with data that describes the relationship, and ready to make the predictions:
regr.fit(X, y)


LinearRegression()

In [141]:
# Display the slopes
print(f"Model's slope: {regr.coef_}")


Model's slope: [ -1.71497976   1.68916145   2.0832252   -6.1570759    6.00658627
   2.05981048  -7.02395699   5.17639755  -3.97980178  -0.90773406
   1.88078665   5.03886479  -8.54653639   5.20752871   0.93936569
  -4.67829858   4.97090611  -6.81193462   4.51762269   7.62046893
 -16.95739618  12.21201871  -0.02167611  -8.57934131   9.85105776
   0.90553576 -14.46021156  16.16686969  -4.45539584  -7.54565336
   9.13270509   0.30440573 -14.09076969  13.54888903   0.12784387
  -6.99624124   1.76224238   2.65411565]


In [142]:
# Display the y-intercept
print(f"Model's y-intercept: {regr.intercept_}")

Model's y-intercept: 15469.361294947681


In [143]:
# Display the model's best fit line formula
print(f"Model's formula: y = {regr.intercept_} + {regr.coef_[0]}X")


Model's formula: y = 15469.361294947681 + -1.7149797624769825X


In [144]:
# Make predictions using the X set
predicted2023_02_28 = regr.predict(home_prediction_df.drop(columns=[
    'Neighborhood', 'City', 'Metro', 'County', '2022_03_31', '2022_04_30', '2022_05_31', '2022_06_30', '2022_07_31', '2022_08_31', '2022_09_30', '2022_10_31', '2022_11_30', '2022_12_31', '2023_01_31', '2023_02_28']))
print(Predicted2023_02_28 )

[ 669385.23649141  574617.76936018  938783.79332975 ...  818390.92593494
 1301334.36593653  543716.86302433]


In [145]:
# Create a copy of the original data
home_sales_predicted = home_prediction_df.copy()


In [146]:
# Since the round function is not supported in a NumPy array
# create an empty list and loop through the results of the regression model prediction
# # Append the rounded values to an empty list 
predicted_prices = []
for i in range(len(predicted2023_02_28)):
    rounded_value = round(predicted2023_02_28[i], 1)
    home_sales_predicted.loc[i, "2023_02_28_predicted"] = rounded_value
    predicted_prices.append(rounded_value)

print(predicted_prices)


[669385.2, 574617.8, 938783.8, 453055.8, 1689615.1, 1186021.2, 672612.1, 1474780.8, 1108102.8, 627070.4, 1071394.2, 1021067.6, 711891.0, 859121.4, 1223767.9, 642352.5, 1312785.1, 733104.6, 899887.5, 767534.5, 637245.8, 651777.3, 733086.2, 640802.9, 779423.2, 444227.5, 311679.5, 561427.5, 376118.3, 2242613.9, 767074.0, 763306.6, 1026757.2, 296285.1, 636337.7, 754577.2, 1417993.9, 1463098.8, 700403.8, 1113799.9, 1056835.2, 752416.4, 971329.6, 1238785.8, 1203440.8, 1943012.1, 804434.2, 1537493.0, 734601.3, 853455.5, 529353.6, 1439784.6, 1280455.1, 796122.5, 912693.1, 1724115.8, 1527985.5, 923055.7, 1558583.2, 689643.1, 1330458.3, 770509.0, 900395.6, 1354602.8, 713530.7, 780906.6, 1356361.0, 874325.9, 1993759.3, 2437319.5, 1073231.9, 670244.6, 916828.5, 1161418.2, 660067.1, 998456.3, 881046.9, 1587724.1, 1134899.0, 663021.4, 847872.4, 1895573.0, 1310012.4, 885571.3, 763084.1, 873935.0, 720687.6, 1976639.8, 730668.1, 1635423.5, 1117277.0, 972724.4, 603821.2, 636668.4, 1065354.2, 998772.4, 1

In [147]:
# Add a column with the home prices
home_sales_predicted["2023_02_28_predicted"] = predicted_prices


In [148]:

# Display sample data
home_sales_predicted .head()


Unnamed: 0,Neighborhood,City,Metro,County,2019_01_31,2019_02_28,2019_03_31,2019_04_30,2019_05_31,2019_06_30,...,2022_06_30,2022_07_31,2022_08_31,2022_09_30,2022_10_31,2022_11_30,2022_12_31,2023_01_31,2023_02_28,2023_02_28_predicted
0,South Los Angeles,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,506642.0,502524.0,497331.0,493370.0,490663.0,488695.0,...,674242.0,677691.0,678758.0,679208.0,677960.0,677351.0,674928.0,667856.0,658327.0,669385.2
1,Southeast Los Angeles,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,425679.0,422895.0,418987.0,416306.0,414891.0,413916.0,...,572643.0,576533.0,579171.0,580886.0,580609.0,580315.0,578471.0,573768.0,566632.0,574617.8
2,East San Jose,San Jose,"San Jose-Sunnyvale-Santa Clara, CA",Santa Clara County,793552.0,786114.0,777802.0,770834.0,765129.0,759231.0,...,979334.0,974156.0,965418.0,955752.0,946936.0,941145.0,937052.0,930206.0,919784.0,938783.8
3,Bullard,Fresno,"Fresno, CA",Fresno County,323923.5,326059.5,328003.5,329031.5,329221.5,329165.0,...,461154.0,462796.0,461944.0,459337.5,456939.5,454922.5,452773.0,449136.5,446411.0,453055.8
4,West San Jose,San Jose,"San Jose-Sunnyvale-Santa Clara, CA",Santa Clara County,1457490.0,1434139.0,1408370.0,1385831.0,1368964.0,1353640.0,...,1804447.0,1792097.0,1773101.0,1753691.0,1741227.0,1739328.0,1741457.0,1735428.0,1714749.0,1689615.1


## Assess the Linear Regression Model

In [149]:
# Compute the metrics for the linear regression model
score = regr.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted2023_02_28)
mse = mean_squared_error(y, predicted2023_02_28)
rmse = np.sqrt(mse)
std = np.std(y)

# Print releveant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")


The score is 0.9979566102270883.
The r2 is 0.9979566102270883.
The mean squared error is 1157009798.1116357.
The root mean squared error is 34014.84673067976.
The standard deviation is 752476.446824234.


#### The above metrics reflect that the model is a good model since the standard deviation is greater than the root mean squared error
#### And the regression coefficient of approximately 99.79566%  expresses the high explanatory power of the model in forecasting home prices

In [155]:
# linreg=LinearRegression()
# linreg.fit(x_train,y_train)

### Split the data into training and testing datasets by using `train_test_split`.