In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

# --- Load data -------------------------------------------------------------
# Both CSV files must contain the feature columns and the target column,
# because each frame is indexed with `features` and `target` below.
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

target = 'price'
features = ['beds', 'baths', 'size']

# --- Split into design matrices and response vectors -----------------------
X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# --- Fit and predict -------------------------------------------------------
# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluate --------------------------------------------------------------
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # square root of the MSE, i.e. in the same units as the target

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')

# Per-row comparison table; Residual > 0 means the model under-predicted.
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).assign(
    Residual=lambda frame: frame['Actual'] - frame['Predicted']
)

print(predictions.head())

# Scatter of predicted against actual prices, with an OLS trendline requested
# from plotly express.
fig = px.scatter(
    predictions,
    x='Actual',
    y='Predicted',
    trendline='ols',
    labels={'Actual': 'Actual Price', 'Predicted': 'Predicted Price'},
    title='Actual vs Predicted Prices',
)

# Dashed red y = x reference line spanning the observed range of actual
# prices: points on this line are perfect predictions.
actual_low = predictions['Actual'].min()
actual_high = predictions['Actual'].max()
fig.add_shape(
    type='line',
    x0=actual_low,
    y0=actual_low,
    x1=actual_high,
    y1=actual_high,
    line={'color': 'Red', 'dash': 'dash'},
)

fig.update_layout(showlegend=False)
fig.show()

# One-row heatmap of the residuals (Actual - Predicted), one cell per test row.
# The column is selected directly rather than through a pivot_table keyed on
# the test index: a pivot would average rows that share an index label, drop
# NaN residuals and re-sort the rows, none of which is wanted here.
heatmap_data = predictions[['Residual']].T

fig_heatmap = go.Figure(data=go.Heatmap(
    z=heatmap_data.values,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    # Residuals are signed, so use a diverging scale centred on zero; this
    # makes over- and under-predictions visually distinct.
    colorscale='RdBu',
    zmid=0
))

fig_heatmap.update_layout(
    title='Heatmap of Residuals',
    xaxis_title='Index',
    yaxis_title='Residuals'
)

fig_heatmap.show()

# Pairwise correlations between the model features and the target,
# computed on the training data.
correlation_matrix = train_df[features + [target]].corr()

print(correlation_matrix)

# Annotated heatmap: each cell shows its correlation rounded to 2 decimals.
fig_corr = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns.tolist(),
    y=correlation_matrix.index.tolist(),
    colorscale='Viridis',
    annotation_text=correlation_matrix.round(2).values,
    showscale=True
)

fig_corr.update_layout(
    title='Correlation Matrix Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    # Heatmaps place the first row at the bottom by default; reversing the
    # y-axis puts it at the top so the figure reads like the printed matrix
    # (diagonal running top-left to bottom-right).
    yaxis_autorange='reversed'
)

fig_corr.show()
# Scatterplot matrix over the training columns. `target` is passed as the
# figure factory's `index` column (presumably used to colour the markers
# rather than as a plotted axis; confirm against the plotly docs), and
# histograms are requested on the diagonal.
df_for_plotting = train_df[[*features, target]]

fig_scatter_matrix = ff.create_scatterplotmatrix(
    df_for_plotting,
    index=target,
    diag='histogram',
    colormap='Viridis',
    colormap_type='cat',
    title='Scatter Matrix of Features and Target',
)

# Resize the figure to an 800x800 canvas.
fig_scatter_matrix.update_layout(width=800, height=800)

fig_scatter_matrix.show()


Mean Squared Error: 173191146953.27536
Root Mean Squared Error: 416162.404540914
      Actual     Predicted       Residual
0  1175000.0  1.587668e+06 -412668.297025
1  1057500.0  1.715529e+06 -658028.514384
2   799000.0  6.344356e+05  164564.369389
3   565000.0  1.262320e+06 -697319.824716
4  1187000.0  1.148922e+06   38077.617217


           beds     baths      size     price
beds   1.000000  0.652853  0.771929  0.293516
baths  0.652853  1.000000  0.667655  0.317325
size   0.771929  0.667655  1.000000  0.444140
price  0.293516  0.317325  0.444140  1.000000
