# Implement and demonstrate linear regression and variable correlation for house price prediction using scikit-learn. Read the training data from the BostonHousing.csv file.

In [None]:
#import the necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
# Load the Boston housing data from the Kaggle input directory into a DataFrame.
DATA_PATH = "/kaggle/input/boston-housing-dataset/BostonHousing.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the top of the DataFrame; n=5 is the pandas default, spelled out here.
df.head(n=5)

**df.head(): Displays the first five rows of the DataFrame, providing an initial look at the data.**

In [None]:
# Preview the bottom of the DataFrame: the last 10 rows.
df.tail(n=10)

**df.tail(10): Displays the last ten rows of the DataFrame, showing how the data ends.**

In [None]:
# Check the dimensions of the DataFrame: df.shape is a (rows, columns) tuple.
df.shape

In [None]:
# Build a boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

In [None]:
# Remove every row that contains at least one missing value, modifying df in place.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Count the missing values in each column:
# isna() flags the missing entries and sum() totals the flags per column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:

# Display the summary statistics of the dataset.
# df.describe() provides a summary of statistics for each numerical column
# in the DataFrame.

df.describe()

In [None]:
# Print a concise summary of the DataFrame.
df.info()

In [None]:
# Visualize how the target variable 'medv' (median value of owner-occupied
# homes, i.e. the house price) is distributed.
# The histogram shows the frequencies; kde=True overlays a Kernel Density
# Estimate (KDE) curve on top of it.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='medv', kde=True, ax=ax)
ax.set_title('Distribution of House Price')
ax.set_xlabel('House Pricing')
ax.set_ylabel('Frequency')
plt.show()

In [None]:

# Calculate the correlation matrix.
# df.corr() computes the correlation coefficients between the variables
# (columns) of the DataFrame.

df.corr()

In [None]:
# Visualize the correlation matrix as a heatmap with sns.heatmap().
tc = df.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(data=tc)

In [None]:
# Heatmap of the correlation matrix, rounded to two decimals;
# annot=True writes each coefficient into its cell.
tc = df.corr().round(decimals=2)
plt.figure(figsize=(20, 10))
sns.heatmap(data=tc, annot=True)

In [None]:
# Annotated, titled heatmap of the rounded correlation matrix.
#   annot=True      -> write each correlation coefficient into its cell.
#   cmap='coolwarm' -> color map used for the cells.
tc = df.corr().round(decimals=2)
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(data=tc, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('correlation matrix')
plt.show()

In [None]:


# Rank every variable by its correlation with the target column 'medv',
# strongest positive first.
target = tc['medv'].sort_values(ascending=False)

# Filter the sorted Series itself, not the full matrix `tc`. Masking `tc` with
# a sorted boolean Series makes pandas realign the mask to tc's own row order,
# so head()/tail() would return whole matrix rows in column order instead of
# the strongest correlations.
top_positive_correlations = target[target > 0].head()  # five largest positive values
top_negative_correlations = target[target < 0].tail()  # five most negative values

print(" Positive Correlations:\n", top_positive_correlations)
print("\n Negative Correlations:\n", top_negative_correlations)

In [None]:


# Correlation of every variable with the target 'medv', sorted from the
# strongest positive to the strongest negative.
target_correlation = tc['medv'].sort_values(ascending=False)

# Build the masks from target_correlation itself so this cell does not depend
# on a `target` variable that only exists if a different cell was run first.
top_positive_correlations = target_correlation[target_correlation > 0].head()
top_negative_correlations = target_correlation[target_correlation < 0].tail()

print(" Positive Correlations:\n", top_positive_correlations)
print("\n Negative Correlations:\n", top_negative_correlations)

In [None]:
# Correlation of every variable with the target 'medv', sorted in descending order.
target = tc['medv'].sort_values(ascending=False)

# Use the Series computed in this cell: head() gives the five highest
# correlations and tail() the five lowest. (Previously this read
# `target_correlation`, a name assigned only in another cell, and left
# `target` unused.)
top_positive_correlations = target.head()
top_negative_correlations = target.tail()

print(" Positive Correlations:\n", top_positive_correlations)
print("\n Negative Correlations:\n", top_negative_correlations)

# Prepare the data for training the Linear Regression model

In [None]:
# Separate the model inputs from the value to predict.
#   X: every column of df except 'medv' (the features).
#   y: the 'medv' column (the target variable).
X = df.drop(columns=['medv'])
y = df.loc[:, 'medv']

**df.drop():** Removes the target variable 'medv' from the features DataFrame X.
**df['medv']:** Selects the target variable y.

In [None]:
# Split the dataset into training and testing sets with train_test_split().
#   test_size=0.2   -> 20% of the data is held out for testing, 80% is used for training.
#   random_state=42 -> fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

**train_test_split():** Splits the data into training and testing sets.
**test_size=0.2:** Allocates 20% of the data for testing and 80% for training.
**random_state=42:** Ensures reproducibility of the split.

# Standardize the feature variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature variables.
# The scaler is fitted on the training features only; the same fitted scaler
# is then applied to both the training and the testing features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**StandardScaler():** Standardizes features by removing the mean and scaling to unit variance.
**fit_transform():** Fits the scaler to the training data and transforms it.
**transform():** Transforms the testing data using the already fitted scaler.

# Train a Linear Regression model using the training data

In [None]:
from sklearn.linear_model import LinearRegression

# Create the linear regression model and fit it to the training data.
model = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so the notebook displays it.
model

In [None]:
# R^2 (coefficient of determination) of the model on the test set;
# this is the same value that model.score(X_test, y_test) reports.
r2_score(y_test, model.predict(X_test))

In [None]:
# Display the fitted parameters of the linear model:
# one coefficient per feature, followed by the intercept.
fitted_parameters = (
    ("Coefficients:", model.coef_),
    ("Intercept:", model.intercept_),
)
for label, value in fitted_parameters:
    print(label, value)

**model.coef_:** Displays the coefficients of the linear model.
**model.intercept_:** Displays the intercept of the linear model.

# Evaluate the Model's Performance

> **Predict the House Prices Using the Testing Data**

In [None]:
# Predict the house prices for the testing features with the trained model
# and print the predictions. y_pred is reused by the later evaluation cells.
y_pred = model.predict(X_test)
print(y_pred)

> **Calculate and Display Performance Metrics**

In [None]:
# Compare the predictions with the true test prices using three error metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

metric_report = (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
)
for label, value in metric_report:
    print(label, value)

> ***Plot Predicted vs. Actual House Prices***

In [None]:
# Scatter the predicted prices against the actual prices.
# The red diagonal spans the range of actual prices and marks where
# predicted == actual, so points close to it are accurate predictions.
price_range = [y_test.min(), y_test.max()]

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='green')
plt.plot(price_range, price_range, color='red', linewidth=2)
plt.title('Actual vs Predicted House Prices')
plt.xlabel('Actual House Prices')
plt.ylabel('Predicted House Prices')
plt.show()