In [None]:
import numpy as np
import pandas as pd

try:
    # `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
    # so this import only succeeds on older installations.
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    # Original StatLib copy of the data, as recommended by scikit-learn's
    # removal notice. Fetching it requires network access.
    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    BOSTON_FEATURE_NAMES = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ]

    def load_boston():
        """Drop-in replacement for the removed ``sklearn.datasets.load_boston``.

        Returns:
            A ``Bunch`` exposing ``data`` (one row per record, 13 feature
            columns), ``target`` (median home value) and ``feature_names``,
            which are the attributes the rest of this notebook reads.
        """
        raw = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
        # Each record is wrapped over two physical lines in the source file:
        # 11 values on the first line, then B, LSTAT and the target on the second.
        data = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])
        target = raw.values[1::2, 2]
        return Bunch(
            data=data,
            target=target,
            feature_names=np.array(BOSTON_FEATURE_NAMES),
        )

# NOTE(review): scikit-learn dropped this dataset over ethical concerns about
# the 'B' feature; consider `fetch_california_housing` for new work.

# Load the dataset
boston = load_boston()

# Create a DataFrame with one column per feature, plus the target as 'PRICE'
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['PRICE'] = boston.target

# Display the first few rows
df.head()

In [None]:
# Count missing entries per column
df.isna().sum()

# Observation: No missing values found in this dataset.


In [None]:
# Summary statistics for every column
summary_stats = df.describe()
summary_stats

# Observations:
# - 'PRICE' ranges from 5 to 50.
# - 'CRIM' has a max value of 88, indicating some outliers.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of house prices with a KDE overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['PRICE'], kde=True, ax=ax)
ax.set_title('Distribution of House Prices')
plt.show()

# Observation: The distribution is skewed to the right.

In [None]:
# Scatter plots of selected features against price, side by side
plt.figure(figsize=(15, 5))

# One panel per feature, left to right: RM, LSTAT, PTRATIO
for panel, feature in enumerate(('RM', 'LSTAT', 'PTRATIO'), start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(x=df[feature], y=df['PRICE'])
    plt.title(f'{feature} vs Price')

plt.show()

# Observation: RM shows a positive correlation with price, while LSTAT shows a negative correlation.

In [None]:
# Pairwise correlation between every pair of columns
corr_matrix = df.corr()

# Render the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Observation: RM and LSTAT are highly correlated with PRICE.


In [None]:
from sklearn.model_selection import train_test_split

# Target is PRICE; features are every remaining column
y = df['PRICE']
X = df.drop(columns='PRICE')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear regression on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split
y_pred = model.predict(X_test)

# Mean Squared Error (MSE) between actual and predicted prices
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

# Observation: The MSE is relatively low, but there might still be room for improvement.