# Housing Dataset

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

In [2]:
# Read the housing CSV into `df` and preview the first rows.
# NOTE: the path is absolute; the file must exist at exactly this location
# or pandas raises FileNotFoundError.
csv_path = "/content/HousingData.csv"
df = pd.read_csv(csv_path)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/HousingData.csv'

In [None]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

In [None]:
# Print column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Scatter every other column against MEDV in a single row of panels.
# `sns.pairplot` is a figure-level function that creates its own figure, so
# no `plt.figure(...)` call is made beforehand (it would only produce an
# extra, empty figure). `x_vars` expects column names, not a DataFrame.
feature_cols = [col for col in df.columns if col != 'MEDV']
sns.pairplot(df, x_vars=feature_cols, y_vars=['MEDV'])
plt.show()

In [None]:
# Heatmap of each column's correlation with MEDV (single-column slice of the
# full correlation matrix), annotated with the coefficient values.
medv_corr = df.corr().loc[:, ['MEDV']]
sns.heatmap(medv_corr, annot=True, cmap='coolwarm')

In [None]:
# Start the cleaned frame from `df` without the CHAS, DIS and B columns,
# then preview it.
cols_to_drop = ['CHAS', 'DIS', 'B']
df_cleaned = df.drop(cols_to_drop, axis=1)
df_cleaned.head()

In [None]:
# Horizontal box plot of the CRIM column (visual check of its distribution).
sns.boxplot(df['CRIM'], orient='h')

In [None]:
# Drop rows whose CRIM value is an outlier by Tukey's IQR rule:
# anything outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
Q1_crim = df_cleaned['CRIM'].quantile(0.25)
Q3_crim = df_cleaned['CRIM'].quantile(0.75)
IQR = Q3_crim - Q1_crim

lower_bound = Q1_crim - 1.5 * IQR
upper_bound = Q3_crim + 1.5 * IQR

# True where CRIM falls outside the fences. NaN compares False on both
# sides, so rows with a missing CRIM are not flagged and are kept.
outlier_mask = (df_cleaned['CRIM'] < lower_bound) | (df_cleaned['CRIM'] > upper_bound)

# Filter `df_cleaned` (not `df`) so the cleaning done so far is preserved;
# `.copy()` makes the result an independent frame that is safe to modify.
df_cleaned = df_cleaned[~outlier_mask].copy()

In [None]:
# Horizontal box plot of the LSTAT column (visual check of its distribution).
sns.boxplot(df['LSTAT'], orient='h')

In [None]:
# Horizontal box plot of the RM column (visual check of its distribution).
sns.boxplot(df['RM'], orient='h')

In [None]:
# Drop rows whose RM value is an outlier by Tukey's IQR rule:
# anything outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
Q1_rm = df_cleaned['RM'].quantile(0.25)
Q3_rm = df_cleaned['RM'].quantile(0.75)
IQR_rm = Q3_rm - Q1_rm

lower_bound_rm = Q1_rm - 1.5 * IQR_rm
upper_bound_rm = Q3_rm + 1.5 * IQR_rm

# True where RM falls outside its own fences (the RM-specific bounds, not
# the ones computed for another column). NaN compares False on both sides,
# so rows with a missing RM are not flagged and are kept.
outlier_mask = (df_cleaned['RM'] < lower_bound_rm) | (df_cleaned['RM'] > upper_bound_rm)

# Filter `df_cleaned` (not `df`) so the cleaning done so far is preserved;
# `.copy()` makes the result an independent frame that is safe to modify.
df_cleaned = df_cleaned[~outlier_mask].copy()

In [None]:
# Horizontal box plot of the NOX column (visual check of its distribution).
sns.boxplot(df['NOX'], orient='h')

In [None]:
# Horizontal box plot of the ZN column (visual check of its distribution).
sns.boxplot(df['ZN'], orient='h')

In [None]:
# Cap ZN outliers instead of dropping rows: values above the upper IQR fence
# are replaced by Q3, values below the lower fence by Q1.
Q1_zn = df_cleaned['ZN'].quantile(0.25)
Q3_zn = df_cleaned['ZN'].quantile(0.75)
IQR_zn = Q3_zn - Q1_zn

lower_bound_zn = Q1_zn - 1.5 * IQR_zn
upper_bound_zn = Q3_zn + 1.5 * IQR_zn

# Vectorized replacement. Both conditions are evaluated on the original
# values; NaN compares False on both sides and is therefore left untouched.
zn = df_cleaned['ZN']
zn_capped = zn.mask(zn > upper_bound_zn, Q3_zn).mask(zn < lower_bound_zn, Q1_zn)

# Work on an explicit copy so the column assignment never targets a slice of
# another frame (avoids pandas' SettingWithCopyWarning).
df_cleaned = df_cleaned.copy()
df_cleaned['ZN'] = zn_capped

In [None]:
# Print dtypes and non-null counts of the cleaned frame.
df_cleaned.info()

In [None]:
# Fill remaining missing values with each column's mean. The means come from
# `df_cleaned` itself so they reflect the cleaned data rather than the raw
# frame. The result is reassigned instead of using `inplace=True`, which is
# unreliable when the frame was derived by slicing another one.
df_cleaned = df_cleaned.fillna(df_cleaned.mean())
df_cleaned.info()

In [None]:
# MEDV is the regression target; every other remaining column is a feature.
y = df_cleaned['MEDV']
X = df_cleaned.drop('MEDV', axis=1)

In [None]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# (`test_size=0.2`, not `train_size=0.2`, which would train on only 20%.)
# `random_state` fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Ordinary least-squares linear regression with scikit-learn's defaults.
model = LinearRegression()

In [None]:
# Fit the regression on the training split only.
model.fit(X_train, y_train)

In [None]:
# Predict on the test split and report four regression metrics.
y_pred = model.predict(X_test)

# Each scorer takes (y_true, y_pred); evaluate them in a fixed order and
# unpack into the individually named results.
mae, mse, rmse, r2 = (
    scorer(y_test, y_pred)
    for scorer in (
        mean_absolute_error,
        mean_squared_error,
        root_mean_squared_error,
        r2_score,
    )
)

print(f"Mean absolute error: {mae}")
print(f"Mean squared error: {mse}")
print(f"Root mean squared error: {rmse}")
print(f"R squared Score: {r2}")