In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import skew

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import plot_tree

sns.set()

In [None]:
# Read the housing dataset from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='housing.csv')
data.head(n=5)

In [None]:
# List the column labels of the loaded frame.
data.columns.values

In [None]:
# Summarise dtypes and non-null counts per column.
data.info()


In [None]:
# Number of missing values in each column.
data.isna().sum()


In [None]:
# Descriptive statistics for the numeric columns.
data.describe()


In [None]:
# Distribution of `total_bedrooms` with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='total_bedrooms', color='#005b96', kde=True);

In [None]:
# Impute missing `total_bedrooms` values with the column median.
# The result is assigned back to the frame rather than using
# `fillna(..., inplace=True)` on the selected column: that form is chained
# in-place assignment, which pandas 2.x deprecates and which leaves `data`
# unchanged once Copy-on-Write is enabled.
bedrooms_median = data['total_bedrooms'].median()
data['total_bedrooms'] = data['total_bedrooms'].fillna(bedrooms_median)

# Exploratory Data Analysis (EDA)

In [None]:
# Correlation heatmap of the numeric features.
# `ocean_proximity` is a categorical text column (it is encoded further down),
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# correlation is computed on the numeric columns only.
numeric_corr = data.select_dtypes(include=np.number).corr()

plt.figure(figsize=(20, 8))
sns.heatmap(numeric_corr, annot=True, cmap='YlGnBu')
plt.show()

In [None]:
# Distribution of the target, `median_house_value`, with a KDE overlay.
sns.histplot(data=data, x='median_house_value', color='#005b96', kde=True);


In [None]:
# Sample skewness of the target column.
data.median_house_value.skew()


Our target variable is clearly skewed, so we will apply a log transformation to it later.



In [None]:
# One histogram per numeric column.
data.hist(
    bins=30,
    figsize=(20, 15),
    color='#005b96',
);


In [None]:
# Number of rows in each `ocean_proximity` category.
sns.countplot(data=data, x='ocean_proximity', palette='RdPu');


In [None]:
# Geographic scatter: marker size follows population (scaled down by 100),
# marker colour follows the median house value.
data.plot.scatter(
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=data["population"] / 100,
    label="population",
    figsize=(15, 8),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()
plt.show()

# Feature Engineering

In [None]:
# Ratio of bedrooms to rooms, computed row by row.
data['bed_per_room'] = data['total_bedrooms'].div(data['total_rooms'])


In [None]:
# Features: every column except the target.
X = data.drop(columns=['median_house_value'])
# Target: natural log of the median house value.
y = np.log(data['median_house_value'])

In [None]:
# Tabulate the skewness of every numeric feature and flag those whose
# absolute skew exceeds 0.5.
numeric_features = X.select_dtypes(np.number).columns
skew_df = pd.DataFrame({'Feature': numeric_features})
skew_df['Skew'] = [skew(X[name]) for name in skew_df['Feature']]
skew_df['Abs_Skew'] = skew_df['Skew'].abs()
skew_df['Skewed'] = skew_df['Abs_Skew'] > 0.5
skew_df

In [None]:
# Names of the features whose absolute skew exceeds 0.5.
is_skewed = skew_df['Abs_Skew'] > 0.5
skewed_columns = skew_df.loc[is_skewed, 'Feature'].values
skewed_columns

In [None]:
# Log-transform the skewed features.
# np.log is only defined for strictly positive inputs (it yields -inf at 0 and
# NaN below 0), and the columns were selected by *absolute* skew, so every
# selected column is validated first. Failing here gives a clear message
# instead of letting -inf/NaN values reach the scaler and the models.
non_positive_columns = [
    column for column in skewed_columns if (X[column] <= 0).any()
]
if non_positive_columns:
    raise ValueError(
        f"Cannot log-transform columns with non-positive values: {non_positive_columns}"
    )

for column in skewed_columns:
    X[column] = np.log(X[column])

In [None]:
# One-hot encode the nominal `ocean_proximity` feature.
# LabelEncoder is intended for target labels; applied to a feature it assigns
# arbitrary integer codes, and the linear and nearest-neighbour models fitted
# below would treat the order and spacing of those codes as meaningful.
# Indicator columns are emitted as floats so the whole frame stays numeric.
X = pd.get_dummies(X, columns=['ocean_proximity'], dtype=float)

In [None]:
# Preview the feature frame after the transformations above.
X.head()

In [None]:
scaler = StandardScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X), index= X.index, columns= X.columns)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)


Linear Regression is a fundamental statistical modeling technique used to understand the relationship between a dependent variable and one or more independent variables. It aims to find the best-fit straight line that represents the linear relationship between the variables. Linear Regression is widely used for prediction, forecasting, and understanding the impact of variables on the target variable.

In Linear Regression, the dependent variable (also called the response variable or target variable) is predicted based on one or more independent variables (also called predictor variables or features). The relationship between the variables is assumed to be linear, meaning that the change in the dependent variable is proportional to the change in the independent variable(s).

The goal of Linear Regression is to estimate the coefficients of the line (slope and intercept) that minimize the difference between the predicted values and the actual values of the dependent variable. This is done using a technique called Ordinary Least Squares (OLS), which minimizes the sum of the squared differences between the predicted and actual values.

Once the coefficients are estimated, the linear equation can be used to make predictions on new data. The equation takes the form:
$$Y = b_0 + b_1 X_1 + b_2 X_2 + \dots + b_n X_n$$
where $Y$ is the predicted value of the dependent variable, $b_0$ is the intercept, $b_1, b_2, \dots, b_n$ are the coefficients, and $X_1, X_2, \dots, X_n$ are the values of the independent variables.

Linear Regression has several assumptions, including linearity, independence of errors, homoscedasticity (constant variance of errors), and absence of multicollinearity (high correlation between independent variables). Violation of these assumptions can affect the accuracy and reliability of the model.

Linear Regression is widely used in various fields such as economics, social sciences, finance, and engineering. It serves as a basis for more advanced regression models and provides valuable insights into the relationships between variables.

Linear Regression is like drawing a straight line to predict something. Imagine you want to know how much you will weigh based on how tall you are. You can ask your friends who are different heights to weigh themselves, and then you draw a line that best fits their heights and weights. This line helps you predict how much you might weigh based on your own height.

In Linear Regression, we have a dependent variable (like weight) and one or more independent variables (like height). We want to find a straight line that best shows the relationship between them. The line should pass as close as possible to the points representing the actual weights of different people with different heights.

To find the line, we use a math trick called Ordinary Least Squares. This trick helps us find the line that minimizes the difference between the predicted weights and the actual weights. Once we have the line, we can use it to predict the weight of a person based on their height.

Linear Regression has some rules. It assumes that the relationship between the variables is a straight line. It also assumes that the errors (the differences between the predicted and actual weights) are random and have the same variability for all heights. It's also important that your friends' heights are not all very similar; otherwise, it can be tricky to draw an accurate line.

In summary, Linear Regression is a way to draw a straight line that helps us predict one variable based on another. It's like drawing a line through points to find a pattern. It's used in many fields to understand relationships between things and make predictions.

# Linear Regressor

In [None]:
# Fit ordinary least squares on the training split and predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
predictions_lr = lr.predict(X_test)

In [None]:
# Score the linear model on the test split. The target was log-transformed,
# so the RMSE is expressed in log units.
mse_lr = mean_squared_error(y_test, predictions_lr)
rmse = np.sqrt(mse_lr)
r2 = r2_score(y_test, predictions_lr)

print('RMSE:', rmse)
print('R-square:', r2)

# KNN

In [None]:
# Fit a k-nearest-neighbours regressor with default settings and predict the held-out rows.
knn = KNeighborsRegressor().fit(X_train, y_train)
predictions_knn = knn.predict(X_test)

In [None]:
# Score the KNN model on the test split. The target was log-transformed,
# so the RMSE is expressed in log units.
mse_knn = mean_squared_error(y_test, predictions_knn)
rmse = np.sqrt(mse_knn)
r2 = r2_score(y_test, predictions_knn)

print('RMSE:', rmse)
print('R-square:', r2)

# Random Forest Regressor

In [None]:
# Fit a 100-tree random forest and predict the held-out rows.
# The forest is seeded so that Restart & Run All reproduces the same trees and
# predictions; 42 matches the seed used for the train/test split.
rf = RandomForestRegressor(n_estimators= 100, random_state= 42)
rf.fit(X_train, y_train)
predictions_rf = rf.predict(X_test)

In [None]:
# Plot the first decision tree of the fitted forest, truncated to depth 4 so
# the figure stays legible. `plot_tree` is imported in the notebook's import cell.
tree = rf.estimators_[0]

plt.figure(figsize=(30, 15))
plot_tree(
    tree,
    # Label split nodes with the training column names instead of `x[i]`.
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
    max_depth=4,
    fontsize=14,
)
plt.show()