# Linear Regression on US Housing Prices


In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [11]:
# Read the housing data set from the CSV in the working directory and
# preview its first rows.
df = pd.read_csv(filepath_or_buffer="USA_Housing.csv")
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'USA_Housing.csv'

    Check basic info on the data set

    'info()' method to check the data types and the number of non-null entries in each column

In [None]:
# Print the concise summary of the loaded DataFrame.
df.info()

    'describe()' method to get the statistical summary of the various features of the data set

In [None]:
# Show descriptive statistics for the DataFrame's columns.
df.describe()

In [None]:
# List the column labels; later cells select features by position in this list.
df.columns

### Basic plotting and visualization on the data set

In [None]:
# Pairwise scatter plots / distributions for the columns of df.
sns.pairplot(data=df)

    Distribution of price 

In [None]:
# Histogram of the target column with 25 bins.
df['Price'].plot(kind='hist', bins=25, figsize=(8, 4))

In [None]:
# Kernel density estimate of the target column (density is an alias of kde).
df['Price'].plot.kde()

### Correlation matrix and heatmap

In [None]:
# Correlation matrix of the numeric columns only. The frame also holds the
# string 'Address' column, and pandas >= 2.0 raises on non-numeric columns
# instead of silently dropping them, so select the numeric ones explicitly.
df.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix. Only numeric columns are
# correlated: the string 'Address' column would make DataFrame.corr() raise
# on pandas >= 2.0.
plt.figure(figsize=(10,7))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, linewidths=2)

### Feature and variable sets
#### Make a list of data frame column names

In [None]:
# Column labels as a plain Python list, plus how many there are.
l_column = df.columns.tolist()
len_feature = len(l_column)
l_column

In [None]:
# Every column label except the last two.
l_column[:len_feature - 2]

### Put all the numerical features in X and Price in y; ignore Address, which is a string column and cannot be used in linear regression

In [None]:
# Features: every column except the last two.
# Target: the second-to-last column.
X = df.loc[:, l_column[:len_feature - 2]]
y = df.loc[:, l_column[len_feature - 2]]

In [None]:
# Display the target series selected above.
y

In [None]:
# Report the dimensions of the feature matrix and the target vector.
print(f"Feature set size: {X.shape}")
print(f"Variable set size: {y.shape}")

In [None]:
# Preview the feature frame as selected by position from df.
X.head()

In [None]:
# Remove the leftover CSV index column from the features. errors='ignore'
# makes the cell safe to re-run and tolerant of a CSV without that column;
# a plain drop raises KeyError the second time it executes.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the feature frame again to confirm which columns remain.
X.head()

In [None]:
# Preview the first values of the target series.
y.head()

### Test-train split

#### Import train_test_split function from scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

### Create X and y train and test splits in one command using a split ratio and a random seed

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

#### Check the size and shape of train/test splits (it should be in the ratio as per test_size parameter above)

In [None]:
# Report the dimensions of each piece of the train/test split.
print(f"Training feature set size: {X_train.shape}")
print(f"Test feature set size: {X_test.shape}")
print(f"Training variable set size: {y_train.shape}")
print(f"Test variable set size: {y_test.shape}")

### Model fit and training

#### Import linear regression model estimator from scikit-learn and instantiate

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
lm = LinearRegression() # Create the linear regression estimator 'lm' with its default settings (no arguments passed)

#### Fit the model on to the instantiated object itself

In [None]:
# Train the estimator in place on the training split; fit() updates 'lm'
# itself, so the return value does not need to be stored.
lm.fit(X=X_train, y=y_train)

#### Check the intercept and coefficients and put them in a DataFrame

In [None]:
# Show the fitted intercept.
print(f"The intercept term of the linear model: {lm.intercept_}")

In [None]:
# Feature names, in the same order as the coefficients printed in the next cell.
X.columns

In [None]:
# Show the fitted slope coefficients (one per feature column).
print(f"The coefficients of the linear model: {lm.coef_}")

In [None]:
# Tabulate the coefficients, labelled by the training feature names.
cdf = pd.DataFrame({"Coefficients": lm.coef_}, index=X_train.columns)
cdf

### Calculation of standard errors and t-statistic for the coefficients

In [None]:
# Preview the first few fitted values on the training features.
# predict() needs the feature matrix; calling it with no arguments raises
# TypeError and stops a Restart & Run All.
lm.predict(X_train)[:5]

In [None]:
# Standard errors and t-statistics of the fitted slope coefficients.
n = X_train.shape[0]                 # number of training observations
k = X_train.shape[1]                 # number of features (slopes)
dfN = n - k - 1                      # residual degrees of freedom: the intercept uses one too
train_pred = lm.predict(X_train)
train_error = np.square(train_pred - y_train)
sum_error = np.sum(train_error)      # residual sum of squares
sigma_sq = sum_error / dfN           # unbiased estimate of the noise variance

# For a model with an intercept, the covariance of the slopes is
# sigma^2 * (Xc' Xc)^-1, where Xc is the mean-centred feature matrix.
# Taking the diagonal of the inverse (rather than inverting each diagonal
# entry) accounts for correlation between features, and the result has one
# entry per feature however many features there are.
X_centered = (X_train - X_train.mean()).to_numpy(dtype=float)
se = np.sqrt(sigma_sq * np.diag(np.linalg.inv(X_centered.T @ X_centered)))

cdf['Standard Error'] = se
cdf['t-statistic'] = cdf['Coefficients'] / cdf['Standard Error']
cdf

In [None]:
# Rank features by the magnitude of their t-statistic. A strongly negative
# t-statistic is as significant as a strongly positive one, so sort on the
# absolute value rather than the signed value.
print("Therefore, features arranged in the order of importance for predicting the house price\n",'-'*90,sep='')
l = list(cdf['t-statistic'].abs().sort_values(ascending=False).index)
print(' > \n'.join(l))

In [None]:
# Scatter plot of every model feature against Price, three panels per row.
# One loop replaces the copy-pasted per-panel code, so the grid adapts to
# however many features are in cdf instead of assuming exactly five.
from matplotlib import gridspec

l = list(cdf.index)
n_cols = 3
n_rows = max(1, -(-len(l) // n_cols))        # ceiling division
fig = plt.figure(figsize=(18, 5 * n_rows))   # 18x10 for the usual two rows
gs = gridspec.GridSpec(n_rows, n_cols)

for position, feature in enumerate(l):
    ax = plt.subplot(gs[position])
    ax.scatter(df[feature], df['Price'])
    ax.set_title(feature + " vs. Price", fontdict={'fontsize': 20})

### R-square of the model fit

In [None]:
# Coefficient of determination on the training data, to three decimals.
print(f"R-squared value of this fit: {round(metrics.r2_score(y_train, train_pred), 3)}")

#### Prediction, error estimate, and regression evaluation metrics

#### Prediction using the lm model

In [None]:
# Predict on the held-out features and report what came back.
predictions = lm.predict(X_test)
print(f"Type of the predicted object: {type(predictions)}")
print(f"Size of the predicted object: {predictions.shape}")

#### Scatter plot of predicted price and y_test set to see if the data fall on a 45 degree straight line

In [None]:
# Actual vs. predicted prices on the test set; a good fit clusters along
# the 45-degree line.
plt.figure(figsize=(10, 7))
plt.xlabel("Actual test set house prices", fontsize=18)
plt.ylabel("Predicted house prices", fontsize=18)
plt.title("Actual vs. predicted house prices", fontsize=25)
plt.scatter(y_test, predictions)

#### Plotting histogram of the residuals, i.e. prediction errors (expect a normally distributed pattern)

In [None]:
# Histogram plus KDE of the test-set residuals as a normality check.
# sns.distplot is deprecated and slated for removal; histplot with
# kde=True and stat="density" draws the same density-scaled histogram with
# a KDE overlay. The residuals are passed directly, not wrapped in a list.
residuals = y_test - predictions
plt.figure(figsize=(10,7))
sns.histplot(residuals, kde=True, stat="density")
# Labels are set after plotting so they are never replaced by seaborn's defaults.
plt.title("Histogram of residuals to check for normality",fontsize=25)
plt.xlabel("Residuals",fontsize=18)
plt.ylabel("Kernel density", fontsize=18)

#### Scatter plot of residuals and predicted values (Homoscedasticity)

In [None]:
# Residuals against predicted values; an even band around zero with no
# funnel shape is consistent with homoscedastic errors.
plt.figure(figsize=(10, 7))
plt.xlabel("Predicted house prices", fontsize=18)
plt.ylabel("Residuals", fontsize=18)
plt.title("Residuals vs. predicted values plot (Homoscedasticity)\n", fontsize=25)
plt.scatter(predictions, y_test - predictions)

#### Regression evaluation metrics

In [None]:
# Test-set error metrics: MAE, MSE, and RMSE (the square root of MSE).
print(f"Mean absolute error (MAE): {metrics.mean_absolute_error(y_test, predictions)}")
print(f"Mean square error (MSE): {metrics.mean_squared_error(y_test, predictions)}")
print(f"Root mean square error (RMSE): {np.sqrt(metrics.mean_squared_error(y_test, predictions))}")

#### R-square value

In [None]:
# Coefficient of determination on the held-out test data, to three decimals.
print(f"R-squared value of predictions: {round(metrics.r2_score(y_test, predictions), 3)}")