# In [4]:
import pandas as pd # data processing
import numpy as np # working with arrays
import matplotlib.pyplot as plt # visualization
import seaborn as sb # visualization
from termcolor import colored as cl # text customization

from sklearn.model_selection import train_test_split # data split

from sklearn.linear_model import LinearRegression # OLS algorithm
from sklearn.linear_model import Ridge # Ridge algorithm
from sklearn.linear_model import Lasso # Lasso algorithm
from sklearn.linear_model import BayesianRidge # Bayesian algorithm
from sklearn.linear_model import ElasticNet # ElasticNet algorithm

from sklearn.metrics import explained_variance_score as evs # evaluation metric
from sklearn import preprocessing # scaling: normalization
import pickle


# Global plotting defaults: seaborn's 'whitegrid' theme and a (20, 10)
# default figure size for every figure created afterwards.
sb.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (20, 10)})


# Load the dataset from 'traintest.csv' in the current working directory.
df = pd.read_csv('traintest.csv')

# Optional data-quality checks (disabled): drop rows containing nulls and
# print the per-column null counts.
#df.dropna(inplace = True)
#print(df.isnull().sum())

# Optional summary statistics (mean, standard deviation, quartiles, ...).
#df.describe()

# Cast price, bathrooms and floors to 64-bit integers.
# NOTE(review): scikit-learn's LinearRegression accepts floating-point
# input, so this cast is not required by the estimator. If these columns
# hold fractional values, astype('int64') drops the fractional part, and it
# raises if a column contains missing values (the dropna step above is
# disabled) -- confirm the cast is intended.
for column_name in ('price', 'bathrooms', 'floors'):
    df[column_name] = df[column_name].astype('int64')

# Overwrite yr_built with the building age, measured against a fixed
# reference year of 2021 (age = 2021 - construction year).
reference_year = 2021
df['yr_built'] = reference_year - df['yr_built']
# inspecting column data types
#print(df.dtypes)
#Data visualization: heatmap, scatter plot, and a distribution plot

#Heatmap
#sb.heatmap(df.corr(), annot = True, cmap = 'magma')
#plt.savefig('heatmap.png')
#plt.show()

#scatter plot
#plt.scatter(df['price'], df['yr_built'])
#plt.show()

# correlation analysis for feature selection: we select the features that have a high correlation with price: sqft_living, sqft_above, sqft_living15, bathrooms
#corr_matrix = df.corr()
#corr_matrix['price'].sort_values(ascending=False)

# distribution of our dependent variable (price)
#sb.distplot(df['price'], color = 'r')
#plt.show()

# Feature selection and train/test split.
# Predictors are four columns of df; the target is the price column.
feature_columns = ['sqft_living', 'sqft_above', 'sqft_living15', 'bathrooms']
x_var = df[feature_columns].values
y_var = df[['price']].values  # list-style selection keeps the target 2-D (one column)

# Hold out 20% of the rows as the test set; the fixed random_state makes
# the split reproducible from run to run.
(x_train, x_test,
 y_train, y_test) = train_test_split(
    x_var,
    y_var,
    test_size=0.2,
    random_state=0,
)

# Modeling: ordinary least squares (OLS).
# Fit on the training split, then predict the held-out test split.
ols = LinearRegression()
ols.fit(x_train, y_train)
ols_yhat = ols.predict(x_test)

# Model evaluation: print the explained variance score of the test-set
# predictions in bold.
print(cl('Explained Variance Score of OLS model is {}'.format(evs(y_test, ols_yhat)), attrs = ['bold']))

# Persist the fitted model to 'ols.pkl'. The context manager closes the
# file handle (flushing the pickle to disk) as soon as the dump finishes,
# even if pickle.dump raises, instead of leaving it open until garbage
# collection.
with open("ols.pkl", "wb") as model_file:
    pickle.dump(ols, model_file)
