In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func

In [None]:
import os
from urllib.parse import quote_plus

# Connection settings come from the environment so that no secret is stored in
# the notebook (set DB_USERNAME / DB_PASSWORD / DB_HOST before starting the kernel).
username = os.environ.get("DB_USERNAME", "postgres")
password = os.environ.get("DB_PASSWORD", "")
db_host = os.environ.get("DB_HOST", "database-group8.cinukpl9yba8.us-east-2.rds.amazonaws.com")
# quote_plus keeps special characters in the credentials from breaking the URL.
db_string = f"postgresql://{quote_plus(username)}:{quote_plus(password)}@{db_host}"

In [None]:
# Engine shared by the pd.read_sql and DataFrame.to_sql calls in this notebook.
engine = create_engine(db_string)

In [None]:
# Read every borough's "<Borough>Full" table, stack them into one city-wide
# frame with a fresh index, then keep only the columns used downstream.

boroughs = ["Bronx", "Brooklyn", "Manhattan", "Queens", "StatenIsland"]
borough_frames = [
    pd.read_sql(f'SELECT * FROM public."{borough}Full"', engine)
    for borough in boroughs
]

city_df = pd.concat(borough_frames, ignore_index=True)

# .copy() makes city_df an independent frame, so the column assignments made in
# later cells (Year, Month, $sqf, ...) do not trigger SettingWithCopyWarning.
city_df = city_df[["Borough", "Neighborhood", "Building_Class_Category", "Address", "Apartment_Number",
                   "Zip_Code", "Residential_Units",
                   "Gross_Square_Feet",
                   "Sale_Price", "Sale_Date"]].copy()
    

In [None]:
city_df.head()

Unnamed: 0,Borough,Neighborhood,Building_Class_Category,Address,Apartment_Number,Zip_Code,Residential_Units,Gross_Square_Feet,Sale_Price,Sale_Date
0,2,BATHGATE,01 ONE FAMILY DWELLINGS,412 EAST 179 STREET,,10457,1,2048,0,2017-04-04
1,2,BATHGATE,01 ONE FAMILY DWELLINGS,410 EAST 182ND STREET,,10457,1,1460,305000,2017-07-18
2,2,BATHGATE,01 ONE FAMILY DWELLINGS,412 EAST 182 STREET,,10457,1,1440,178000,2017-01-19
3,2,BATHGATE,01 ONE FAMILY DWELLINGS,412 EAST 182 STREET,,10457,1,1440,449000,2017-07-14
4,2,BATHGATE,01 ONE FAMILY DWELLINGS,4455 PARK AVENUE,,10457,1,1587,140000,2017-05-12


In [None]:
# Derive the calendar year and month of each sale from its Sale_Date.
sale_dates = pd.DatetimeIndex(city_df["Sale_Date"])
city_df["Year"], city_df["Month"] = sale_dates.year, sale_dates.month

In [None]:
# drop value incompatible with integer 
#value = city_df[city_df["Sale_Price"]=="2210000000"]
#value
#city_df.drop(city_df[city_df["Sale_Price"]=="2210000000"].index, inplace=True)

In [None]:
city_df.dtypes

Borough                     int64
Neighborhood               object
Building_Class_Category    object
Address                    object
Apartment_Number           object
Zip_Code                    int64
Residential_Units           int64
Gross_Square_Feet           int64
Sale_Price                  int64
Sale_Date                  object
Year                        int64
Month                       int64
dtype: object

In [None]:
# Number of nulls per column for every (Borough, Year) group.
null_summary = city_df.isnull().groupby(by=[city_df.Borough, city_df.Year], dropna=False).sum()
# Use the fully qualified option name: the short pattern 'max_rows' also matches
# 'styler.render.max_rows' in newer pandas and is rejected as ambiguous.
pd.set_option('display.max_rows', 100)

In [None]:
city_df.Building_Class_Category.unique()

array(['01 ONE FAMILY DWELLINGS                    ',
       '02 TWO FAMILY DWELLINGS                    ',
       '03 THREE FAMILY DWELLINGS                  ',
       '07 RENTALS - WALKUP APARTMENTS             ',
       '09 COOPS - WALKUP APARTMENTS               ',
       '10 COOPS - ELEVATOR APARTMENTS             ',
       '14 RENTALS - 4-10 UNIT                     ',
       '21 OFFICE BUILDINGS                        ',
       '22 STORE BUILDINGS                         ',
       '27 FACTORIES                               ',
       '29 COMMERCIAL GARAGES                      ',
       '30 WAREHOUSES                              ',
       '31 COMMERCIAL VACANT LAND                  ',
       '37 RELIGIOUS FACILITIES                    ',
       '04 TAX CLASS 1 CONDOS                      ',
       '05 TAX CLASS 1 VACANT LAND                 ',
       '06 TAX CLASS 1 - OTHER                     ',
       '08 RENTALS - ELEVATOR APARTMENTS           ',
       '41 TAX CLASS 4 - OTH

In [None]:
city_df.columns

Index(['Borough', 'Neighborhood', 'Building_Class_Category', 'Address',
       'Apartment_Number', 'Zip_Code', 'Residential_Units',
       'Gross_Square_Feet', 'Sale_Price', 'Sale_Date', 'Year', 'Month'],
      dtype='object')

In [None]:
# Does `Residential_Units > 0` exclude the non-residential building classes?

# two-character class codes present anywhere in the data
total_cat = city_df.Building_Class_Category.str[:2].unique().tolist()

# two-character class codes among rows with at least one residential unit
res_filtered = (
    city_df.loc[city_df.Residential_Units > 0, "Building_Class_Category"]
    .str[:2]
    .unique()
    .tolist()
)

# codes that occur in the data but never with Residential_Units > 0
res_class = list(filter(lambda code: code not in res_filtered, total_cat))
print(res_class)

# Residential_Units > 0 does not filter out non residential building class categories

['47', '18', '24', '49', '42', '45']


In [None]:
# Residential_Units is not used from here on; remove the column
city_df = city_df.drop(columns='Residential_Units')

In [None]:
# keep only rows with a positive gross square footage
has_area = city_df["Gross_Square_Feet"].gt(0)
city_df = city_df.loc[has_area]

In [None]:
 # drop rows with price <= 1000 (only sales strictly above 1000 are kept)
 city_df = city_df[(city_df.Sale_Price > 1000)]

In [None]:
# Keep only rows whose two-character building class code is a target residential class.
residential_classes = [
    "01", "02", "03", "04", "06", "07", "08", "09", "10", "11",
    "12", "13", "14", "15", "16", "17", "23", "45", "49",
]
class_code = city_df["Building_Class_Category"].str.slice(0, 2)
city_df = city_df.loc[class_code.isin(residential_classes)]

In [None]:
city_df.Building_Class_Category.unique()

array(['01 ONE FAMILY DWELLINGS                    ',
       '02 TWO FAMILY DWELLINGS                    ',
       '03 THREE FAMILY DWELLINGS                  ',
       '07 RENTALS - WALKUP APARTMENTS             ',
       '14 RENTALS - 4-10 UNIT                     ',
       '08 RENTALS - ELEVATOR APARTMENTS           ',
       '09 COOPS - WALKUP APARTMENTS               ',
       '10 COOPS - ELEVATOR APARTMENTS             ',
       '06 TAX CLASS 1 - OTHER                     ',
       '01  ONE FAMILY DWELLINGS                    ',
       '02  TWO FAMILY DWELLINGS                    ',
       '03  THREE FAMILY DWELLINGS                  ',
       '07  RENTALS - WALKUP APARTMENTS             ',
       '08  RENTALS - ELEVATOR APARTMENTS           ',
       '14  RENTALS - 4-10 UNIT                     ',
       '10  COOPS - ELEVATOR APARTMENTS             ',
       '23  LOFT BUILDINGS                          ',
       '06  TAX CLASS 1 - OTHER                     ',
       '09  COOPS -

In [None]:
# Price per square foot, then its 5th / 95th percentile for each (Borough, Year)
city_df["$sqf"] = city_df["Sale_Price"].div(city_df["Gross_Square_Feet"])
sqf_by_group = city_df.groupby(["Borough", "Year"])["$sqf"]
lower_quant, upper_quant = sqf_by_group.quantile(0.05), sqf_by_group.quantile(0.95)
quant_df = pd.concat({"lower": lower_quant, "upper": upper_quant}, axis=1, join="outer")
quant_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lower,upper
Borough,Year,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2003,2.654499,1005.813794
1,2004,2.246501,1156.235091
1,2005,11.591491,1289.434669
1,2006,12.67495,1441.084313
1,2007,28.830715,1960.421904


In [None]:
city_df[["Year", "$sqf"]].groupby(["Year"]).describe()

Unnamed: 0_level_0,$sqf,$sqf,$sqf,$sqf,$sqf,$sqf,$sqf,$sqf
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2003,42135.0,302.446166,10797.880889,0.032475,121.666667,175.177305,233.198952,1500000.0
2004,46818.0,346.431602,19802.950856,0.014133,145.955474,204.761498,268.41941,3800000.0
2005,45309.0,559.950528,26218.737183,0.042885,175.852268,240.590184,314.655172,3914950.0
2006,40835.0,448.786294,15580.248035,0.083206,194.886404,263.333333,341.513284,2343819.0
2007,31843.0,471.269356,13785.236745,0.079915,200.226359,270.292208,355.337931,1900000.0
2008,23683.0,368.584505,13052.40509,0.06213,182.807285,251.666667,333.846198,2000000.0
2009,20664.0,303.14311,7547.81407,0.104436,163.413044,230.983922,309.214623,1085000.0
2010,21189.0,424.173102,9035.117068,0.092903,156.25,228.8125,312.5,690000.0
2011,19060.0,281.949256,2040.796368,0.086574,151.791875,225.0,311.856942,232500.0
2012,21548.0,309.167406,3670.432499,0.037693,155.343707,231.812188,325.592569,515125.0


In [None]:
# Broadcast each (Borough, Year) group's 5% / 95% $/sqf quantile onto its rows
for column, q in (("Lower_Quant", 0.05), ("Upper_Quant", 0.95)):
    city_df[column] = (
        city_df.groupby(["Borough", "Year"])["$sqf"]
        .transform(lambda prices, q=q: prices.quantile(q))
    )
city_df.shape

(447752, 14)

In [None]:
# rows whose $/sqf lies strictly outside their group's quantile band
below_band = city_df["$sqf"].lt(city_df["Lower_Quant"])
above_band = city_df["$sqf"].gt(city_df["Upper_Quant"])
rejected = city_df.loc[below_band | above_band]
rejected.head()

Unnamed: 0,Borough,Neighborhood,Building_Class_Category,Address,Apartment_Number,Zip_Code,Gross_Square_Feet,Sale_Price,Sale_Date,Year,Month,$sqf,Lower_Quant,Upper_Quant
4,2,BATHGATE,01 ONE FAMILY DWELLINGS,4455 PARK AVENUE,,10457,1587,140000,2017-05-12,2017,5,88.216761,90.414954,435.535714
9,2,BATHGATE,01 ONE FAMILY DWELLINGS,1948 BATHGATE AVENUE,,10457,2047,127000,2017-08-22,2017,8,62.042013,90.414954,435.535714
21,2,BATHGATE,02 TWO FAMILY DWELLINGS,466 EAST 185 STREET,,10458,1944,875025,2017-02-06,2017,2,450.115741,90.414954,435.535714
28,2,BATHGATE,02 TWO FAMILY DWELLINGS,507 EAST 183 STREET,,10458,2805,18000,2017-06-07,2017,6,6.417112,90.414954,435.535714
77,2,BAYCHESTER,01 ONE FAMILY DWELLINGS,1409 OAKLEY STREET,,10469,1395,10000,2017-09-13,2017,9,7.168459,90.414954,435.535714


In [None]:
# Keep rows strictly inside their group's quantile band
# (values exactly equal to a bound are excluded by the strict comparisons)
inside_band = city_df["$sqf"].gt(city_df["Lower_Quant"]) & city_df["$sqf"].lt(city_df["Upper_Quant"])
city_df = city_df.loc[inside_band]
city_df.shape

(402900, 14)

In [None]:
city_df[["Year", "$sqf"]].groupby(["Year"]).describe()

Unnamed: 0_level_0,$sqf,$sqf,$sqf,$sqf,$sqf,$sqf,$sqf,$sqf
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2003,37915.0,180.934275,77.161598,2.656748,127.777778,175.175175,225.903614,1005.373303
2004,42132.0,211.602749,86.971465,2.282504,152.744878,204.761498,260.416667,1150.793651
2005,40773.0,249.220193,96.635548,11.65057,183.333333,240.590184,305.357143,1285.027696
2006,36747.0,272.436045,104.087554,12.745706,202.405659,263.333333,331.8334,1440.635867
2007,28654.0,286.004425,132.61305,29.546563,208.398262,270.284185,344.387755,1959.459459
2008,21308.0,267.540801,132.070075,10.884438,190.839695,251.666667,323.83296,2083.333333
2009,18594.0,242.942032,112.878709,2.518892,171.475805,230.983922,299.744898,1771.703366
2010,19062.0,245.659918,135.267597,29.182259,164.931013,228.800019,302.036757,1973.886329
2011,17149.0,244.532723,138.108457,32.284428,160.25641,225.0,300.0,1800.271739
2012,19390.0,258.790425,164.544312,47.061524,163.904863,231.812188,313.617103,2074.652778


In [None]:
# addresses that repeat an earlier row's address (the first occurrence is not counted)
duplicated_address = city_df.loc[city_df["Address"].duplicated(), "Address"]
duplicated_address.shape

(72585,)

In [None]:
# number of apartments can't fill duplicated addresses
# (apt_df holds the distinct Apartment_Number values; its shape is how many there are)
apt_df = city_df["Apartment_Number"].unique()
apt_df.shape

(294,)

In [None]:
# Preview of city_df without Apartment_Number. drop() returns a new frame and
# the result is not assigned, so city_df itself still contains the column.
city_df.drop(["Apartment_Number"], axis=1)

Unnamed: 0,Borough,Neighborhood,Building_Class_Category,Address,Zip_Code,Gross_Square_Feet,Sale_Price,Sale_Date,Year,Month,$sqf,Lower_Quant,Upper_Quant
1,2,BATHGATE,01 ONE FAMILY DWELLINGS,410 EAST 182ND STREET,10457,1460,305000,2017-07-18,2017,7,208.904110,90.414954,435.535714
2,2,BATHGATE,01 ONE FAMILY DWELLINGS,412 EAST 182 STREET,10457,1440,178000,2017-01-19,2017,1,123.611111,90.414954,435.535714
3,2,BATHGATE,01 ONE FAMILY DWELLINGS,412 EAST 182 STREET,10457,1440,449000,2017-07-14,2017,7,311.805556,90.414954,435.535714
5,2,BATHGATE,01 ONE FAMILY DWELLINGS,4445 PARK AVENUE,10457,1497,246000,2017-11-06,2017,11,164.328657,90.414954,435.535714
6,2,BATHGATE,01 ONE FAMILY DWELLINGS,4348 PARK AVENUE,10457,1764,420000,2017-02-03,2017,2,238.095238,90.414954,435.535714
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1432449,5,WOODROW,02 TWO FAMILY DWELLINGS,65 ROBIN COURT,10309,2160,685000,2017-08-18,2017,8,317.129630,142.045455,562.333960
1432450,5,WOODROW,02 TWO FAMILY DWELLINGS,86 ROBIN COURT,10309,2500,627500,2017-12-18,2017,12,251.000000,142.045455,562.333960
1432451,5,WOODROW,02 TWO FAMILY DWELLINGS,23 QUAIL LANE,10309,2575,690000,2017-09-14,2017,9,267.961165,142.045455,562.333960
1432452,5,WOODROW,02 TWO FAMILY DWELLINGS,32 PHEASANT LANE,10309,2377,550000,2017-04-21,2017,4,231.384098,142.045455,562.333960


In [None]:
# checking for null values per borough
# (DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# groupby(level=0, sort=False) is the replacement and keeps the boroughs in
# first-seen order, as before)
na = city_df.set_index(["Borough"]).isna().groupby(level=0, sort=False).sum()
na.head()

Unnamed: 0_level_0,Neighborhood,Building_Class_Category,Address,Apartment_Number,Zip_Code,Gross_Square_Feet,Sale_Price,Sale_Date,Year,Month,$sqf,Lower_Quant,Upper_Quant
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# move the current row labels into an 'index' column and renumber rows from 0
city_df = city_df.reset_index()


In [None]:
# Split the cleaned data into an address table and a sales table
address_columns = ["Address", "Building_Class_Category", "Borough", "Neighborhood", "Zip_Code"]
sale_columns = ["Gross_Square_Feet", "Sale_Price", "Sale_Date", "Year"]
address_df = city_df.loc[:, address_columns]
sale_df = city_df.loc[:, sale_columns]

In [None]:
# writing tables to sql/csv
# chunksize bounds the number of rows per multi-row INSERT; without it
# method='multi' writes the whole frame in a single statement.
address_df.to_sql(name='Address', con=engine, method='multi', if_exists='replace', chunksize=10000)
#sale_df.to_csv("NY_Propety_Sales_Table.csv")
sale_df.to_sql(name='Sales', con=engine, method='multi', if_exists='replace', chunksize=10000)
#address_df.to_csv("NY_Property_Address_Table.csv")


In [None]:
pip install hvplot 



In [None]:
import warnings
# Silences every warning for the rest of the session, including deprecation
# warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Import required libraries and dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import hvplot.pandas
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path
from collections import Counter
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func

In [None]:
import os
from urllib.parse import quote_plus

# Connection settings come from the environment so that no secret is stored in
# the notebook (set DB_USERNAME / DB_PASSWORD / DB_HOST before starting the kernel).
username = os.environ.get("DB_USERNAME", "postgres")
password = os.environ.get("DB_PASSWORD", "")
db_host = os.environ.get("DB_HOST", "database-group8.cinukpl9yba8.us-east-2.rds.amazonaws.com")
# quote_plus keeps special characters in the credentials from breaking the URL.
db_string = f"postgresql://{quote_plus(username)}:{quote_plus(password)}@{db_host}"

In [None]:
# Engine used by the pd.read_sql call below.
engine = create_engine(db_string)

In [None]:
# Read the whole public."Sales_join_Address" table into a DataFrame.
# NOTE(review): the remaining visible cells work on city_df, not on this frame - confirm which one is intended.
sales_join_address = pd.read_sql('SELECT * FROM public."Sales_join_Address"',engine)

In [None]:
sales_join_address.columns

Index(['index', 'Gross_Square_Feet', 'Sale_Price', 'Sale_Date', 'Year',
       'Address', 'Building_Class_Category', 'Borough', 'Neighborhood',
       'Zip_Code'],
      dtype='object')

In [None]:
# Candidate feature columns and the regression target.
# NOTE(review): "Residential_Units" and "Year_Built" do not appear in the column
# listings shown for city_df or Sales_join_Address in this notebook - confirm
# before using this list to select columns.
columns = ["Neighborhood", "Building_Class_Category", "Address", "Zip_Code", "Residential_Units", "Gross_Square_Feet", "Year_Built", "Sale_Date", "Borough"]
Target = ["Sale_Price"]

In [None]:
city_df.columns

Index(['index', 'Borough', 'Neighborhood', 'Building_Class_Category',
       'Address', 'Apartment_Number', 'Zip_Code', 'Gross_Square_Feet',
       'Sale_Price', 'Sale_Date', 'Year', 'Month', '$sqf', 'Lower_Quant',
       'Upper_Quant'],
      dtype='object')

In [None]:
sns.pairplot(city_df)

<seaborn.axisgrid.PairGrid at 0x7f6ac671e810>

In [None]:
# NOTE(review): by='Sale_Price' groups the histogram by every distinct sale
# price; presumably y='Sale_Price' was intended - confirm.
city_df.hvplot.hist(by='Sale_Price', subplots=False, width=1000)

In [None]:
city_df.hvplot.hist("Sale_Price")

In [None]:
city_df.hvplot.scatter(x='Gross_Square_Feet', y='Sale_Price')

In [None]:
# Correlate numeric columns only: DataFrame.corr() raises on text columns in
# pandas 2.x, and older versions silently ignored them, so the result is the same.
sns.heatmap(city_df.select_dtypes(include="number").corr(), annot=True)

In [None]:
# Parse Sale_Date into datetimes; values that cannot be parsed become NaT (errors='coerce').
city_df["Sale_Date"] = pd.to_datetime(city_df["Sale_Date"], errors='coerce')
city_df.head()

In [None]:
# Reformat Sale_Date as a "YYYYMM" string. The column must hold datetimes when
# this runs, so re-running this cell on its own fails (.dt needs datetime values).
city_df["Sale_Date"] = city_df["Sale_Date"].dt.strftime("%Y%m")
city_df.head()

In [None]:
city_df.Neighborhood.unique()

In [None]:
city_df.Neighborhood.value_counts()

In [None]:
# Features: every column except the sale price
X = city_df.drop(columns='Sale_Price')

# Target: the sale price itself
y = city_df['Sale_Price']

In [None]:
X.describe()

In [None]:
# Frequency of each distinct sale price in the target
y.value_counts()

In [None]:
# use label encoder to change Neighborhood from string to integer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode Neighborhood. fit() returns the encoder itself, so n_enc and
# le refer to the same fitted object.
le = LabelEncoder()
new_n = le.fit_transform(city_df["Neighborhood"])
n_enc = le
new_n

In [None]:
n_enc.inverse_transform(new_n)

In [None]:
city_df["enc_neighborhood"] = new_n
city_df

In [None]:
city_df["Building_Class_Category"].value_counts()

In [None]:
# Use a dedicated encoder: re-fitting the shared `le` would overwrite the
# neighborhood classes held by `n_enc`, which is the same object.
# Runs of whitespace are collapsed first so that spelling variants such as
# "01 ONE FAMILY ..." and "01  ONE FAMILY ..." receive the same label.
building_class = city_df["Building_Class_Category"].str.split().str.join(" ")
b_enc = LabelEncoder().fit(building_class)
new_b = b_enc.transform(building_class)
new_b

In [None]:
b_enc.inverse_transform(new_b)

In [None]:
city_df["enc_Building Class Category"] = new_b
city_df

In [None]:
# Integer-encode Address with a fresh encoder; a_enc and le are the same fitted object.
le = LabelEncoder()
new_a = le.fit_transform(city_df["Address"])
a_enc = le
new_a

In [None]:
a_enc.inverse_transform(new_a)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the three text columns
categorical = city_df[["Neighborhood", "Building_Class_Category", "Address"]]
enc = OneHotEncoder()
enc_df = enc.fit_transform(categorical)
enc_df

In [None]:
# Correlate numeric columns only: DataFrame.corr() raises on text columns in
# pandas 2.x, and older versions silently ignored them, so the result is the same.
city_df.select_dtypes(include="number").corr()

In [None]:
city_df.describe()

In [None]:
# Sale price against floor area, colored by encoded neighborhood
plt.scatter(x=city_df["Gross_Square_Feet"], y=city_df["Sale_Price"], c=city_df["enc_neighborhood"])
plt.ylabel('Sale_Price')
plt.xlabel('Gross_Square_Feet')
plt.show()

In [None]:
# Feature matrix: city_df minus the target (Sale_Price), the raw text columns
# and the intermediate index / $sqf / quantile columns.
X = city_df.drop(columns = ["Neighborhood", "Building_Class_Category", "Sale_Price", "Address", "index", "$sqf", "Lower_Quant", "Upper_Quant", "Apartment_Number"])

In [None]:
X.shape

In [None]:
y = city_df.Sale_Price

In [None]:
X

In [None]:
# Hold out the last 20% of rows as the test set. The boundary is derived from
# the data rather than a hard-coded row count (integer math avoids float rounding).
# NOTE(review): this is a positional split and the rows were concatenated
# borough by borough, so the tail is not a random sample - confirm this is intended.
train_rows = len(X) * 8 // 10
x_train = X.iloc[:train_rows]
x_test = X.iloc[train_rows:]
x_test

In [None]:
# First 80% of the target rows for training, the rest for testing; the boundary
# is derived from the data rather than a hard-coded row count.
train_rows = len(y) * 8 // 10
y_train = y.iloc[:train_rows]
y_test = y.iloc[train_rows:]

In [None]:
# predict() returns plain arrays, which pd.concat cannot combine; wrap each in a
# Series carrying the index of its split so the values land on the matching rows.
# NOTE(review): train_pred / test_pred are first assigned in later cells of this
# notebook, so this cell fails on a fresh top-to-bottom run - confirm placement.
city_df["Current_Price_Predication"] = pd.concat([
    pd.Series(train_pred, index=y_train.index),
    pd.Series(test_pred, index=y_test.index),
])
city_df

In [None]:
pip install sklearn

In [None]:
pip install -U scikit-learn

In [None]:
pip install standard-scaler

In [None]:
pip install sts-pylib

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn import preprocessing


In [None]:
# Standardize features; the scaler's statistics are learned from the training rows only
sts = StandardScaler()
x_train_scaled = sts.fit_transform(x_train)
x_scale = sts
x_test_scaled = sts.transform(x_test)

In [None]:
model = LinearRegression()

In [None]:
model.fit(x_train_scaled, y_train)

In [None]:
y_pred = model.predict(x_test_scaled)
print(y_pred.shape)
# Score on the scaled features: the model was fitted on x_train_scaled, so the
# raw x_test is on the wrong scale and would give a meaningless R^2.
model.score(x_test_scaled, y_test)

In [None]:
reg = LinearRegression()

In [None]:
reg.fit(x_train,y_train)

In [None]:
# Score on the held-out rows only; scoring on all of X, y would include the
# rows the model was fitted on.
reg.score(x_test, y_test)

In [None]:
from sklearn import ensemble
# loss is left at its default (squared error): the old alias 'ls' was deprecated
# in scikit-learn 1.0 and later removed. random_state makes the fit repeatable.
clf = ensemble.GradientBoostingRegressor(n_estimators=400, max_depth=5, min_samples_split=2,
                                         learning_rate=0.1, random_state=42)

In [None]:
clf.fit(x_train, y_train)

In [None]:
clf.score(x_test,y_test)

In [None]:
# Numeric features used by the models below, and the price target
feature_columns = ['Gross_Square_Feet', 'Year', 'enc_neighborhood', 'enc_Building Class Category']
y = city_df['Sale_Price']
X = city_df.loc[:, feature_columns]

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split; random_state fixes the split so it is repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model):
    """Return the mean 10-fold cross-validation score of `model`.

    Scores come from `cross_val_score` on the notebook-level `X` and `y`.
    """
    fold_scores = cross_val_score(model, X, y, cv=10)
    return fold_scores.mean()

def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R2 of `predicted` against `true`, then a rule line."""
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    report = (
        ('MAE:', mae),
        ('MSE:', mse),
        ('RMSE:', np.sqrt(mse)),
        ('R2 Square', metrics.r2_score(true, predicted)),
    )
    for label, value in report:
        print(label, value)
    print('__________________________________')
    
def evaluate(true, predicted):
    """Return (MAE, MSE, RMSE, R2) of `predicted` against `true` as a tuple."""
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    return mae, mse, np.sqrt(mse), metrics.r2_score(true, predicted)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardize features; the scaler is fitted on the training split only and
# then applied to the test split. X_train / X_test are rebound to the scaled values.
scaling_steps = [('std_scalar', StandardScaler())]
pipeline = Pipeline(scaling_steps)

X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so it is no
# longer passed; X_train has already been standardized by the scaling pipeline.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

In [None]:
# print the intercept
print(lin_reg.intercept_)

In [None]:
coeff_df = pd.DataFrame(lin_reg.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
pred = lin_reg.predict(X_test)

In [None]:
pd.DataFrame({'True Values': y_test, 'Predicted Values': pred}).hvplot.scatter(x='True Values', y='Predicted Values')

In [None]:
pd.DataFrame({'Error Values': (y_test - pred)}).hvplot.kde()

In [None]:
# Predictions of the linear model on both splits, then the metric report for each
train_pred = lin_reg.predict(X_train)
test_pred = lin_reg.predict(X_test)

print('Test set evaluation:', '_____________________________________', sep='\n')
print_evaluate(y_test, test_pred)
print('Train set evaluation:', '_____________________________________', sep='\n')
print_evaluate(y_train, train_pred)

In [None]:
# First row of the model comparison table: hold-out metrics of the fitted model
# from evaluate(), plus the mean cross-validation score of a fresh LinearRegression().
results_df = pd.DataFrame(data=[["Linear Regression", *evaluate(y_test, test_pred) , cross_val(LinearRegression())]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
results_df

In [None]:
from sklearn.linear_model import RANSACRegressor

# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases; random_state
# makes RANSAC's random subsampling repeatable.
model = RANSACRegressor(LinearRegression(), max_trials=100, random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# One-row frame with the hold-out metrics and mean cross-validation score of the robust model
results_df_2 = pd.DataFrame(data=[["Robust Regression", *evaluate(y_test, test_pred) , cross_val(RANSACRegressor())]],
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression on the standardized features
model = Ridge(alpha=100, solver='cholesky', tol=0.0001, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print('Test set evaluation:', '_____________________________________', sep='\n')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:', '_____________________________________', sep='\n')
print_evaluate(y_train, train_pred)

In [None]:
# One-row frame with the hold-out metrics and mean cross-validation score for Ridge
results_df_2 = pd.DataFrame(data=[["Ridge Regression", *evaluate(y_test, test_pred) , cross_val(Ridge())]],
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression on the training split.
# positive=True restricts every coefficient to be non-negative.
model = Lasso(alpha=0.1, 
              precompute=True, 
#               warm_start=True, 
              positive=True, 
              selection='random',
              random_state=42)
model.fit(X_train, y_train)

# Predictions on both splits, then the metric report for each
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# One-row frame with the hold-out metrics and mean cross-validation score for Lasso
results_df_2 = pd.DataFrame(data=[["Lasso Regression", *evaluate(y_test, test_pred) , cross_val(Lasso())]],
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net regression on the training split
model = ElasticNet(alpha=0.1, l1_ratio=0.9, selection='random', random_state=42)
model.fit(X_train, y_train)

train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print('Test set evaluation:', '_____________________________________', sep='\n')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:', '_____________________________________', sep='\n')
print_evaluate(y_train, train_pred)

In [None]:
# One-row frame with the hold-out metrics and mean cross-validation score for ElasticNet
results_df_2 = pd.DataFrame(data=[["Elastic Net Regression", *evaluate(y_test, test_pred) , cross_val(ElasticNet())]],
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the features to all degree-2 polynomial terms; the expansion is fitted
# on the training split and re-used for the test split.
poly_reg = PolynomialFeatures(degree=2)

X_train_2_d = poly_reg.fit_transform(X_train)
X_test_2_d = poly_reg.transform(X_test)

# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so it is no
# longer passed. Note that this rebinds `lin_reg` to the polynomial model.
lin_reg = LinearRegression()
lin_reg.fit(X_train_2_d,y_train)

test_pred = lin_reg.predict(X_test_2_d)
train_pred = lin_reg.predict(X_train_2_d)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# predict() returns plain arrays, which pd.concat cannot combine; wrap each in a
# Series carrying the index of its split so that every prediction is aligned
# with the city_df row it was made for.
city_df["Prediction"] = pd.concat([
    pd.Series(train_pred, index=y_train.index),
    pd.Series(test_pred, index=y_test.index),
])
city_df

In [None]:
# One-row frame with the hold-out metrics of the polynomial model
# (no cross-validation is run for it; 0 is stored as a placeholder)
results_df_2 = pd.DataFrame(data=[["Polynomial Regression", *evaluate(y_test, test_pred), 0]],
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

In [None]:
from sklearn.linear_model import SGDRegressor

# random_state fixes the sample shuffling so the stochastic fit is repeatable
# between runs (the other seeded models in this notebook use 42 as well).
sgd_reg = SGDRegressor(n_iter_no_change=250, penalty=None, eta0=0.0001, max_iter=100000,
                       random_state=42)
sgd_reg.fit(X_train, y_train)

test_pred = sgd_reg.predict(X_test)
train_pred = sgd_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# One-row frame with the hold-out metrics of the SGD model
# (no cross-validation is run for it; 0 is stored as a placeholder)
results_df_2 = pd.DataFrame(data=[["Stochastic Gradient Descent", *evaluate(y_test, test_pred), 0]],
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

In [None]:
# NOTE(review): `pred` was last assigned in the Ridge cell, not by the SGD model
# fitted just above; presumably `test_pred` was intended here - confirm.
pd.DataFrame({'True Values': y_test, 'Predicted Values': pred}).hvplot.scatter(x='True Values', y='Predicted Values')

In [None]:
from sklearn.svm import SVR

# Support vector regression with an RBF kernel, fitted on the training split.
# NOTE(review): fitting SVR on the full training split may take a long time - confirm it is feasible.
svm_reg = SVR(kernel='rbf', C=1000000, epsilon=0.001)
svm_reg.fit(X_train, y_train)

# Predictions on both splits, then the metric report for each
test_pred = svm_reg.predict(X_test)
train_pred = svm_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)

print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
# One-row frame with the hold-out metrics of the SVM regressor
# (no cross-validation is run for it; 0 is stored as a placeholder)
results_df_2 = pd.DataFrame(data=[["SVM Regressor", *evaluate(y_test, test_pred), 0]],
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', 'Cross Validation'])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

In [None]:
# Horizontal bar chart of R2 per model (models become the row labels)
results_df = results_df.set_index('Model')
results_df.loc[:, 'R2 Square'].plot.barh(figsize=(12, 8))

In [None]:
import pickle

In [None]:
# save the model
filename = 'keras_finalized_model.sav'
# The path is passed through `filename` (the original referenced an undefined,
# unquoted name) and the with-block closes the file after writing.
# NOTE(review): no object named `keras` is defined in the visible cells -
# confirm which fitted model is meant to be saved here.
with open(filename, 'wb') as model_file:
    pickle.dump(keras, model_file)

In [None]:
# Save Model Using joblib
import joblib
filename = 'linear_finalized_model_joblib.sav'
# The path is passed through `filename` (the original referenced an undefined,
# unquoted name). `linear_regression` is not defined in the visible cells;
# `lin_reg` is the LinearRegression fitted in this notebook.
# NOTE(review): by this point lin_reg has been re-fitted on the degree-2
# polynomial features - confirm that this is the model to persist.
joblib.dump(lin_reg, filename)