In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset using the given file path
zip_sea = pd.read_csv('inputs/zip_sea.csv')

# Remove rows with missing values in 'Price' column
zip_sea = zip_sea.dropna(subset=['Price'])

# Split the dataset into coastal and inland subsets
coastal_data = zip_sea[zip_sea['Inland/Coastal'] == 1]
inland_data = zip_sea[zip_sea['Inland/Coastal'] == 0]

# Get the X and y values for both coastal and inland data
X_coastal = coastal_data[['GMSL_noGIA']]
y_coastal = coastal_data['Price']
X_inland = inland_data[['GMSL_noGIA']]
y_inland = inland_data['Price']

# Split the coastal and inland datasets into training and testing sets
X_coastal_train, X_coastal_test, y_coastal_train, y_coastal_test = train_test_split(X_coastal, y_coastal, test_size=0.2, random_state=42)
X_inland_train, X_inland_test, y_inland_train, y_inland_test = train_test_split(X_inland, y_inland, test_size=0.2, random_state=42)

# Create and train the linear regression models for both coastal and inland data
coastal_model = LinearRegression()
inland_model = LinearRegression()

coastal_model.fit(X_coastal_train, y_coastal_train)
inland_model.fit(X_inland_train, y_inland_train)

# Evaluate the models using the testing sets and calculate the mean squared error and R-squared score
y_coastal_pred = coastal_model.predict(X_coastal_test)
y_inland_pred = inland_model.predict(X_inland_test)

mse_coastal = mean_squared_error(y_coastal_test, y_coastal_pred)
mse_inland = mean_squared_error(y_inland_test, y_inland_pred)

r2_coastal = r2_score(y_coastal_test, y_coastal_pred)
r2_inland = r2_score(y_inland_test, y_inland_pred)

print("Coastal Model: MSE =", mse_coastal, "R2 Score =", r2_coastal)
print("Inland Model: MSE =", mse_inland, "R2 Score =", r2_inland)

# Print the coefficients of the models
print("Coastal Model Coefficient:", coastal_model.coef_[0])
print("Inland Model Coefficient:", inland_model.coef_[0])

Coastal Model: MSE = 102638953429.67212 R2 Score = 0.055241377493726596
Inland Model: MSE = 391926714673.39233 R2 Score = 0.01867078612204509
Coastal Model Coefficient: 5548.0718445512775
Inland Model Coefficient: 8015.089114738455


In [2]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset using the given file path
zip_sea = pd.read_csv('inputs/zip_sea.csv')

# Remove rows with missing values in 'Price' column
zip_sea = zip_sea.dropna(subset=['Price'])

# Split the dataset into coastal and inland subsets
coastal_data = zip_sea[zip_sea['Inland/Coastal'] == 1]
inland_data = zip_sea[zip_sea['Inland/Coastal'] == 0]

# Get the X and y values for both coastal and inland data
X_coastal = coastal_data[['GMSL_noGIA']]
y_coastal = coastal_data['Price']
X_inland = inland_data[['GMSL_noGIA']]
y_inland = inland_data['Price']

# Split the coastal and inland datasets into training and testing sets
X_coastal_train, X_coastal_test, y_coastal_train, y_coastal_test = train_test_split(X_coastal, y_coastal, test_size=0.2, random_state=42)
X_inland_train, X_inland_test, y_inland_train, y_inland_test = train_test_split(X_inland, y_inland, test_size=0.2, random_state=42)

# Create and train the linear regression models for both coastal and inland data
coastal_model = LinearRegression()
inland_model = LinearRegression()

coastal_model.fit(X_coastal_train, y_coastal_train)
inland_model.fit(X_inland_train, y_inland_train)

# Evaluate the models using the testing sets and calculate the mean squared error and R-squared score
y_coastal_pred = coastal_model.predict(X_coastal_test)
y_inland_pred = inland_model.predict(X_inland_test)

mse_coastal = mean_squared_error(y_coastal_test, y_coastal_pred)
mse_inland = mean_squared_error(y_inland_test, y_inland_pred)

r2_coastal = r2_score(y_coastal_test, y_coastal_pred)
r2_inland = r2_score(y_inland_test, y_inland_pred)

print("Coastal Model: MSE =", mse_coastal, "R2 Score =", r2_coastal)
print("Inland Model: MSE =", mse_inland, "R2 Score =", r2_inland)

# Output model summaries using statsmodels
coastal_model_sm = sm.OLS(y_coastal_train, sm.add_constant(X_coastal_train)).fit()
inland_model_sm = sm.OLS(y_inland_train, sm.add_constant(X_inland_train)).fit()

print(coastal_model_sm.summary())
print(inland_model_sm.summary())

Coastal Model: MSE = 102638953429.67212 R2 Score = 0.055241377493726596
Inland Model: MSE = 391926714673.39233 R2 Score = 0.01867078612204509
                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.037
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     79.39
Date:                Tue, 02 May 2023   Prob (F-statistic):           1.10e-18
Time:                        16:31:37   Log-Likelihood:                -28994.
No. Observations:                2056   AIC:                         5.799e+04
Df Residuals:                    2054   BIC:                         5.800e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025  

In [3]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset using the given file path
zip_sea = pd.read_csv('inputs/zip_sea.csv')

# Remove rows with missing values in 'Price' column
zip_sea = zip_sea.dropna(subset=['Price'])

# Create dummy variables for the 'Pair' column
pair_dummies = pd.get_dummies(zip_sea['Pair'], prefix='Pair', drop_first=True)
zip_sea = pd.concat([zip_sea, pair_dummies], axis=1)

# Get the X and y values
X = zip_sea[['Inland/Coastal', 'GMSL_noGIA'] + list(pair_dummies.columns)]
y = zip_sea['Price']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model using the testing set and calculate the mean squared error and R-squared score
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model: MSE =", mse, "R2 Score =", r2)

# Output model summary using statsmodels
model_sm = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(model_sm.summary())

Model: MSE = 43955404998.66471 R2 Score = 0.8209418082930526
                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.887
Model:                            OLS   Adj. R-squared:                  0.886
Method:                 Least Squares   F-statistic:                     756.3
Date:                Tue, 02 May 2023   Prob (F-statistic):               0.00
Time:                        16:34:07   Log-Likelihood:                -50960.
No. Observations:                3791   AIC:                         1.020e+05
Df Residuals:                    3751   BIC:                         1.022e+05
Df Model:                          39                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------

In [12]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

# Load the dataset using the given file path
zip_sea = pd.read_csv('inputs/zip_sea.csv')

# Remove rows with missing values in 'Price' column
zip_sea = zip_sea.dropna(subset=['Price'])

# Create the model formula
formula = 'Price ~ Date * Q("Inland/Coastal") + C(Pair)'

# Split the dataset into training and testing sets
train, test = train_test_split(zip_sea, test_size=0.2, random_state=42)

# Create and fit the model using the training set
model = smf.ols(formula, data=train).fit()

# Evaluate the model using the testing set and calculate the R-squared score
test['PredPrice'] = model.predict(test)
r2 = sm.tools.eval_measures.r_squared(test['Price'], test['PredPrice'])

print("Model: R2 Score =", r2)

# Output the model summary
print(model.summary())

AttributeError: module 'statsmodels.tools.eval_measures' has no attribute 'r_squared'