In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load the dataset using the given file path
zip_sea = pd.read_csv('inputs/zip_sea.csv')

# Remove rows with missing values in 'Price' column (the regression target)
zip_sea = zip_sea.dropna(subset=['Price'])

# Create the model formula.
#   - `a * b` expands to both main effects plus their interaction (a + b + a:b).
#   - Q("...") quotes the column name because it contains a '/'.
#   - C(Pair) treats Pair as categorical (one dummy per level).
formula = 'Price ~ GMSL_noGIA * Q("Inland/Coastal") + C(Pair)'

# Split the dataset into training and testing sets (fixed seed for reproducibility)
train, test = train_test_split(zip_sea, test_size=0.2, random_state=42)

# Create and fit the model using the training set
model = smf.ols(formula, data=train).fit()

# Evaluate the model using the testing set and calculate the R-squared score.
# `test` was sliced out of `zip_sea` by train_test_split, so writing a column
# into it in place can raise pandas' SettingWithCopyWarning. assign() returns
# a new frame carrying the extra column instead of mutating the slice.
test = test.assign(PredPrice=model.predict(test))
r_score = r2_score(test['Price'], test['PredPrice'])
print('R-Squared Score: {:.3f}'.format(r_score))

# Save the test rows together with their predictions to outputs/predict.csv
test.to_csv('outputs/predict.csv', index=False)

# Output the model summary
print(model.summary())

R-Squared Score: 0.821
                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.887
Model:                            OLS   Adj. R-squared:                  0.886
Method:                 Least Squares   F-statistic:                     738.6
Date:                Tue, 02 May 2023   Prob (F-statistic):               0.00
Time:                        19:13:42   Log-Likelihood:                -50957.
No. Observations:                3791   AIC:                         1.020e+05
Df Residuals:                    3750   BIC:                         1.023e+05
Df Model:                          40                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the dataset using the given file path
zip_sea = pd.read_csv('inputs/zip_sea.csv')

# Remove rows with missing values in 'Price' column (the regression target)
zip_sea = zip_sea.dropna(subset=['Price'])

# Split the dataset into training and testing sets (fixed seed for reproducibility)
train, test = train_test_split(zip_sea, test_size=0.2, random_state=42)

# Define the preprocessor for categorical and continuous features.
# Columns not listed here are dropped by ColumnTransformer's default remainder.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['GMSL_noGIA']),
        # drop='first' omits one level per feature to avoid redundant dummies
        ('cat', OneHotEncoder(drop='first'), ['Inland/Coastal', 'Pair'])
    ])

# Create the Lasso model
lasso_model = Lasso(alpha=1.0, random_state=42)

# Create a pipeline with the preprocessor and the Lasso model
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('model', lasso_model)])

# Fit the pipeline using the training set
X_train = train.drop('Price', axis=1)
y_train = train['Price']
pipe.fit(X_train, y_train)

# Evaluate the model using the testing set and calculate the R-squared score.
# `test` was sliced out of `zip_sea` by train_test_split, so writing a column
# into it in place can raise pandas' SettingWithCopyWarning. assign() returns
# a new frame carrying the extra column instead of mutating the slice.
X_test = test.drop('Price', axis=1)
y_test = test['Price']
test = test.assign(PredPrice2=pipe.predict(X_test))
r_score = r2_score(y_test, test['PredPrice2'])
print('R-Squared Score: {:.3f}'.format(r_score))

# Save the test rows together with their predictions to outputs/predict2.csv
test.to_csv('outputs/predict2.csv', index=False)

# Names of the transformed features, in the same order as the fitted coefficients
feature_names = pipe.named_steps['preprocessor'].get_feature_names_out()

# Output the model coefficients
print('Lasso coefficients:')
for col, coef in zip(feature_names, pipe.named_steps['model'].coef_):
    print('{}: {:.3f}'.format(col, coef))

# Calculate the adjusted R-squared.
# NOTE: n is the size of the *test* set and p the number of transformed
# features, so this adjusts the out-of-sample R-squared computed above.
n = len(y_test)
p = len(feature_names)
adj_r2 = 1 - (1 - r_score) * (n - 1) / (n - p - 1)
print('Adjusted R-Squared Score: {:.3f}'.format(adj_r2))

R-Squared Score: 0.821
Lasso coefficients:
num__GMSL_noGIA: 12908.905
cat__Inland/Coastal_1: -31747.331
cat__Pair_2: -185556.493
cat__Pair_3: 6435.231
cat__Pair_4: 416644.087
cat__Pair_5: 506214.833
cat__Pair_6: 2771893.272
cat__Pair_7: -58563.409
cat__Pair_8: -7370.090
cat__Pair_9: -43153.045
cat__Pair_10: -68350.641
cat__Pair_11: -75242.546
cat__Pair_12: 147426.561
cat__Pair_13: 171846.136
cat__Pair_14: 165718.474
cat__Pair_15: 31824.994
cat__Pair_16: 72243.448
cat__Pair_17: -60963.862
cat__Pair_18: -71852.354
cat__Pair_19: 128558.225
cat__Pair_20: -17540.888
cat__Pair_21: -52054.301
cat__Pair_22: -145261.013
cat__Pair_23: 30298.935
cat__Pair_24: -95527.812
cat__Pair_25: -69900.003
cat__Pair_26: -84050.314
cat__Pair_27: -139371.155
cat__Pair_28: 192650.172
cat__Pair_29: 22039.136
cat__Pair_30: -67999.628
cat__Pair_31: 169789.735
cat__Pair_32: 160483.541
cat__Pair_33: 387827.952
cat__Pair_34: 2141562.718
cat__Pair_35: 2146731.438
cat__Pair_36: -69401.473
cat__Pair_37: -56127.588
cat__

In [8]:
import csv

# Read data from predict.csv.
# newline='' lets the csv module handle line endings (and newlines embedded in
# quoted fields) itself; utf-8 matches the default encoding of DataFrame.to_csv.
with open('outputs/predict.csv', 'r', newline='', encoding='utf-8') as csvfile1:
    predict_data = list(csv.reader(csvfile1))

# Read data from predict2.csv
with open('outputs/predict2.csv', 'r', newline='', encoding='utf-8') as csvfile2:
    predict2_data = list(csv.reader(csvfile2))

# Rows are merged purely by position, so the two files must have the same
# number of rows; fail loudly rather than truncate or hit a bare IndexError.
if len(predict_data) != len(predict2_data):
    raise ValueError(
        "Row count mismatch: predict.csv has {} rows, predict2.csv has {}".format(
            len(predict_data), len(predict2_data)))

# Combine the last column of predict2.csv into predict.csv.
# Row 0 is the header and gets the column name; every other row gets the
# last field of the row at the same position in predict2.csv.
for i, (row, row2) in enumerate(zip(predict_data, predict2_data)):
    row.append("PredPrice2" if i == 0 else row2[-1])

# Write the combined data to a new CSV file
with open('outputs/combined_predict.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows(predict_data)

print("Combined data saved to 'combined_predict.csv'")

Combined data saved to 'combined_predict.csv'
