In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import os

### Load the Data

In [46]:
# Locate the CSV files relative to the working directory: they sit in a
# "Data" folder that is a sibling of the directory the notebook runs from.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "Data")
data_path = os.path.join(data_dir, "train_clean.csv")
test_path = os.path.join(data_dir, "test_clean.csv")

# Read the labelled data and the unlabelled test data; the first CSV column
# becomes the row index of each frame.
data = pd.read_csv(data_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)

# Target is the SalePrice column; every remaining column is a feature.
y = data["SalePrice"]
X = data.drop(columns="SalePrice")

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=2024)
train_X, val_X, train_y, val_y = split_parts

### Preprocess (Imputation)

In [54]:
# Preprocessing
from sklearn.impute import SimpleImputer

# Every missing value is replaced with the constant 0 (strategy="constant");
# no per-column statistic such as the mean is computed.
SimImputer = SimpleImputer(strategy="constant", fill_value=0)

# Fit the imputer on the training split only, then apply that same fitted
# imputer to the validation split with transform(). Calling fit_transform()
# on the validation data would re-fit the imputer on held-out rows.
# The original column names and row index are restored so the feature frames
# stay label-aligned with train_y / val_y.
train_X_final = pd.DataFrame(
    SimImputer.fit_transform(train_X),
    columns=train_X.columns,
    index=train_X.index,
)
val_X_final = pd.DataFrame(
    SimImputer.transform(val_X),
    columns=val_X.columns,
    index=val_X.index,
)

### Linear Regression

In [55]:
# Create a Linear Regression model
model = LinearRegression()

# Fit the model on the imputed training split
model.fit(train_X_final, train_y)

# Predict on the held-out validation split (the test set is scored later)
y_pred = model.predict(val_X_final)

# Evaluate the model. mean_absolute_error returns the mean absolute error,
# so the result is named and labelled MAE rather than MSE.
mae = mean_absolute_error(val_y, y_pred)
print(f"MAE: {mae}")

mse: 21254.789757601447


### Generate Test Prediction

In [56]:
# Test set imputation
# Fill missing values with the imputer's constant (0) by reusing the imputer
# that was already fitted in the preprocessing step. transform() is used
# instead of fit_transform() so the test set never takes part in fitting.
# Column names and the row index of `test` are kept on the result.
test_final = pd.DataFrame(
    SimImputer.transform(test),
    columns=test.columns,
    index=test.index,
)

In [57]:
# Score the imputed test features with the fitted model.
test_pred = model.predict(test_final)

# Build the two-column submission table: the row index of `test` supplies
# the Id values and the model predictions supply SalePrice.
submission_columns = {"Id": test.index, "SalePrice": test_pred}
output = pd.DataFrame(submission_columns)

# Write the table to the working directory without the DataFrame's own index.
output.to_csv("submission.csv", index=False)