In [11]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the three competition CSV files from the working directory,
# in the order: training data, test data, sample submission.
train_ds, test_ds, sample_submission_ds = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [12]:
# Data Preprocessing
# Column to predict, and the predictor columns chosen as likely to
# influence the sale price.
target = 'SalePrice'
features = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath']

# Pull the predictor columns out of the training frame, then replace any
# missing entries with the median of the column they belong to.
raw_features = train_ds[features]
column_medians = raw_features.median()
x = raw_features.fillna(column_medians)

# Target series taken from the training frame.
y = train_ds[target]

In [13]:
# Train-Test Split
# Hold out 20% of the rows as a validation set; the fixed random_state
# makes the shuffle (and therefore the split) repeatable across runs.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Train
# Create the linear regression model and fit it on the training split
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(x_train, y_train)

# Predict sale prices for the held-out validation rows.
y_val_pred = model.predict(x_val)

# Score the validation predictions against the true values.
r2 = r2_score(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 2810942965.2180653
R^2 Score: 0.6335301929422245


In [15]:
# Predicting
# Preprocess the test data with the SAME imputation values used for training.
# The medians are computed from the training features rather than from the
# test set, so the model sees inputs prepared consistently with the data it
# was fitted on. This also avoids leaving NaNs behind when a test column is
# entirely missing (its own median would be NaN and predict() would fail).
train_medians = train_ds[features].median()
X_test = test_ds[features].fillna(train_medians)

# Making predictions on the test set
test_predictions = model.predict(X_test)

# Preparing the submission file: start from a copy of the sample submission
# and overwrite its 'SalePrice' column with the predictions.
# NOTE(review): this relies on sample_submission.csv having one row per row
# of test.csv, in the same order - confirm against the data files.
submission_df = sample_submission_ds.copy()
submission_df['SalePrice'] = test_predictions

# Saving the submission file (without the DataFrame index column)
submission_df.to_csv('submission.csv', index=False)