In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit and evaluate a baseline linear regression that predicts SalePrice from
# three numeric columns of the Kaggle "House Prices" training file.

# Tunable settings for this cell, kept together so they are easy to change.
# NOTE: the path only resolves inside a Kaggle kernel with this dataset attached.
DATA_PATH = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
# Square footage (GrLivArea), number of bedrooms (BedroomAbvGr), number of bathrooms (FullBath)
FEATURE_COLUMNS = ['GrLivArea', 'BedroomAbvGr', 'FullBath']
TARGET_COLUMN = 'SalePrice'
TEST_SIZE = 0.2      # 80% training, 20% testing
RANDOM_STATE = 42    # fixed seed so the split (and the reported metrics) are reproducible

# Load the dataset
data = pd.read_csv(DATA_PATH)

# Select the model inputs and the regression target.
# `features` is the raw (unimputed) selection; imputation happens after the split.
features = data[FEATURE_COLUMNS]
target = data[TARGET_COLUMN]

# Split BEFORE imputing: any statistic used for preprocessing must be learned
# from the training rows only, otherwise information from the held-out rows
# leaks into training and the test score is optimistically biased.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Handle missing data (if any) by filling with the per-column median of the
# training partition, and reuse those same medians for the test partition.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Evaluate with Mean Squared Error (MSE). The value is in squared target units;
# its square root (RMSE) is in the same units as the target.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Print the fitted parameters; coefficients follow the order of FEATURE_COLUMNS
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')


Mean Squared Error: 2806426667.247853
Coefficients: [   104.02630701 -26655.16535734  30014.32410896]
Intercept: 52261.74862694461
