In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [None]:
# Read the competition's training and test CSV files into DataFrames.
# The paths are bound to names first so the data locations are easy to spot.
TRAIN_CSV = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
TEST_CSV = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [None]:
# Use 'Id' as the row index so it is treated as an identifier, not a feature.
# The move is guarded and non-in-place so this cell can be re-run safely: an
# unconditional in-place set_index raises KeyError on the second run because
# 'Id' is no longer a column once it has become the index.
if 'Id' in train_data.columns:
    train_data = train_data.set_index('Id')
if 'Id' in test_data.columns:
    test_data = test_data.set_index('Id')

# Feature engineering: create the 'totalnumbdrm' column.
# NOTE: despite its name, this feature is the row-wise sum of the four
# *bathroom* count columns below (half baths are counted as 1 each). The
# column name is kept as-is because later cells select it by this name.
# DataFrame.sum skips missing values by default, so a row with a missing
# count still gets a numeric total.
BATH_COLUMNS = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath']
train_data['totalnumbdrm'] = train_data[BATH_COLUMNS].sum(axis=1)
test_data['totalnumbdrm'] = test_data[BATH_COLUMNS].sum(axis=1)


In [None]:
# Build the modelling inputs from the training frame:
#   y - the target column ('SalePrice')
#   X - the three predictor columns used by every model below
y = train_data['SalePrice']
feature_columns = ['LotArea', 'BedroomAbvGr', 'totalnumbdrm']
X = train_data[feature_columns]

In [None]:
# Split the data into training and validation sets.
# X and y are the feature matrix and target defined in the previous cell; they
# are not re-derived here (this cell used to repeat that selection verbatim).
# 20% of the rows are held out for validation, and the fixed random_state
# makes the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def _fit_and_score(model, fit_features, fit_target, eval_features, eval_target):
    """Fit ``model`` and score it on the held-out data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    fit_features, fit_target : data the model is trained on
    eval_features, eval_target : held-out data used for scoring

    Returns
    -------
    tuple
        ``(predictions, mse, r2)`` where ``predictions`` are the model's
        outputs for ``eval_features`` and the two metrics compare them with
        ``eval_target``.
    """
    model.fit(fit_features, fit_target)
    predictions = model.predict(eval_features)
    mse = mean_squared_error(eval_target, predictions)
    r2 = r2_score(eval_target, predictions)
    return predictions, mse, r2


# Linear Regression: train, then report validation metrics.
lr_model = LinearRegression()
lr_pred, lr_mse, lr_r2 = _fit_and_score(lr_model, X_train, y_train, X_val, y_val)

print(f'Linear Regression Mean Squared Error: {lr_mse}')
print(f'Linear Regression R-squared: {lr_r2}')

# Random Forest: same procedure; the fixed seed keeps the forest reproducible.
rf_model = RandomForestRegressor(random_state=42)
rf_pred, rf_mse, rf_r2 = _fit_and_score(rf_model, X_train, y_train, X_val, y_val)

print(f'Random Forest Mean Squared Error: {rf_mse}')
print(f'Random Forest R-squared: {rf_r2}')

# Score the test set with both fitted models, using the same three feature
# columns the models were trained on.
test_feature_columns = ['LotArea', 'BedroomAbvGr', 'totalnumbdrm']
X_test = test_data[test_feature_columns]
lr_test_predictions = lr_model.predict(X_test)
rf_test_predictions = rf_model.predict(X_test)

# Attach each model's predictions to the test frame, then write only those two
# columns (plus the frame's index) to a CSV file in the working directory.
test_data['LinearRegression_SalePrice'] = lr_test_predictions
test_data['RandomForest_SalePrice'] = rf_test_predictions
prediction_columns = ['LinearRegression_SalePrice', 'RandomForest_SalePrice']
test_data[prediction_columns].to_csv('test_predictions.csv')


In [None]:
# Visualization 1: histogram of the training target with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_data['SalePrice'], kde=True, ax=ax)
ax.set_title('Distribution of Sale Price')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Frequency')
plt.show()

# Visualization 2: annotated heatmap of the pairwise correlations between the
# target and the three model features.
corr_columns = ['SalePrice', 'LotArea', 'BedroomAbvGr', 'totalnumbdrm']
corr_matrix = train_data[corr_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

def _plot_predicted_vs_actual(actual, predicted, title):
    """Scatter ``predicted`` against ``actual`` and show the figure.

    A dashed black diagonal spanning the min..max of ``actual`` is drawn as
    the y = x reference: points on it are perfect predictions.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(actual, predicted, alpha=0.6)
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=3)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title(title)
    plt.show()


# Visualization 3: Linear Regression - validation predictions vs actual prices
_plot_predicted_vs_actual(y_val, lr_pred, 'Linear Regression: Predicted vs Actual Sale Price')

# Visualization 4: Random Forest - validation predictions vs actual prices
_plot_predicted_vs_actual(y_val, rf_pred, 'Random Forest: Predicted vs Actual Sale Price')

def _plot_residual_distribution(residuals, title):
    """Draw a histogram (with KDE curve) of ``residuals`` and show the figure."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(residuals, kde=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Residuals')
    ax.set_ylabel('Frequency')
    plt.show()


# Visualization 5: Linear Regression residuals (actual minus predicted)
lr_residuals = y_val - lr_pred
_plot_residual_distribution(lr_residuals, 'Linear Regression: Residuals Distribution')

# Visualization 6: Random Forest residuals (actual minus predicted)
rf_residuals = y_val - rf_pred
_plot_residual_distribution(rf_residuals, 'Random Forest: Residuals Distribution')