In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the datasets
train_df = pd.read_csv('Datasets/Raw/train.csv')
test_df = pd.read_csv('Datasets/Raw/test.csv')

# Save the test set's 'Id' column for the final submission
test_ids = test_df['Id']

In [3]:
def clean_data(df):
    """Fills missing values in the DataFrame."""
    
    # Impute specific categorical features with 'None'
    for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu', 
                'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 
                'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 
                'BsmtFinType1', 'MSZoning', 'Utilities', 'Exterior1st', 
                'Exterior2nd', 'KitchenQual', 'Functional', 'SaleType'):
        df[col] = df[col].fillna('None')

    # Impute numerical features with 0
    for col in ('GarageYrBlt', 'MasVnrArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', 
                'BsmtUnfSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath'):
        df[col] = df[col].fillna(0)

    # Fill LotFrontage with the median
    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())
    
    # Fill Electrical with the mode
    df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])
    
    # Drop the Id column if it exists
    if 'Id' in df.columns:
        df = df.drop(columns=['Id'], axis=1)

    return df

# Apply the cleaning function to both datasets
train_df_cleaned = clean_data(train_df.copy())
test_df_cleaned = clean_data(test_df.copy())

In [4]:
# TRY..................


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Combine train and test for consistent one-hot encoding
X = train_df_cleaned.drop('SalePrice', axis=1)
y = train_df_cleaned['SalePrice']
combined_df = pd.concat([X, test_df_cleaned], axis=0, ignore_index=True)
combined_df = pd.get_dummies(combined_df)

# Separate back into processed train and test sets
X_processed = combined_df.iloc[:len(X), :]
test_processed = combined_df.iloc[len(X):, :]

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# --- Apply Feature Engineering and Scaling ---
# Create a new feature for total square footage
X_train['TotalSF'] = X_train['GrLivArea'] + X_train['TotalBsmtSF']
X_val['TotalSF'] = X_val['GrLivArea'] + X_val['TotalBsmtSF']
test_processed['TotalSF'] = test_processed['GrLivArea'] + test_processed['TotalBsmtSF']

# Apply log transformation to the target variable
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Scale numerical features
scaler = StandardScaler()
numerical_cols = X_train.select_dtypes(include=np.number).columns
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
test_processed[numerical_cols] = scaler.transform(test_processed[numerical_cols])

# --- Model Building, Prediction & Submission ---
model = LinearRegression()
model.fit(X_train, y_train_log)

# Make predictions on the validation set for evaluation
y_val_pred = model.predict(X_val)
y_val_pred_unscaled = np.expm1(y_val_pred)
y_val_unscaled = y_val

rmse = np.sqrt(mean_squared_error(y_val_unscaled, y_val_pred_unscaled))
r2 = r2_score(y_val_unscaled, y_val_pred_unscaled)

print(f"Validation RMSE: {rmse:.2f}")
print(f"Validation R-squared: {r2:.2f}")

# Make predictions on the test set for final submission
y_test_pred = model.predict(test_processed)
predictions_final = np.expm1(y_test_pred)

# Create the submission DataFrame
submission_df = pd.DataFrame({'Id': test_ids, 'SalePrice': predictions_final})
submission_df.to_csv('house_prices_submission.csv', index=False)

print("\nSubmission file 'house_prices_submission.csv' created successfully!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_processed['TotalSF'] = test_processed['GrLivArea'] + test_processed['TotalBsmtSF']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_processed[numerical_cols] = scaler.transform(test_processed[numerical_cols])


Validation RMSE: 22908.80
Validation R-squared: 0.93

Submission file 'house_prices_submission.csv' created successfully!
