In [1]:
import pandas as pd

# Load the three competition CSVs from the working directory in one pass:
# the training set, the test set, and the sample submission file.
train, test, sample_submission = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'sample_submission.csv'),
)


In [2]:
# Count the null entries per column, separately for each dataset.
missing_values_train = train.isna().sum()
missing_values_test = test.isna().sum()

# Report only the columns that contain at least one missing value,
# train first and then test.
for null_counts in (missing_values_train, missing_values_test):
    print(null_counts.loc[null_counts > 0])


LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType       894
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78


In [3]:
# Impute missing values.
#
# The LotFrontage median and the MSZoning mode are learned from the training
# set only and then applied to BOTH frames, so train and test are imputed with
# the same values (mirroring how the scaler is fit on train and merely applied
# to test). Computing them separately on test would give the two frames
# different fill values for the same feature.
lot_frontage_fill = train['LotFrontage'].median()
ms_zoning_fill = train['MSZoning'].mode()[0]


def _fill_missing(frame, lot_frontage_value, ms_zoning_value):
    """Fill every missing value of ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to impute; its columns are overwritten in place.
    lot_frontage_value : float
        Replacement for missing ``LotFrontage`` entries.
    ms_zoning_value : str
        Replacement for missing ``MSZoning`` entries.

    Any value still missing afterwards becomes 0 in numeric columns and the
    string 'None' in all other columns.
    """
    frame['LotFrontage'] = frame['LotFrontage'].fillna(lot_frontage_value)
    frame['MSZoning'] = frame['MSZoning'].fillna(ms_zoning_value)

    for column in frame.columns:
        # Decide by "is it numeric?" rather than "is it object?" so that text
        # columns stored with a non-object string dtype still receive 'None'
        # instead of 0.
        if pd.api.types.is_numeric_dtype(frame[column]):
            frame[column] = frame[column].fillna(0)
        else:
            frame[column] = frame[column].fillna('None')


_fill_missing(train, lot_frontage_fill, ms_zoning_fill)
_fill_missing(test, lot_frontage_fill, ms_zoning_fill)


In [4]:
# Add the same two derived features to both frames.
for frame in (train, test):
    # Total square footage: basement plus first and second floor areas.
    frame['TotalSF'] = (
        frame['TotalBsmtSF'].add(frame['1stFlrSF']).add(frame['2ndFlrSF'])
    )
    # Age of the house at the time of sale, in years.
    frame['Age'] = frame['YrSold'].sub(frame['YearBuilt'])


In [5]:
# One-hot encode train and test together so both end up with the same dummy
# columns.

# Record where the training rows end BEFORE `train` is rebound below, so the
# split does not depend on the length of the re-assigned variable.
n_train = len(train)

# Stack the frames (training rows first) with a fresh 0..N-1 index.
# Columns present in only one frame are filled with NaN for the other frame's
# rows (presumably this is what happens to SalePrice for the test rows --
# verify against the CSVs).
all_data = pd.concat([train, test], ignore_index=True)

# One-hot encoding for categorical variables
all_data = pd.get_dummies(all_data)

# Split back by position. `.copy()` makes each part an independent frame, so
# later column assignments on `train` / `test` do not write into a slice of
# `all_data` (which triggers SettingWithCopyWarning).
train = all_data.iloc[:n_train].copy()
test = all_data.iloc[n_train:].copy()


In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Columns to scale
num_cols = ['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF', 'Age', '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF']

# Work on independent copies so the assignments below never write into a
# slice of another frame.
train = train.copy()
test = test.copy()

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
#
# The columns are replaced outright (`frame[cols] = ...`) instead of written
# through `.loc[:, cols]`: the scaled values are floats, and a `.loc` write
# tries to store them in the existing columns, some of which may hold
# integers. Recent pandas versions deprecate that kind of incompatible-dtype
# assignment.
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define target variable and features.
# NOTE(review): every column except SalePrice is used as a feature, which
# includes 'Id'. That looks like a row identifier rather than a predictor --
# consider dropping it here and wherever the test features are selected.
X = train.drop(['SalePrice'], axis=1)
y = train['SalePrice']

# Hold out 20% of the training rows for validation (fixed seed for
# reproducibility).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate.
# RMSE is computed as the square root of the MSE rather than through
# `mean_squared_error(..., squared=False)`: the `squared` argument is
# deprecated in newer scikit-learn releases, while this form works on any
# version.
y_pred = model.predict(X_valid)
rmse = mean_squared_error(y_valid, y_pred) ** 0.5
print(f'Validation RMSE: {rmse}')


Validation RMSE: 29576.789445184764


In [11]:
# List of features used for training (excluding the target variable 'SalePrice')
features = train.drop(columns=['SalePrice']).columns

# Select exactly those columns, in the same order, from the test set.
test_features = test[features]

# Predict on the test set.
# `assign` returns a new frame with the SalePrice column set, instead of
# writing through `.loc` into `test`, which may be a slice of a larger frame
# (that pattern raises SettingWithCopyWarning). `test` is rebound so the
# predictions remain available as test['SalePrice'].
test = test.assign(SalePrice=model.predict(test_features))

# Prepare the submission file: one row per test house with its Id and the
# predicted price, written without the DataFrame index.
submission = test[['Id', 'SalePrice']]
submission.to_csv('submission.csv', index=False)
