# QUESTION 5
### Parsa Daghigh
### Std num: 810101419


### Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

### Load train and test

In [None]:
# Read the training and test splits from the working directory in one pass.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

### Part a
### How to handle missing data?

1. **Deleting Rows or Columns Containing Missing Data:**
   - Simple to implement and avoids introducing bias in the data.
   - Useful information may be lost.

2. **Replacing with Mean or Median Value:**
   -  Easy to implement and retains all data.
   -  May reduce model accuracy as it decreases data variability.

3. **Replacing with Predictions from Machine Learning Models:**
   -  Preserves data variability and increases model accuracy.
   -  More complex to implement and requires more computations.

4. **Replacing with the Most Frequent Value (Mode):**
   -  Simple to implement and also works for categorical features.
   -  Can reduce data variability and decrease model accuracy.

### Part B , C , D

In [None]:
# Share of missing cells per column of the training frame, in percent.
missing_percent = train_data.isna().mean().mul(100)

# Keep only the columns that actually contain at least one missing value.
missing_percent_non_zero = missing_percent.loc[missing_percent.gt(0)]

print("Features with missing values and their percentages:")
print(missing_percent_non_zero)

Features with missing values and their percentages:
LotFrontage     18.590998
Alley           93.542074
MasVnrType      57.729941
MasVnrArea       0.293542
BsmtQual         2.544031
BsmtCond         2.544031
BsmtExposure     2.544031
BsmtFinType1     2.544031
BsmtFinType2     2.544031
Electrical       0.097847
FireplaceQu     47.651663
GarageType       5.283757
GarageYrBlt      5.283757
GarageFinish     5.283757
GarageQual       5.283757
GarageCond       5.283757
PoolQC          99.510763
Fence           80.234834
MiscFeature     96.086106
dtype: float64


In [None]:
# Columns whose share of missing values (in percent) exceeds this are dropped.
threshold = 50

# Filter out features with missing values above the threshold
features_to_drop = missing_percent[missing_percent > threshold].index

# errors='ignore' keeps this cell re-runnable: `missing_percent` was computed
# before the drop, so on a second run the listed columns are already gone and a
# plain drop() would raise KeyError. It also tolerates a test frame that lacks
# one of the columns.
train_data = train_data.drop(columns=features_to_drop, errors='ignore')
test_data = test_data.drop(columns=features_to_drop, errors='ignore')

print("Features dropped due to high percentage of missing values:")
print(features_to_drop)

Features dropped due to high percentage of missing values:
Index(['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')


In [None]:
# Name of the regression target. It is numeric, so select_dtypes() would pick
# it up as an input feature; it must be removed explicitly or the model is
# trained (and evaluated) on inputs that already contain the answer.
TARGET_COLUMN = 'SalePrice'

# Split the columns by dtype and remove the target from the numeric features.
# errors='ignore' keeps this working for a frame that carries no target column.
numeric_data_train = (
    train_data.select_dtypes(include=['number'])
    .drop(columns=[TARGET_COLUMN], errors='ignore')
)
categorical_data_train = train_data.select_dtypes(include=['object'])

numeric_data_test = (
    test_data.select_dtypes(include=['number'])
    .drop(columns=[TARGET_COLUMN], errors='ignore')
)
categorical_data_test = test_data.select_dtypes(include=['object'])

# Impute numeric data with mean (statistics learned on the training split only)
imputer_mean = SimpleImputer(strategy='mean')
numeric_data_train_imputed = pd.DataFrame(
    imputer_mean.fit_transform(numeric_data_train),
    columns=numeric_data_train.columns,
)
numeric_data_test_imputed = pd.DataFrame(
    imputer_mean.transform(numeric_data_test),
    columns=numeric_data_test.columns,
)

# Impute categorical data with mode (again learned on the training split only)
imputer_mode = SimpleImputer(strategy='most_frequent')
categorical_data_train_imputed = pd.DataFrame(
    imputer_mode.fit_transform(categorical_data_train),
    columns=categorical_data_train.columns,
)
categorical_data_test_imputed = pd.DataFrame(
    imputer_mode.transform(categorical_data_test),
    columns=categorical_data_test.columns,
)

# One-hot encode categorical data.
# The encoder is fitted on the training split only so that nothing about the
# test set influences preprocessing; handle_unknown='ignore' encodes categories
# that appear only in the test split as all-zero rows (sklearn emits a warning).
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
encoder.fit(categorical_data_train_imputed)

categorical_encoded_train = encoder.transform(categorical_data_train_imputed)
categorical_encoded_test = encoder.transform(categorical_data_test_imputed)

# Both splits go through the same fitted encoder, so they share column names.
encoded_feature_names = encoder.get_feature_names_out(categorical_data_train_imputed.columns)
categorical_encoded_train_df = pd.DataFrame(categorical_encoded_train, columns=encoded_feature_names)
categorical_encoded_test_df = pd.DataFrame(categorical_encoded_test, columns=encoded_feature_names)

train_data_imputed = pd.concat([numeric_data_train_imputed, categorical_encoded_train_df], axis=1)
test_data_imputed = pd.concat([numeric_data_test_imputed, categorical_encoded_test_df], axis=1)

# Guard against target leakage and against train/test feature mismatch.
assert TARGET_COLUMN not in train_data_imputed.columns
assert list(train_data_imputed.columns) == list(test_data_imputed.columns)

print("Imputation and encoding completed successfully.")

Imputation and encoding completed successfully.


In [None]:
# Standardize the numeric features: the scaler learns mean/std on the training
# split and the same transformation is then applied to the test split.
scaler = StandardScaler()
scaled_features_train = scaler.fit_transform(numeric_data_train_imputed)
scaled_features_test = scaler.transform(numeric_data_test_imputed)

# Final design matrices: scaled numeric columns followed by the one-hot columns.
processed_data_train = pd.concat(
    [
        pd.DataFrame(scaled_features_train, columns=numeric_data_train_imputed.columns),
        categorical_encoded_train_df.reset_index(drop=True),
    ],
    axis=1,
)
processed_data_test = pd.concat(
    [
        pd.DataFrame(scaled_features_test, columns=numeric_data_test_imputed.columns),
        categorical_encoded_test_df.reset_index(drop=True),
    ],
    axis=1,
)

print("Normalization and encoding completed successfully.")

Normalization and encoding completed successfully.


### Part E
### Create MLP model

In [None]:
class MLP(nn.Module):
    """Feed-forward regressor: input_dim -> 128 -> 64 -> 1.

    Each hidden layer is followed by a ReLU and dropout with p=0.1; the output
    layer is linear and produces one value per input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept as-is.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.drop1 = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.drop2 = nn.Dropout(p=0.1)
        self.fc3 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Run a batch of feature rows through the network and return the predictions."""
        hidden = x
        # Hidden stages share the same pattern: linear -> ReLU -> dropout.
        for linear, dropout in ((self.fc1, self.drop1), (self.fc2, self.drop2)):
            hidden = dropout(torch.relu(linear(hidden)))
        return self.fc3(hidden)

# Seed PyTorch's global RNG so that weight initialisation, dropout masks and
# DataLoader shuffling are repeatable across "Restart & Run All".
torch.manual_seed(42)

# Features and targets as float32 tensors; targets are reshaped to (n, 1) so
# they match the shape of the model output expected by MSELoss.
X_train_tensor = torch.tensor(processed_data_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(train_data['SalePrice'].values, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(processed_data_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(test_data['SalePrice'].values, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

input_dim = processed_data_train.shape[1]
model = MLP(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 500
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # Weight by batch size: the last batch of an epoch may be smaller.
        running_loss += loss.item() * batch_X.size(0)
    # Report the mean loss over the whole epoch; the loss of the final
    # mini-batch alone is too noisy to show whether training is converging.
    epoch_loss = running_loss / len(train_dataset)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

Epoch [10/500], Loss: 33210914816.0000
Epoch [20/500], Loss: 1793515136.0000
Epoch [30/500], Loss: 542255104.0000
Epoch [40/500], Loss: 689383680.0000
Epoch [50/500], Loss: 469448992.0000
Epoch [60/500], Loss: 609746304.0000
Epoch [70/500], Loss: 317065216.0000
Epoch [80/500], Loss: 1000863808.0000
Epoch [90/500], Loss: 667606656.0000
Epoch [100/500], Loss: 294871776.0000
Epoch [110/500], Loss: 387219904.0000
Epoch [120/500], Loss: 425847264.0000
Epoch [130/500], Loss: 466012800.0000
Epoch [140/500], Loss: 212836384.0000
Epoch [150/500], Loss: 1128283776.0000
Epoch [160/500], Loss: 1190421888.0000
Epoch [170/500], Loss: 811154176.0000
Epoch [180/500], Loss: 560931136.0000
Epoch [190/500], Loss: 317669952.0000
Epoch [200/500], Loss: 350498112.0000
Epoch [210/500], Loss: 224013920.0000
Epoch [220/500], Loss: 414451712.0000
Epoch [230/500], Loss: 302182496.0000
Epoch [240/500], Loss: 1056753408.0000
Epoch [250/500], Loss: 230686928.0000
Epoch [260/500], Loss: 384697952.0000
Epoch [270/500

### Part E
#### Evaluate the model

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs are mapped through log(1 + x) before the mean squared error is
    taken, and the square root of that error is returned.
    """
    squared_log_error = mean_squared_error(np.log1p(y_true), np.log1p(y_pred))
    return np.sqrt(squared_log_error)

# Switch off dropout and gradient tracking before predicting on both splits.
model.eval()
with torch.no_grad():
    predictions_train = model(X_train_tensor).numpy()
    predictions_test = model(X_test_tensor).numpy()


def _regression_metrics(y_true, y_pred):
    """Return (RMSE, MAE, MAPE in percent, RMSLE) for one set of predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return rmse, mae, mape, rmsle(y_true, y_pred)


rmse_train, mae_train, mape_train, rmsle_train = _regression_metrics(
    y_train_tensor.numpy(), predictions_train
)
rmse_test, mae_test, mape_test, rmsle_test = _regression_metrics(
    y_test_tensor.numpy(), predictions_test
)

print(f'Training RMSE: {rmse_train:.3f}')
print(f'Training MAE: {mae_train:.3f}')
print(f'Training MAPE: {mape_train:.3f}%')
print(f'Training RMSLE: {rmsle_train:.3f}')

print(f'Test RMSE: {rmse_test:.3f}')
print(f'Test MAE: {mae_test:.3f}')
print(f'Test MAPE: {mape_test:.3f}%')
print(f'Test RMSLE: {rmsle_test:.3f}')


Training RMSE: 4891.429
Training MAE: 3762.096
Training MAPE: 2.427%
Training RMSLE: 0.037
Test RMSE: 6138.407
Test MAE: 4664.661
Test MAPE: 3.168%
Test RMSLE: 0.057


### Part F
#### Analysis of Results:

- **Training Set:**
  - **RMSE**: 4891.429
  - **MAE**: 3762.096
  - **MAPE**: 2.427%
  - **RMSLE**: 0.037
  
- **Test Set:**
  - **RMSE**: 6138.407
  - **MAE**: 4664.661
  - **MAPE**: 3.168%
  - **RMSLE**: 0.057

These values indicate that the model has effectively learned from the training data and performs well on the test data. The relatively small differences between the training and test error metrics suggest that the model generalizes well to unseen data.

#### Overfitting and Underfitting Analysis:
Based on the results, the model shows a good balance between fitting the training data and generalizing to the test data. There are no significant signs of overfitting or underfitting. However, the following suggestions can help to further improve the model's performance:

### Recommendations to Reduce Overfitting or Underfitting:
1. **Cross-Validation**: Use cross-validation to evaluate the model's performance and prevent overfitting.
2. **Gathering More Data**: Collect more data if possible to enhance model training.
3. **Regularization**: Implement regularization techniques such as L1 and L2 regularization.
4. **Model Development**: Experiment with adding or removing layers and neurons to optimize the model architecture.
5. **Ensembling**: Combine multiple models to improve accuracy and stability of predictions.