In [3]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Read the training and validation (dev) splits from the working directory.
training_data, validation_data = map(pd.read_csv, ("my_train.csv", "my_dev.csv"))

# 'SalePrice' is the regression target; it and the 'Id' column are removed
# from the predictor frames.
target_train, target_validation = (
    training_data['SalePrice'],
    validation_data['SalePrice'],
)
features_train = training_data.drop(columns=['Id', 'SalePrice'])
features_validation = validation_data.drop(columns=['Id', 'SalePrice'])


# Zero-impute missing numeric values in both splits.
#
# The three columns named below are always filled, exactly as before. In
# addition every other numeric column is filled: the numeric features are later
# handed to the Ridge model through a 'passthrough' transformer with no imputer,
# and scikit-learn estimators raise on NaN input, so a gap in any numeric
# column not listed here would abort the fit or the validation predict.
train_fill_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
validation_fill_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

for _frame, _always_fill in (
    (features_train, train_fill_columns),
    (features_validation, validation_fill_columns),
):
    # Numeric dtype is detected per frame with the same selector the notebook
    # uses to build its numeric feature list.
    _numeric_columns = _frame.select_dtypes(include=[np.number]).columns
    _fill_columns = _always_fill + [
        col for col in _numeric_columns if col not in _always_fill
    ]
    # Column assignment mutates the frame in place, as the original code did.
    _frame[_fill_columns] = _frame[_fill_columns].fillna(0)

# Do not leave loop temporaries behind in the notebook namespace.
del _frame, _always_fill, _numeric_columns, _fill_columns

# Partition the training columns by dtype: numeric ones versus everything else.
column_to_convert = 'MSSubClass'
numerical_features = features_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = features_train.select_dtypes(exclude=[np.number]).columns.tolist()

# When 'MSSubClass' was parsed as a number, move it to the categorical list so
# it is handled together with the other categorical columns.
if column_to_convert in numerical_features:
    categorical_features.append(
        numerical_features.pop(numerical_features.index(column_to_convert))
    )

# Cast every categorical column to str, in place, in both splits.
for split in (features_train, features_validation):
    split[categorical_features] = split[categorical_features].astype(str)


# Numeric columns are forwarded unchanged; categorical columns are one-hot
# encoded into a dense array, and categories unseen during fitting are ignored.
numerical_processor = 'passthrough'
categorical_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_processor, numerical_features),
        ('cat', categorical_processor, categorical_features),
    ]
)

# Learn the encoding from the training split only, then apply it to both splits.
processed_features_train = preprocessor.fit_transform(features_train)
processed_features_validation = preprocessor.transform(features_validation)

# Conventional X / y aliases used by the modelling code.
X_train, y_train = processed_features_train, target_train
X_validation, y_validation = processed_features_validation, target_validation

# Columns singled out for second-order expansion. Any name that is not present
# in `numerical_features` is dropped by the filter below.
important_features = [
    'EnclosedPorch', 'GarageYrBlt', 'KitchenAbvGr', 'BsmtHalfBath',
    'MSSubClass', 'WoodDeckSF', 'GarageArea', 'BsmtFullBath',
    '1stFlrSF', 'BsmtFinSF1', 'YearRemodAdd', 'YearBuilt',
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
]
important_numerical_features = list(
    filter(lambda name: name in important_features, numerical_features)
)

# Degree-2 expansion without a bias column, fitted on the training rows only.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
poly_features.fit(features_train[important_numerical_features])
train_poly = poly_features.transform(features_train[important_numerical_features])
validation_poly = poly_features.transform(features_validation[important_numerical_features])

# Append the expanded columns to the right of the preprocessed matrices.
X_train_poly = np.hstack((X_train, train_poly))
X_validation_poly = np.hstack((X_validation, validation_poly))

# The model is trained on the natural log of the target; predictions are
# exponentiated to return to the original scale before scoring.
ridge_alpha = 20
log_target_train = np.log(y_train)

ridge_model = Ridge(alpha=ridge_alpha).fit(X_train_poly, log_target_train)

log_predictions_validation = ridge_model.predict(X_validation_poly)
final_predictions = np.exp(log_predictions_validation)

# Root of the mean squared log error on the validation split.
validation_msle = mean_squared_log_error(y_validation, final_predictions)
rmsle_score = np.sqrt(validation_msle)



In [4]:
# Show the validation RMSLE computed by the preceding cell as this cell's output.
rmsle_score

0.11223079123201221

In [13]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


# Read the training split and the unlabeled test set from the working directory.
train_data = pd.read_csv("my_train.csv", sep=',')
test_set = pd.read_csv("test.csv", sep=',')

# 'SalePrice' is the target; 'Id' is removed from both predictor frames.
labels = train_data['SalePrice']
train_features = train_data.drop(columns=['Id', 'SalePrice'])
test_features = test_set.drop(columns=['Id'])


# Zero-impute missing numeric values in the training and test frames.
#
# The hand-written lists below are still always filled, exactly as before (the
# test list already had to name eleven columns). In addition every other
# numeric column is filled: the numeric features are later handed to the Ridge
# model through a 'passthrough' transformer with no imputer, and scikit-learn
# estimators raise on NaN input, so a gap in any numeric column missing from
# these lists would abort the fit or the test prediction.
columns_to_replace_train_features = ['LotFrontage', 'MasVnrArea','GarageYrBlt']
columns_to_replace_test = ['LotFrontage', 'MasVnrArea','BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath','BsmtHalfBath','GarageCars','GarageArea','GarageYrBlt']

for _frame, _always_fill in (
    (train_features, columns_to_replace_train_features),
    (test_features, columns_to_replace_test),
):
    # Numeric dtype is detected per frame with the same selector the notebook
    # uses to build its numeric column list.
    _numeric_columns = _frame.select_dtypes(include=[np.number]).columns
    _fill_columns = _always_fill + [
        col for col in _numeric_columns if col not in _always_fill
    ]
    # Column assignment mutates the frame in place, as the original code did.
    _frame[_fill_columns] = _frame[_fill_columns].fillna(0)

# Do not leave loop temporaries behind in the notebook namespace.
del _frame, _always_fill, _numeric_columns, _fill_columns

# Partition the training columns by dtype: numeric ones versus everything else.
column_to_move = 'MSSubClass'
numeric_columns = train_features.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = train_features.select_dtypes(exclude=[np.number]).columns.tolist()

# Move 'MSSubClass' to the categorical list when it was parsed as a number;
# otherwise report that it was not among the numeric columns.
if column_to_move not in numeric_columns:
    print("Column not found in numeric_columns list.")
else:
    numeric_columns.remove(column_to_move)
    categorical_columns.append(column_to_move)

# Cast every categorical column to str, in place, in both frames.
for split in (train_features, test_features):
    split[categorical_columns] = split[categorical_columns].astype(str)

# Numeric columns are forwarded unchanged; categorical columns are one-hot
# encoded into a dense array, and categories unseen during fitting are ignored.
num_processor = 'passthrough'
cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_processor, numeric_columns),
        ('cat', cat_processor, categorical_columns),
    ]
)

# Learn the encoding from the training frame only, then apply it to both frames.
train_processed_data = preprocessor.fit_transform(train_features)
test_processed_data = preprocessor.transform(test_features)

# Conventional X / y aliases used by the modelling code.
X_train, y_train = train_processed_data, labels
X_test = test_processed_data

# Columns singled out for second-order expansion. Any name that is not present
# in `numeric_columns` is dropped by the filter below.
important_features = [
    'GarageArea', 'KitchenAbvGr', 'BsmtFullBath', '1stFlrSF',
    'BsmtFinSF1', 'YearRemodAdd', 'YearBuilt', 'LotFrontage',
    'LotArea', 'OverallQual', 'OverallCond',
]
important_numeric_columns = list(
    filter(lambda name: name in important_features, numeric_columns)
)

# Degree-2 expansion without a bias column, fitted on the training rows only.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(train_features[important_numeric_columns])
train_poly_features = poly.transform(train_features[important_numeric_columns])
test_poly_features = poly.transform(test_features[important_numeric_columns])

# Append the expanded columns to the right of the preprocessed matrices.
X_train_poly = np.hstack((X_train, train_poly_features))
X_test_poly = np.hstack((X_test, test_poly_features))

# The model is trained on the natural log of the target; test predictions are
# exponentiated to return to the original scale.
alpha = 20
log_of_label = np.log(y_train)

ridge_model = Ridge(alpha=alpha).fit(X_train_poly, log_of_label)

log_of_test_label = ridge_model.predict(X_test_poly)
final_price_predictions = np.exp(log_of_test_label)

# Two-column submission table: the test set's 'Id' values alongside the
# predicted prices, in that column order.
submission_df = pd.DataFrame({'Id': test_set['Id']}).assign(
    SalePrice=final_price_predictions
)

# NOTE(review): the destination is an absolute, machine-specific path while the
# inputs are read relative to the working directory - confirm whether this
# should be made relative/configurable before sharing the notebook.
submission_df.to_csv(
    'C:/Users/badhe/Downloads/hw3-data/test_submission.csv',
    index=False,
)
