In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

#models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_squared_error


from sklearn.preprocessing import StandardScaler

from helper import *

In [None]:
# Load the raw training set. The path is relative to the notebook's working
# directory, so the notebook must be run from its own folder.
df = pd.read_csv('../data/train.csv')
#df.head()

In [None]:
# Keep only the non-numeric columns of df. Every numeric column is excluded
# from this frame, so numeric fields such as an integer 'Id' or 'SalePrice'
# have to be looked up in df again when they are needed.
categoric_df = df.select_dtypes(exclude=[np.number])
# Show which columns are treated as categorical.
print(categoric_df.columns)

In [None]:
# 'MiscFeature' is removed from the categorical frame; rebinding the name to
# the returned frame replaces the in-place drop.
categoric_df = categoric_df.drop(columns=['MiscFeature'])

'MiscFeature' describes a feature of the house that is not covered by the other categories, but there is also a feature called 'MiscVal', which is the value of the 'MiscFeature'. So we can drop 'MiscFeature': it is just a description, and its value is already accounted for in numeric_df.

In [None]:
# count_nulls comes from the star-import of `helper`; presumably it reports the
# number of missing values per column — verify against helper.py.
count_nulls(categoric_df)

There are a lot of nulls for Pools, Alleys, Fences, etc. in the dataset. However, in the data description **NA** means No Pool, No Alley, No Fence, etc. So I will replace the nulls that have such a description in the notes with an explicit string (e.g. **'No Pool'**) before one-hot encoding.

In [None]:
# Columns in which a missing value is itself a category ("the house has no
# such thing") rather than unknown data, paired with the label that should
# replace the missing value.
na_pairings = {
    'PoolQC': 'No Pool',
    'Alley': 'No Alley',
    'Fence': 'No Fence',
    'FireplaceQu': 'No Fireplace',
    'GarageType': 'No Garage',
    'GarageFinish': 'No Garage',
    'GarageQual': 'No Garage',
    'GarageCond': 'No Garage',
    'BsmtExposure': 'No Basement',
    'BsmtFinType1': 'No Basement',
    'BsmtFinType2': 'No Basement',
    'BsmtQual': 'No Basement',
    'BsmtCond': 'No Basement',
}

# Apply the mapping. Without this step the dictionary is never used and the
# missing values reach the one-hot encoder as NaN. Passing a dict to fillna
# fills each listed column with its own label and leaves other columns alone.
categoric_df = categoric_df.fillna(value=na_pairings)

# Electrical should never be missing, so drop any row where it is.
categoric_df = categoric_df.dropna(subset=['Electrical'])

In [None]:
# Discard the masonry veneer type column; the result is rebound to the same
# name instead of mutating the frame in place.
categoric_df = categoric_df.drop(columns=['MasVnrType'])

In [None]:
# Apply one-hot encoding to every remaining categorical column.
# drop_first=True omits the first level of each column, so k categories become
# k-1 indicator columns (avoids perfectly collinear dummies).
# NOTE(review): with pandas' default dummy_na=False, a value that is still NaN
# here becomes an all-zero row for that column — confirm this is intended.
categoric_df = pd.get_dummies(categoric_df, drop_first=True)

In [None]:
# Preview the first rows of the encoded frame.
categoric_df.head()

In [None]:
# Re-run the helper's null report on the encoded frame (count_nulls comes from
# `helper`; presumably a per-column missing-value count — verify).
count_nulls(categoric_df)

In [None]:
# Save to CSV.
# 'Id' is copied back from df before writing. The assignment aligns on the row
# index, so each row of categoric_df gets the Id of the df row it came from.
categoric_df['Id'] = df['Id']
# index=False: the row index is not written to the file.
categoric_df.to_csv('../new_data/clean_train_categoric.csv', index=False)

In [None]:
# 'Id' was only attached for the CSV export; take it back out of the frame
# (rebinding the name rather than dropping in place).
categoric_df = categoric_df.drop(columns=['Id'])

## Create a model using categorical only features

In [None]:
# Feature matrix: every one-hot column. categoric_df was built from the
# non-numeric columns of df only, so it does not contain the numeric target;
# errors='ignore' keeps this line working whether or not 'SalePrice' is there.
X = categoric_df.drop(columns=['SalePrice'], errors='ignore')

# Target: read from the original frame, restricted to the rows that survived
# cleaning. Row labels are preserved by the cleaning steps, so X.index selects
# exactly the matching SalePrice values.
y = df.loc[X.index, 'SalePrice']

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_selection import RFE

import matplotlib.cm as cm

# --- Feature selection ------------------------------------------------------
# Recursive feature elimination driven by a seeded random forest; keeps the 20
# columns the forest ranks highest.
base_model = RandomForestRegressor(random_state=42)
rfe = RFE(estimator=base_model, n_features_to_select=20, verbose=2)

rfe.fit(X_train, y_train)

selected_features = X.columns[rfe.support_]
print("Selected Features:")
print(selected_features)

# Reduce both partitions to the selected columns (selector fitted on train only).
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# --- Candidate models and their hyper-parameter grids -----------------------
param_grids = {
    'Ridge': {'alpha': [0.1, 1, 10, 100]},
    'Lasso': {'alpha': [0.01, 0.1, 1, 10]},
    'ElasticNet': {'alpha': [0.01, 0.1, 1, 10], 'l1_ratio': [0.2, 0.5, 0.8]},
    'RandomForestRegressor': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'GradientBoostingRegressor': {'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [100, 200]},
    'AdaBoostRegressor': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1, 1]},
    'BaggingRegressor': {'n_estimators': [10, 50, 100]},
    'ExtraTreesRegressor': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'DecisionTreeRegressor': {'max_depth': [None, 10, 20]},
    'KNeighborsRegressor': {'n_neighbors': [3, 5, 10]},
    'SVR': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
}

# Every estimator with internal randomness gets a fixed random_state so a
# re-run of the notebook reproduces the same scores and the same winner.
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=42),
    'BaggingRegressor': BaggingRegressor(random_state=42),
    'ExtraTreesRegressor': ExtraTreesRegressor(random_state=42),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'SVR': SVR(),
}

results_rfe = {}       # model name -> best cross-validated RMSE
best_models_rfe = {}   # model name -> estimator refit with the best parameters

print('Training models with RFE-selected features...')
for name, model in tqdm(models.items()):
    grid_search = GridSearchCV(model, param_grids.get(name, {}), cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_rfe, y_train)

    # best_score_ is the mean negative MSE over the folds; flip the sign and
    # take the square root to report it as an RMSE.
    results_rfe[name] = np.sqrt(-grid_search.best_score_)
    best_models_rfe[name] = grid_search.best_estimator_

print('\nModel Performance with RFE (RMSE):')
for name, rmse in results_rfe.items():
    print(f'{name}: {rmse:.4f}')

# Pick the winner once and reuse the key.
best_name_rfe = min(results_rfe, key=results_rfe.get)
best_model_rfe = best_models_rfe[best_name_rfe]

print('\nBest Model with RFE:')
print(f'{best_model_rfe}: {results_rfe[best_name_rfe]:.4f}')

print(best_model_rfe.get_params())

# Score the winner on the hold-out partition, which the search never saw.
holdout_rmse_rfe = np.sqrt(mean_squared_error(y_test, best_model_rfe.predict(X_test_rfe)))
print(f'Hold-out RMSE of best RFE model: {holdout_rmse_rfe:.4f}')

# --- Feature importances (only for models that expose them) -----------------
if hasattr(best_model_rfe, "feature_importances_"):
    feature_importances = sorted(
        zip(selected_features, best_model_rfe.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )

    for feature, importance in feature_importances:
        print(f'{feature}: {importance:.4f}')

    feature_names = [pair[0] for pair in feature_importances]
    importance_values = [pair[1] for pair in feature_importances]

    fig, ax = plt.subplots(figsize=(10, 5))
    # Bar colour encodes the importance value through the viridis colormap.
    ax.barh(feature_names, importance_values, color=cm.viridis(importance_values))
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    ax.set_title('Feature Importances (RFE)')

    # Write each value just inside the right end of its bar.
    for i, importance in enumerate(importance_values):
        ax.text(importance - 0.001, i, f'{importance:.4f}', ha='right', va='center', color='white')

    # Lay out after all artists (including the labels) have been added.
    fig.tight_layout()
    plt.show()


In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

# Feature matrix and target. categoric_df holds only columns derived from the
# non-numeric fields of df, so the numeric target is read from df and aligned
# on the surviving row labels; errors='ignore' tolerates either layout.
X = categoric_df.drop(columns=['SalePrice'], errors='ignore')
y = df.loc[X.index, 'SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A float n_components asks PCA for the smallest number of components whose
# cumulative explained variance reaches that fraction (here 95%).
pca = PCA(n_components=0.95)

# Fit the projection on the training partition only, then apply it to both.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

explained_variance = np.cumsum(pca.explained_variance_ratio_)
print(f"Total explained variance with selected components: {explained_variance[-1]:.4f}")
print(f"Number of components selected: {pca.n_components_}")

param_grids = {
    'Ridge': {'alpha': [0.1, 1, 10, 100]},
    'Lasso': {'alpha': [0.01, 0.1, 1, 10]},
    'ElasticNet': {'alpha': [0.01, 0.1, 1, 10], 'l1_ratio': [0.2, 0.5, 0.8]},
    'RandomForestRegressor': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'GradientBoostingRegressor': {'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [100, 200]},
    'AdaBoostRegressor': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1, 1]},
    'BaggingRegressor': {'n_estimators': [10, 50, 100]},
    'ExtraTreesRegressor': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'DecisionTreeRegressor': {'max_depth': [None, 10, 20]},
    'KNeighborsRegressor': {'n_neighbors': [3, 5, 10]},
    'SVR': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
}

# Every estimator with internal randomness gets a fixed random_state so a
# re-run of the notebook reproduces the same scores and the same winner.
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=42),
    'BaggingRegressor': BaggingRegressor(random_state=42),
    'ExtraTreesRegressor': ExtraTreesRegressor(random_state=42),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'SVR': SVR(),
}

results_pca = {}       # model name -> best cross-validated RMSE
best_models_pca = {}   # model name -> estimator refit with the best parameters

print('Training models with PCA-transformed features...')
for name, model in tqdm(models.items()):
    grid_search = GridSearchCV(model, param_grids.get(name, {}), cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_pca, y_train)

    # best_score_ is the mean negative MSE over the folds; flip the sign and
    # take the square root to report it as an RMSE.
    results_pca[name] = np.sqrt(-grid_search.best_score_)
    best_models_pca[name] = grid_search.best_estimator_

# Display results
print('\nModel Performance with PCA (RMSE):')
for name, rmse in results_pca.items():
    print(f'{name}: {rmse:.4f}')

# Pick the winner once and reuse the key.
best_name_pca = min(results_pca, key=results_pca.get)
best_model_pca = best_models_pca[best_name_pca]

print('\nBest Model with PCA:')
print(f'{best_model_pca}: {results_pca[best_name_pca]:.4f}')

print(best_model_pca.get_params())

# Score the winner on the hold-out partition, which the search never saw.
holdout_rmse_pca = np.sqrt(mean_squared_error(y_test, best_model_pca.predict(X_test_pca)))
print(f'Hold-out RMSE of best PCA model: {holdout_rmse_pca:.4f}')

# Visualize explained variance
# The x values start at 1 so that the axis really is "number of components";
# plotting against the bare array index would shift every point left by one.
component_counts = np.arange(1, len(explained_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_counts, explained_variance, marker='o', label="Cumulative Explained Variance")
ax.axhline(y=explained_variance[-1], color='r', linestyle='--', label=f"{explained_variance[-1]:.2f} variance explained")
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance")
ax.set_title("Explained Variance by PCA Components")
ax.legend()
ax.grid()
plt.show()


In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import mean_squared_error

# Forward sequential feature selection with a Ridge regressor, followed by a
# grid search over the Ridge penalty on the selected columns.
model = Ridge()
param_grid = {'alpha': [0.1, 1, 10, 100]}

# Features are added greedily, one at a time, scored by 5-fold negative MSE.
sfs = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    direction='forward',
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit the selector on the training partition and project both partitions.
X_train_sfs = sfs.fit_transform(X_train, y_train)
X_test_sfs = sfs.transform(X_test)

# Tune alpha using only the selected columns.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train_sfs, y_train)

best_model = grid_search.best_estimator_
# The search maximises negative MSE, so negate before taking the root.
best_rmse = np.sqrt(-grid_search.best_score_)  # RMSE

print(
    f'Best Model: {best_model}',
    f'Best RMSE: {best_rmse:.4f}',
    best_model.get_params(),
    sep='\n',
)

# Map the selector's boolean mask back to column names.
selected_features = X.columns[sfs.get_support()]
print(f"Selected Features for Ridge Model: {selected_features}")