# Loading some packages

In [6]:
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from sklearn.feature_selection import r_regression, SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_classif

from scipy import stats

from statsmodels.stats.outliers_influence import variance_inflation_factor

from boruta import BorutaPy

from collections import Counter

import os
from pathlib import Path

# Loading the data

In [8]:
# Directory that holds the prepared dataset (absolute, machine-specific path).
DATA_PATH = Path(r'C:/\Users/\vchar/\OneDrive/\Desktop/\ML Projects/\portfolio/\ArmenianHousePrices/\notebooks/\data')

data = pd.read_csv(DATA_PATH / 'final_data.csv')

# Categorical candidates: every object-typed column except the ones listed in
# remove_cat_list.
remove_cat_list = [
    'description',
    'address',
    'seller_type',
    'total_description',
    'renewed_date',
    'seller_id',
    'region',
    'geo_location',
    'estate_type',
]
cat_feats_list = [
    col
    for col in data.select_dtypes(include=['object']).columns
    if col not in remove_cat_list
]

# The categorical columns are split into ordinal ones (named explicitly) and
# nominal ones (everything else).
ordinal_feats_list = ['Number of Rooms', 'Number of Bathrooms']
nominal_feats_list = [col for col in cat_feats_list if col not in ordinal_feats_list]

# Numeric candidates: every non-object column except the ones listed in
# remove_num_feats_list (which includes the target itself).
remove_num_feats_list = ['posted_date', 'is_capital', 'sqm_price_usd']
num_feats_list = [
    col
    for col in data.select_dtypes(exclude=['object']).columns
    if col not in remove_num_feats_list
]

# Model inputs are the categorical columns followed by the numeric ones.
feats_name_list = cat_feats_list + num_feats_list

# Target as a one-column frame, features as an independent copy.
y = data[['sqm_price_usd']].copy()
X = data[feats_name_list].copy()

# Feature elimination

## Removing low variance features

In [None]:
# Re-derive the column groups from X itself. The pandas method is
# `select_dtypes`; numeric columns are taken as "everything that is not
# object-typed", the same rule used when the feature lists were first built.
num_feats_list = list(X.select_dtypes(exclude=['object']).columns)
cat_feats_list = list(X.select_dtypes(include=['object']).columns)

# Filter out numeric features with very low variance (threshold 0.05).
# fit_transform returns a plain array; the surviving column names can be read
# from variance_selector.get_support().
variance_selector = VarianceThreshold(threshold=0.05)
X_selection = variance_selector.fit_transform(X[num_feats_list])

## Variance Inflation Factor

In [None]:
def calculate_vif(X):
    """Build a table of variance inflation factors for the columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are scored against each other.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with the column name in ``"features"``
        and its variance inflation factor in ``"VIF"``.
    """
    design_matrix = X.values
    scores = [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(X.shape[1])
    ]
    return pd.DataFrame({"features": X.columns, "VIF": scores})

def select_feats_via_vif(X):
    """Iteratively drop the feature with the highest VIF until all are below 10.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; the frame passed in is not modified.

    Returns
    -------
    list
        Names of the columns that remain once no variance inflation factor
        is 10 or higher (or only one column is left).
    """
    remaining = X
    vif_df = calculate_vif(remaining)

    # Stop at a single column: there is nothing left to be collinear with.
    while remaining.shape[1] > 1 and (vif_df['VIF'] >= 10).any():
        # idxmax picks the worst offender directly, so no sorting is needed.
        elimination_candidate = vif_df.loc[vif_df['VIF'].idxmax(), 'features']
        # Drop from the running frame so eliminations accumulate across passes.
        remaining = remaining.drop(columns=[elimination_candidate])
        vif_df = calculate_vif(remaining)

    return list(vif_df['features'].values)

In [None]:
# Wrapper-based selection: forward sequential selection and recursive feature
# elimination, each keeping three features.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# NOTE(review): both estimators need a fully numeric X without missing values;
# presumably encoding/imputation is applied before this cell -- confirm.

# y is a one-column frame, while scikit-learn estimators expect a 1-D target.
y_1d = y.values.ravel()

# The target is a continuous price, so regressors are used; classifiers
# reject continuous targets.
knn = KNeighborsRegressor(n_neighbors=3)
sfs = SequentialFeatureSelector(knn, n_features_to_select=3, direction="forward")
sfs.fit(X, y_1d)
X_selection = sfs.transform(X)

# RFE ranks features by the fitted coefficients, which a linear-kernel SVR exposes.
svr = SVR(kernel="linear")
rfe = RFE(svr, n_features_to_select=3)
rfe.fit(X, y_1d)
X_selection = rfe.transform(X)

## Feature selection class

In [None]:
class FSelector:
    """Select features by majority vote across several selection methods.

    Numeric features get votes from Pearson, Spearman and Kendall rankings,
    ordinal features from Spearman and Kendall rankings, and nominal features
    from the ANOVA F-test and mutual information.  When ``model`` is given,
    Boruta casts one extra vote for every feature it confirms.

    NOTE(review): the scikit-learn selectors and Boruta need numeric input
    without missing values, so ``X`` is presumably already encoded and
    imputed -- confirm against the caller.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are addressed by name.
    y : pandas.DataFrame, pandas.Series or array-like
        Single target column.
    num_feats, ordinal_feats, nominal_feats : list of str or None
        Column names of each feature group; ``None`` or empty skips the group.
    model : estimator or None
        Estimator handed to ``BorutaPy``; ``None`` skips the Boruta vote.
    k : int, default 15
        Upper bound on how many features each univariate ranking keeps.
    """

    def __init__(self, X, y, num_feats, ordinal_feats, nominal_feats, model, k=15):
        self.X = X
        self.y = y
        self.num_feats = num_feats
        self.ordinal_feats = ordinal_feats
        self.nominal_feats = nominal_feats
        self.model = model
        self.k = k

    # ------------------------------------------------------------------ helpers

    def _target(self):
        """Return the target as a 1-D array regardless of its container."""
        return pd.DataFrame(self.y).values.ravel()

    @staticmethod
    def _has_feats(feats):
        """Tell whether a feature group was supplied and is non-empty."""
        return feats is not None and len(feats) > 0

    @staticmethod
    def _abs_scores(score_func):
        """Wrap a correlation scorer so features are ranked by magnitude."""
        def scorer(X, y):
            return [abs(score) for score in score_func(X, y)]
        return scorer

    @staticmethod
    def _tally(votes, min_votes):
        """Return the features named at least ``min_votes`` times in ``votes``."""
        counts = Counter(votes)
        return [feat for feat, n_votes in counts.items() if n_votes >= min_votes]

    def _columnwise_stat(self, stat_func, X=None, y=None):
        """Apply a scipy correlation test to every column of ``X`` against ``y``."""
        features = pd.DataFrame(self.X if X is None else X).values
        target = pd.DataFrame(self.y if y is None else y).values.ravel()
        # Index 0 of the scipy result is the statistic itself.
        return [
            stat_func(features[:, col_idx], target)[0]
            for col_idx in range(features.shape[1])
        ]

    def _top_k_feats(self, score_func, feats):
        """Return the names of the (at most ``k``) best ``feats`` under ``score_func``."""
        feats = list(feats)
        # A group can be smaller than k (e.g. two ordinal columns), so cap it.
        selector = SelectKBest(score_func, k=min(self.k, len(feats)))
        selector.fit(self.X[feats], self._target())
        # fit_transform would return a bare array; the support mask keeps names.
        return [name for name, keep in zip(feats, selector.get_support()) if keep]

    # --------------------------------------------------------------------- VIF

    def calculate_vif(self, X):
        """Return a frame with each column of ``X`` and its variance inflation factor."""
        vif = pd.DataFrame()
        vif["features"] = X.columns
        vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

        return vif

    def select_feats_via_vif(self, threshold=10):
        """Drop the numeric feature with the highest VIF until all are below ``threshold``.

        Returns
        -------
        list
            Names of the numeric features that survive the elimination.
        """
        remaining = self.X[list(self.num_feats)]
        vif_df = self.calculate_vif(remaining)

        # Stop at a single column: there is nothing left to be collinear with.
        while remaining.shape[1] > 1 and (vif_df['VIF'] >= threshold).any():
            elimination_candidate = vif_df.loc[vif_df['VIF'].idxmax(), 'features']
            # Drop from the running frame so eliminations accumulate.
            remaining = remaining.drop(columns=[elimination_candidate])
            vif_df = self.calculate_vif(remaining)

        return list(vif_df['features'].values)

    # ------------------------------------------------------- correlation scores

    def get_spearmanr(self, X=None, y=None):
        """Spearman correlation of each column with the target.

        ``X`` and ``y`` default to the data given at construction; accepting
        them explicitly lets the method serve as a ``SelectKBest`` score function.
        """
        return self._columnwise_stat(stats.spearmanr, X, y)

    def get_kendalltau(self, X=None, y=None):
        """Kendall tau of each column with the target (same calling rules as ``get_spearmanr``)."""
        return self._columnwise_stat(stats.kendalltau, X, y)

    def get_pointbiserialr(self, X=None, y=None):
        """Point-biserial correlation of each column with the target (same calling rules as ``get_spearmanr``)."""
        return self._columnwise_stat(stats.pointbiserialr, X, y)

    # ------------------------------------------------------------------ Boruta

    def get_boruto_feats(self, model):
        """Run Boruta with ``model`` on all of ``X`` and return its boolean support mask."""
        feat_selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=1)
        # Plain arrays and a 1-D target are passed rather than pandas objects.
        feat_selector.fit(self.X.values, self._target())
        return feat_selector.support_

    # ------------------------------------------------------------------ voting

    def get_votes(self):
        """Run every configured selector and combine their votes.

        Returns
        -------
        list
            Features that win their group's vote (numeric: at least 3 votes,
            ordinal and nominal: at least 2) followed by every feature Boruta
            confirmed, without duplicates.
        """
        # Start from empty results so a skipped group or a missing model
        # cannot leave an attribute undefined further down.
        self.vif_feats = []
        self.boruto_feats = []
        self.selected_num_feats = []
        self.selected_ordinal_feats = []
        self.selected_nominal_feats = []

        if self.model is not None:
            support_mask = self.get_boruto_feats(self.model)
            # Translate the boolean mask into column names so it can vote.
            self.boruto_feats = [
                name for name, keep in zip(self.X.columns, support_mask) if keep
            ]
        boruta_confirmed = set(self.boruto_feats)

        if self._has_feats(self.num_feats):

            # Kept for inspection; the VIF survivors do not take part in the vote.
            self.vif_feats = self.select_feats_via_vif()

            # Correlations are ranked by absolute value so that strongly
            # negative relationships are not thrown away.
            self.pearson_feats = self._top_k_feats(
                self._abs_scores(r_regression), self.num_feats)
            self.num_spearmanr_feats = self._top_k_feats(
                self._abs_scores(self.get_spearmanr), self.num_feats)
            self.num_kendalltau_feats = self._top_k_feats(
                self._abs_scores(self.get_kendalltau), self.num_feats)

            num_votes = (
                self.pearson_feats
                + self.num_spearmanr_feats
                + self.num_kendalltau_feats
                + [feat for feat in self.num_feats if feat in boruta_confirmed]
            )
            self.selected_num_feats = self._tally(num_votes, min_votes=3)

        if self._has_feats(self.ordinal_feats):

            self.ordinal_spearmanr_feats = self._top_k_feats(
                self._abs_scores(self.get_spearmanr), self.ordinal_feats)
            self.ordinal_kendalltau_feats = self._top_k_feats(
                self._abs_scores(self.get_kendalltau), self.ordinal_feats)

            ordinal_votes = (
                self.ordinal_spearmanr_feats
                + self.ordinal_kendalltau_feats
                + [feat for feat in self.ordinal_feats if feat in boruta_confirmed]
            )
            self.selected_ordinal_feats = self._tally(ordinal_votes, min_votes=2)

        if self._has_feats(self.nominal_feats):

            # NOTE(review): f_classif is documented for classification
            # targets; confirm it is the intended test for this target.
            self.f_feats = self._top_k_feats(f_classif, self.nominal_feats)
            self.mi_feats = self._top_k_feats(mutual_info_regression, self.nominal_feats)

            nominal_votes = (
                self.f_feats
                + self.mi_feats
                + [feat for feat in self.nominal_feats if feat in boruta_confirmed]
            )
            self.selected_nominal_feats = self._tally(nominal_votes, min_votes=2)

        combined = (
            self.selected_num_feats
            + self.selected_ordinal_feats
            + self.selected_nominal_feats
            + self.boruto_feats
        )
        # dict.fromkeys removes duplicates while keeping first-seen order.
        self.selected_feats = list(dict.fromkeys(combined))

        return self.selected_feats

## Fold creation

In [None]:
# Five shuffled folds with a fixed seed; each (train indices, validation
# indices) pair produced by the splitter is stored in a list.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_split_list = list(kfold.split(X))

In [None]:
# remove numeric features with low variance
# use vif to eliminate correlated features
# use mutual_info for preliminary feature elimination
# RandomForest: use scores to eliminate features
# XGBoost: use scores to eliminate features
# LightGBM: use scores to eliminate features

# create one hot features
# Lasso: eliminate features with 0 coefficients
# SVM: 

In [None]:
# num_pipeline = Pipeline(
#     steps=[
#         ('imputer', SimpleImputer(strategy='median')),
#         ('scaler', StandardScaler())
#     ]
# )

# cat_pipeline = Pipeline(
#     steps=[
#         ('imputer', SimpleImputer(strategy='most_frequent')),
#         ('ordinalencoder', OrdinalEncoder(categories=[list different categories here])),
#         ('scaler', StandardScaler())
#     ]
# )

# preprocessor = ColumnTransformer(
#     ('num_pipeline', num_pipeline, numerical_cols),
#     ('cat_pipeline', cat_pipeline, cat_cols)
# )

# preprocessor.fit_transform(), preprocessor.get_feature_names_out()