## temp

In [37]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Recursive feature elimination with cross-validation (RFECV) over a
# logistic-regression estimator, to see which columns of X_train are kept.

# Label-encode the object/bool columns on a *copy* of the training frame.
# Encoding X_train in place would turn every categorical column into integer
# codes, so any later cell that picks columns by dtype (e.g. the
# ColumnTransformer setup) would no longer see them as categorical.
categorical_features = X_train.select_dtypes(include=['object', 'bool']).columns
X_train_rfe = X_train.copy()
X_train_rfe[categorical_features] = X_train_rfe[categorical_features].apply(
    LabelEncoder().fit_transform  # re-fit on each column in turn by .apply
)

# NOTE(review): the features are not scaled before LogisticRegression here,
# although StandardScaler is imported in this cell — confirm whether scaling
# was intended, since RFE ranks features by the estimator's coefficients.

# Initialize selector: drop one feature per step, score each subset with
# 5-fold cross-validated accuracy.
selector = RFECV(
    estimator=LogisticRegression(max_iter=500, solver='liblinear'),
    step=1,
    cv=5,
    scoring='accuracy'
)

# Fit to data (the encoded copy) and keep only the selected columns
selector.fit(X_train_rfe, y_train)
X_selected = selector.transform(X_train_rfe)


# Get results
print("Optimal features:", selector.n_features_)
print("Feature rankings:", selector.ranking_)


# Get optimal feature names from the frame the selector was fit on, so the
# boolean support mask is guaranteed to line up with the column index.
optimal_features = X_train_rfe.columns[selector.support_]
print("Optimal features:", list(optimal_features))

# Optional: Feature rankings (1 = selected; larger = eliminated earlier)
feature_rankings = pd.DataFrame({
    'Feature': X_train_rfe.columns,
    'Ranking': selector.ranking_
}).sort_values(by='Ranking')
print(feature_rankings)

Optimal features: 8
Feature rankings: [5 2 3 1 1 1 4 1 1 1 1 1]
Optimal features: ['married', 'education', 'arrears', 'housing', 'has_tv_package', 'last_contact', 'last_contact_this_campaign_month', 'this_campaign']
                             Feature  Ranking
3                            married        1
4                          education        1
5                            arrears        1
7                            housing        1
8                     has_tv_package        1
9                       last_contact        1
10  last_contact_this_campaign_month        1
11                     this_campaign        1
1                                age        2
2                                job        3
6                    current_balance        4
0                               town        5


## Data Transformations

- Missing / null values: imputation
- Outlier removal
- Normalisation / scaling
- Categorical encoding
- Majority class: ??? (**TODO**)
- Minority class: SMOTE? (**TODO** — plain SMOTE is not suitable for categorical features)

- **TODO:** use `MinMaxScaler` instead, as the data is non-Gaussian?

```python
from imblearn.over_sampling import SMOTENC
obj = SMOTENC(categorical_features=[1, 4])
oversampled_features, oversampled_target = obj.fit_resample(Features, Target)
```


```python
from imblearn.over_sampling import SMOTENC
# Indices of categorical features (columns 1 and 2)
smote_nc = SMOTENC(categorical_features=[1, 2], random_state=42)
X_res, y_res = smote_nc.fit_resample(X, y)
```



In [30]:
# Encode & Scale The Data
# Use Column transformer to establish a repeatable setup for the data prep required

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

# Split the training columns by dtype: object/bool columns are one-hot
# encoded, int64/float64 columns are scaled.
categorical_features = X_train.select_dtypes(include=['object', 'bool']).columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

print(f'Numerics {len(numerical_features)} \n', numerical_features)
print(f'Categoricals {len(categorical_features)} \n', categorical_features)

# One (name, transformer, columns) triple per column group.
transformations = [
    (
        'cat',
        OneHotEncoder(drop='first', handle_unknown='infrequent_if_exist'),
        categorical_features,
    ),
    (
        'num',
        StandardScaler(),  # MinMaxScaler() is the alternative still being considered
        numerical_features,
    ),
]

# Learn the encoding/scaling parameters from the training data only.
col_transform = ColumnTransformer(transformers=transformations)
col_transform.fit(X_train)



Numerics 3 
 Index(['age', 'current_balance', 'this_campaign'], dtype='object')
Categoricals 9 
 Index(['town', 'job', 'married', 'education', 'arrears', 'housing',
       'has_tv_package', 'last_contact', 'last_contact_this_campaign_month'],
      dtype='object')


In [None]:
# Apply the already-fitted column transformer to both feature splits.
# The targets (y_train, y_test) are left untouched.

X_train_encoded, X_test_encoded = (
    col_transform.transform(split) for split in (X_train, X_test)
)

# Report the resulting matrix shapes as a sanity check.
display(f'X_Train Encoded: {X_train_encoded.shape}')
display(f'X_test_encoded: {X_test_encoded.shape}')
#display(col_transform.get_feature_names_out())

In [None]:
from imblearn.over_sampling import SMOTENC
# Oversample the minority class of the training split with SMOTENC.
# SMOTENC must be told which columns are categorical. Derive their positions
# from the dtypes of X_train rather than hard-coding indices: a fixed
# [0, 1, 2] would mark whatever happens to sit in the first three columns as
# categorical and treat every other object/bool column as continuous.
smote_categorical_cols = X_train.select_dtypes(include=['object', 'bool']).columns
smote_categorical_idx = [X_train.columns.get_loc(col) for col in smote_categorical_cols]

smote_nc = SMOTENC(categorical_features=smote_categorical_idx, random_state=42)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)
