In [39]:
import numpy as np
import pandas as pd
import pandas as pd
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
df_application_train = pd.read_csv("./data/application_train.csv")
df_application_test = pd.read_csv("./data/application_test.csv")

## Peform Basic Data Cleansing

In [11]:
# Drop columns with all NaN values
df_application_train = df_application_train.dropna(axis=1, how='all')

# Drop columns with more than 95% NaN values
threshold = len(df_application_train) * 0.95
df_application_train = df_application_train.dropna(axis=1, thresh=threshold)

# Drop NaN values
df_application_train = df_application_train.dropna()

print(df_application_train.isna().sum())

# Convert the DataFrame to numeric, forcing errors to NaN
df_application_train = df_application_train.apply(pd.to_numeric, errors='coerce')

# Now apply your condition
df_application_train[df_application_train < 0] = df_application_train[df_application_train < 0] * -1

# One-hot encode categorical variables
df_application_train = pd.get_dummies(df_application_train)

X = df_application_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y = df_application_train['TARGET']

CNT_CHILDREN                   0
AMT_INCOME_TOTAL               0
AMT_CREDIT                     0
AMT_ANNUITY                    0
AMT_GOODS_PRICE                0
REGION_POPULATION_RELATIVE     0
DAYS_BIRTH                     0
DAYS_EMPLOYED                  0
DAYS_REGISTRATION              0
DAYS_ID_PUBLISH                0
FLAG_MOBIL                     0
FLAG_EMP_PHONE                 0
FLAG_WORK_PHONE                0
FLAG_CONT_MOBILE               0
FLAG_PHONE                     0
FLAG_EMAIL                     0
CNT_FAM_MEMBERS                0
REGION_RATING_CLIENT           0
REGION_RATING_CLIENT_W_CITY    0
HOUR_APPR_PROCESS_START        0
REG_REGION_NOT_LIVE_REGION     0
REG_REGION_NOT_WORK_REGION     0
LIVE_REGION_NOT_WORK_REGION    0
REG_CITY_NOT_LIVE_CITY         0
REG_CITY_NOT_WORK_CITY         0
LIVE_CITY_NOT_WORK_CITY        0
EXT_SOURCE_2                   0
OBS_30_CNT_SOCIAL_CIRCLE       0
DEF_30_CNT_SOCIAL_CIRCLE       0
OBS_60_CNT_SOCIAL_CIRCLE       0
DEF_60_CNT

## Select best features with `Univariate Selection`
> This method uses statistical tests to select features that have the strongest relationship with the output variable.

In [12]:
# Apply SelectKBest class to extract top 10 best features
best_features = SelectKBest(score_func=chi2, k=10)
fit = best_features.fit(X, y)

# Get the scores for each feature
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Concat two dataframes for better visualization
feature_scores = pd.concat([dfcolumns, dfscores], axis=1)
feature_scores.columns = ['Feature', 'Score']
print(feature_scores.nlargest(10, 'Score'))

                   Feature         Score
7            DAYS_EMPLOYED  1.955199e+08
4          AMT_GOODS_PRICE  1.204756e+08
2               AMT_CREDIT  7.558970e+07
6               DAYS_BIRTH  2.233681e+06
8        DAYS_REGISTRATION  1.352872e+06
1         AMT_INCOME_TOTAL  1.342819e+06
31  DAYS_LAST_PHONE_CHANGE  6.737951e+05
9          DAYS_ID_PUBLISH  6.119764e+05
3              AMT_ANNUITY  3.752007e+05
24  REG_CITY_NOT_WORK_CITY  6.128890e+02


In [27]:
univariate_features = set(X.columns[fit.get_support()])
univariate_features

{'AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'AMT_INCOME_TOTAL',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE',
 'DAYS_REGISTRATION',
 'REG_CITY_NOT_WORK_CITY'}

## Select best features with `Recursive Feature Elimination (RFE)`
> RFE works by recursively removing the least important features and building the model with the remaining features. It uses the model accuracy to identify which features contribute the most.


In [13]:
model = RandomForestClassifier()
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
for f in range(X.shape[1]):
    print("%d. Column %s (%f)" %
          (f + 1, X.columns[indices[f]], importances[indices[f]]))

1. Column EXT_SOURCE_2 (0.101917)
2. Column DAYS_BIRTH (0.078488)
3. Column DAYS_ID_PUBLISH (0.076915)
4. Column DAYS_REGISTRATION (0.076806)
5. Column DAYS_EMPLOYED (0.070963)
6. Column AMT_ANNUITY (0.070892)
7. Column DAYS_LAST_PHONE_CHANGE (0.067922)
8. Column AMT_CREDIT (0.063631)
9. Column AMT_INCOME_TOTAL (0.055784)
10. Column REGION_POPULATION_RELATIVE (0.054365)
11. Column AMT_GOODS_PRICE (0.050947)
12. Column HOUR_APPR_PROCESS_START (0.048212)
13. Column OBS_30_CNT_SOCIAL_CIRCLE (0.024349)
14. Column OBS_60_CNT_SOCIAL_CIRCLE (0.024183)
15. Column CNT_FAM_MEMBERS (0.021511)
16. Column CNT_CHILDREN (0.014096)
17. Column FLAG_PHONE (0.009879)
18. Column FLAG_WORK_PHONE (0.008887)
19. Column DEF_30_CNT_SOCIAL_CIRCLE (0.008080)
20. Column REGION_RATING_CLIENT_W_CITY (0.006982)
21. Column REGION_RATING_CLIENT (0.006844)
22. Column DEF_60_CNT_SOCIAL_CIRCLE (0.006381)
23. Column FLAG_DOCUMENT_3 (0.006326)
24. Column REG_CITY_NOT_WORK_CITY (0.006270)
25. Column LIVE_CITY_NOT_WORK_CITY 

In [26]:
rfe_features = set(X.columns[indices[:10]])
rfe_features

{'AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_INCOME_TOTAL',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE',
 'DAYS_REGISTRATION',
 'EXT_SOURCE_2',
 'REGION_POPULATION_RELATIVE'}

## Select best features with `Correlation Coefficient`
> It's is a measure of the linear relationship between 2 or more variables.

In [15]:
# Calculate the correlation matrix
corr_matrix = df_application_train.corr()

# Get the absolute correlation values with the target variable
important_features = corr_matrix['TARGET'].abs().sort_values(ascending=False)
important_features[:10]

TARGET                         1.000000
EXT_SOURCE_2                   0.160819
DAYS_BIRTH                     0.078636
REGION_RATING_CLIENT_W_CITY    0.060900
REGION_RATING_CLIENT           0.058904
DAYS_LAST_PHONE_CHANGE         0.055887
DAYS_ID_PUBLISH                0.051397
REG_CITY_NOT_WORK_CITY         0.051146
DAYS_EMPLOYED                  0.047290
FLAG_EMP_PHONE                 0.046223
Name: TARGET, dtype: float64

In [25]:
correlation_features = set(important_features.index[:10])
correlation_features

{'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE',
 'EXT_SOURCE_2',
 'FLAG_EMP_PHONE',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'TARGET'}

## Select best features with `PCA`
> A dimensionality reduction technique to select the best features

In [16]:
# Fit PCA
pca = PCA(n_components=10)
pca.fit(X)

# Create a DataFrame for the loadings
loadings = pd.DataFrame(pca.components_.T, columns=[f'PC{i+1}' for i in range(pca.n_components_)], index=X.columns)

# Display the loadings
print(loadings)

# Sort the loadings for the first principal component
sorted_loadings = loadings['PC1'].abs().sort_values(ascending=False)
print(sorted_loadings)

                                      PC1           PC2           PC3  \
CNT_CHILDREN                 2.629482e-09  6.167643e-08 -1.255103e-06   
AMT_INCOME_TOTAL             8.409090e-02  9.952276e-01  4.920000e-02   
AMT_CREDIT                   7.342335e-01 -6.491051e-02  6.676856e-03   
AMT_ANNUITY                  2.059913e-02  2.615302e-03 -4.824209e-03   
AMT_GOODS_PRICE              6.731188e-01 -5.487118e-02  1.309400e-02   
REGION_POPULATION_RELATIVE   2.628386e-09  3.196519e-09  7.184185e-10   
DAYS_BIRTH                   4.109482e-04 -1.047689e-03  1.969296e-02   
DAYS_EMPLOYED               -1.778928e-02 -4.785102e-02  9.984559e-01   
DAYS_REGISTRATION           -7.557823e-05 -4.798053e-04  5.388193e-03   
DAYS_ID_PUBLISH              1.619251e-05 -1.158727e-04  2.993441e-03   
FLAG_MOBIL                  -1.274670e-29 -4.098454e-30  8.722570e-29   
FLAG_EMP_PHONE               4.988091e-08  1.317245e-07 -2.750653e-06   
FLAG_WORK_PHONE             -7.895722e-09 -1.333310

In [28]:
pca_features = set(sorted_loadings.index[:10])
pca_features

{'AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'AMT_INCOME_TOTAL',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE',
 'DAYS_REGISTRATION',
 'HOUR_APPR_PROCESS_START'}

In [31]:
# Find common features across all methods
common_features = (univariate_features
                   .intersection(rfe_features)
                   .intersection(pca_features))

# Display the first 10 common features
print("Common Features:", list(common_features)[:10])

Common Features: ['DAYS_EMPLOYED', 'DAYS_LAST_PHONE_CHANGE', 'DAYS_REGISTRATION', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH', 'AMT_INCOME_TOTAL']


In [34]:
common_features_list = list(common_features)
df_train = df_application_train[common_features_list + ["TARGET", "SK_ID_CURR"]]
df_test = df_application_test[common_features_list + ["SK_ID_CURR"]]

In [38]:
df_test.dropna(inplace=True)

DAYS_EMPLOYED              0
DAYS_LAST_PHONE_CHANGE     0
DAYS_REGISTRATION          0
AMT_CREDIT                 0
AMT_ANNUITY               24
DAYS_BIRTH                 0
DAYS_ID_PUBLISH            0
AMT_INCOME_TOTAL           0
SK_ID_CURR                 0
dtype: int64

## Perform `StandardScaling` before exporting files

In [42]:
# Assuming df_train and df_test are already defined

# Separate features and target variable for df_train
X_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(df_test.drop(columns=["SK_ID_CURR"]))

# Create new DataFrames with scaled data
df_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
df_test_scaled = pd.DataFrame(X_test_scaled, columns=df_test.drop(columns=["SK_ID_CURR"]).columns)

# Add back the ID columns
df_train_scaled["TARGET"] = y_train.values
df_train_scaled["SK_ID_CURR"] = df_train["SK_ID_CURR"].values
df_test_scaled["SK_ID_CURR"] = df_test["SK_ID_CURR"].values

# Export to CSV
df_train_scaled.to_csv("./outputs/preprocessed_data/df_train.csv", index=False)
df_test_scaled.to_csv("./outputs/preprocessed_data/df_test.csv", index=False)