In [21]:
import os
import warnings
from functools import partial

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Read Datasets

In [55]:
# Dataset directory. Set the HOME_CREDIT_DATA_DIR environment variable to run the
# notebook on another machine; otherwise the original local path is used.
path = os.environ.get(
    'HOME_CREDIT_DATA_DIR',
    r'C:\Users\rifky\Downloads\Rakamin Project-Based Virtual Internship\Home Credit Indonesia (Data Science)\Final Project\dataset',
)

app_train = pd.read_csv(os.path.join(path, 'application_train.csv'))
print('Train data shape:  ', app_train.shape)
app_test = pd.read_csv(os.path.join(path, 'application_test.csv'))
print('Test data shape:  ', app_test.shape)

Train data shape:   (307511, 122)
Test data shape:   (48744, 121)


# Data Preprocessing

## Reapply the changes made during EDA

In [56]:
# DAYS_* columns, collected before the DAYS_EMPLOYED_ANOM flag is added so the
# boolean flag is never part of this list.
days_col = [col for col in app_train if 'DAYS' in col]

# Apply the same cleaning to the train and the test data.
for frame in (app_train, app_test):
    frame[days_col] = frame[days_col].abs()
    # Flag the 365243 placeholder before it is blanked out.
    frame['DAYS_EMPLOYED_ANOM'] = frame['DAYS_EMPLOYED'] == 365243
    # Assign the result back: `frame[col].replace(..., inplace=True)` works on a
    # temporary Series and does not update the frame under pandas Copy-on-Write.
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].replace({365243: np.nan})

## Drop Columns with More Than 50% Missing Values

In [57]:
# Percentage of missing values per training column
missing_percent = app_train.isna().mean().mul(100)

# Columns whose missing share in the training data exceeds 50%
missing_over_50 = missing_percent.index[missing_percent > 50]

# Drop them from both sets; the test set may not contain every one of them
app_train_after_drop = app_train.drop(columns=missing_over_50)
app_test = app_test.drop(columns=missing_over_50, errors='ignore')

print('Dropped Columns: \n', missing_over_50)
print('Train data shape before drop: ', app_train.shape)
print('Train data shape after drop: ', app_train_after_drop.shape)

Dropped Columns: 
 Index(['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG',
       'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG',
       'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG',
       'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG',
       'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BUILD_MODE',
       'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMIN_MODE',
       'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE',
       'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI',
       'BASEMENTAREA_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI',
       'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI',
       'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI',
       'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
       'WALLSMATERIAL_MODE'],
      dtype='object')
Train data shape before drop:  (307511, 123)
Train dat

### Categorical Missing Values

In [58]:
# Continue with the training frame from which the >50%-missing columns were dropped
app_train = app_train_after_drop

In [59]:
# Work on the string-typed columns only, treating the 'XNA' placeholder as 'Unknown'
app_train_cat = app_train.select_dtypes(object).replace('XNA', 'Unknown')

# Fill missing values in these two columns with 'Unknown'
for unknown_col in ('OCCUPATION_TYPE', 'EMERGENCYSTATE_MODE'):
    app_train_cat[unknown_col] = app_train[unknown_col].fillna('Unknown')

# Drop the rows whose 'CODE_GENDER' or 'NAME_FAMILY_STATUS' is 'Unknown' (only a few rows)
has_known_gender = app_train_cat['CODE_GENDER'] != 'Unknown'
has_known_family_status = app_train_cat['NAME_FAMILY_STATUS'] != 'Unknown'
app_train_cat = app_train_cat[has_known_gender & has_known_family_status]

In [60]:
# Apply the same cleaning to the test data
app_test_cat = app_test.select_dtypes(object).replace('XNA', 'Unknown')

for unknown_col in ('OCCUPATION_TYPE', 'EMERGENCYSTATE_MODE'):
    app_test_cat[unknown_col] = app_test[unknown_col].fillna('Unknown')

# NOTE(review): this filter can remove rows from the test set (and with them their
# SK_ID_CURR values) — confirm that is intended.
has_known_gender = app_test_cat['CODE_GENDER'] != 'Unknown'
has_known_family_status = app_test_cat['NAME_FAMILY_STATUS'] != 'Unknown'
app_test_cat = app_test_cat[has_known_gender & has_known_family_status]

#### Remaining null (NaN) values in 'NAME_TYPE_SUITE' are handled after train_test_split; 'EMERGENCYSTATE_MODE' and 'OCCUPATION_TYPE' were already filled with 'Unknown' above

In [61]:
# Training data: keep only the rows that survived the categorical cleaning,
# then write the cleaned object columns back into the frame.
app_train = app_train.loc[app_train_cat.index]
app_train.update(app_train_cat)

# Test data: same two steps.
app_test = app_test.loc[app_test_cat.index]
app_test.update(app_test_cat)

In [62]:
# Keep the test identifiers aside before the ID column is removed from both frames
app_test_id = app_test['SK_ID_CURR']

app_train = app_train.drop(columns='SK_ID_CURR')
app_test = app_test.drop(columns='SK_ID_CURR')

### > Impute Numerical, Binary, and Categorical Missing Values

In [63]:
def fit_imputer(df, cat_cols=None):
    """Fit a column-wise imputer on ``df``.

    Columns are grouped as follows:

    * binary: ``int64``/``float64`` columns whose non-missing values are all
      in ``{0, 1}`` -> imputed with the most frequent value;
    * numeric: the remaining ``int64``/``float64`` columns -> imputed with the
      median;
    * categorical: ``cat_cols`` -> imputed with the most frequent value.

    Every other column is passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Data the imputer is fitted on.
    cat_cols : list of str, optional
        Categorical columns to impute. Defaults to ``['NAME_TYPE_SUITE']``.

    Returns
    -------
    imputer : ColumnTransformer
        The fitted transformer.
    ordered_cols : list of str
        ``binary + numeric + categorical`` column names, in the order they were
        passed to the transformer.
    """
    cat_cols = ['NAME_TYPE_SUITE'] if cat_cols is None else list(cat_cols)

    # Detect the numeric columns once, then split off the binary ones
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    binary_cols = [col for col in numeric_cols
                   if set(df[col].dropna().unique()) <= {0, 1}]
    binary_lookup = set(binary_cols)
    num_cols = [col for col in numeric_cols if col not in binary_lookup]

    imputer = ColumnTransformer(
        transformers=[
            ('bin', SimpleImputer(strategy='most_frequent'), binary_cols),
            ('num', SimpleImputer(strategy='median'), num_cols),
            ('cat', SimpleImputer(strategy='most_frequent'), cat_cols)
        ],
        remainder='passthrough'
    )

    imputer.fit(df)

    return imputer, binary_cols + num_cols + cat_cols

In [64]:
def transform_imputer(df, imputer, ordered_cols):
    """Run ``imputer.transform`` on ``df`` and wrap the result in a DataFrame.

    The output columns are labelled with ``ordered_cols`` first, followed by the
    columns of ``df`` that are not in ``ordered_cols`` (in ``df`` order). The
    index of ``df`` is kept.
    """
    imputed_values = imputer.transform(df)

    # Columns of df that are not listed in ordered_cols, labelled after them
    untouched_cols = [name for name in df.columns if name not in ordered_cols]

    return pd.DataFrame(
        imputed_values,
        columns=ordered_cols + untouched_cols,
        index=df.index,
    )

In [65]:
# Separate the target from the features
y = app_train['TARGET']
X = app_train.drop(columns='TARGET')

# Stratified 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=50
)

In [66]:
# Fit on the training split only, then apply the same imputer to all three sets
imputer, imputed_cols = fit_imputer(X_train)

X_train_filled, X_val_filled, app_test_filled = (
    transform_imputer(frame, imputer, imputed_cols)
    for frame in (X_train, X_val, app_test)
)

In [67]:
# Column dtypes of app_train, used as the reference when casting columns back
original_dtypes = app_train.dtypes


def ori_dtype(df):
    """Cast the columns of ``df`` to the dtypes recorded in ``original_dtypes``.

    Columns not present in ``original_dtypes`` are left alone. A column whose
    cast fails is reported with ``print`` and kept unchanged. ``df`` is modified
    in place and also returned.
    """
    known_cols = [name for name in df.columns if name in original_dtypes]
    for name in known_cols:
        target_dtype = original_dtypes[name]
        try:
            df[name] = df[name].astype(target_dtype)
        except Exception as err:
            print(f"Gagal casting kolom '{name}' ke {target_dtype}: {err}")
    return df

In [68]:
X_train_filled = ori_dtype(X_train_filled)
X_val_filled = ori_dtype(X_val_filled)
app_test_filled = ori_dtype(app_test_filled)

### > Label Encoding Binary Categorical Variables

In [69]:
# Label-encode the object columns that have exactly two distinct values in the
# training split. The encoder is fitted on the training data only.
le_dict = {}
le_count = 0

for col in X_train_filled.select_dtypes(include='object').columns:
    if X_train_filled[col].nunique() != 2:
        continue

    le = LabelEncoder()
    X_train_filled[col] = le.fit_transform(X_train_filled[col])

    # class -> integer code, i.e. the position of the class in le.classes_.
    # A dict lookup replaces one le.transform() call per row; values that were
    # not seen during fit become NaN.
    class_to_code = {label: code for code, label in enumerate(le.classes_)}
    X_val_filled[col] = X_val_filled[col].map(class_to_code)
    app_test_filled[col] = app_test_filled[col].map(class_to_code)

    le_dict[col] = le
    le_count += 1

print(f"{le_count} columns were label encoded")

4 columns were label encoded


### One-Hot Encoding Categorical Variables

In [70]:
# Categorical columns that are still string-typed
cat_cols = X_train_filled.select_dtypes(include='object').columns.tolist()

# Split them by the number of distinct categories in the training split
low_card_cols = [col for col in cat_cols if X_train_filled[col].nunique() <= 10]
high_card_cols = [col for col in cat_cols if X_train_filled[col].nunique() > 10]


def _encode_like_train(data, train_columns, **dummy_kwargs):
    """One-hot encode ``data`` and align it to the training dummy columns.

    All levels are encoded (no ``drop_first``) and the result is then reindexed
    to ``train_columns``. This way the level dropped as baseline is always the
    one dropped for the training split, even when ``data`` lacks that level;
    running ``drop_first=True`` independently on ``data`` would drop a different
    level and silently encode its rows as the baseline. Dummy columns missing
    from ``data`` are filled with 0; columns unknown to train are discarded.
    """
    return pd.get_dummies(data, **dummy_kwargs).reindex(columns=train_columns, fill_value=0)


# One-hot encoding for the low-cardinality columns
X_train_low = pd.get_dummies(X_train_filled[low_card_cols], drop_first=True)
X_val_low = _encode_like_train(X_val_filled[low_card_cols], X_train_low.columns)
app_test_low = _encode_like_train(app_test_filled[low_card_cols], X_train_low.columns)

# High-cardinality columns: keep the top-N training categories, group the rest as 'Other'
top_k = 15
X_train_high_encoded = []
X_val_high_encoded = []
app_test_high_encoded = []

for col in high_card_cols:
    top_cats = X_train_filled[col].value_counts().nlargest(top_k).index

    # Replace the rare categories with 'Other'
    train_reduced = X_train_filled[col].where(X_train_filled[col].isin(top_cats), 'Other')
    val_reduced = X_val_filled[col].where(X_val_filled[col].isin(top_cats), 'Other')
    test_reduced = app_test_filled[col].where(app_test_filled[col].isin(top_cats), 'Other')

    # One-hot encode; validation/test are aligned to the training columns
    train_dummies = pd.get_dummies(train_reduced, prefix=col, drop_first=True)
    val_dummies = _encode_like_train(val_reduced, train_dummies.columns, prefix=col)
    test_dummies = _encode_like_train(test_reduced, train_dummies.columns, prefix=col)

    X_train_high_encoded.append(train_dummies)
    X_val_high_encoded.append(val_dummies)
    app_test_high_encoded.append(test_dummies)

# Combine the non-categorical columns with all encoded columns
X_train_encoded = pd.concat(
    [X_train_filled.drop(columns=cat_cols), X_train_low] + X_train_high_encoded, axis=1)

X_val_encoded = pd.concat(
    [X_val_filled.drop(columns=cat_cols), X_val_low] + X_val_high_encoded, axis=1)

app_test_encoded = pd.concat(
    [app_test_filled.drop(columns=cat_cols), app_test_low] + app_test_high_encoded, axis=1)

# Check the final shapes
print("Train encoded shape:", X_train_encoded.shape)
print("Validation encoded shape:", X_val_encoded.shape)
print("application_test encoded shape:", app_test_encoded.shape)

Train encoded shape: (246004, 134)
Validation encoded shape: (61501, 134)
application_test encoded shape: (48744, 134)


# Feature Selection

### Check Correlation

In [71]:
# Correlation of every FLAG_DOCUMENT_* column with the training target
flag_col = X_train_encoded.filter(like='FLAG_DOCUMENT').columns.tolist()
corr_with_y = X_train_encoded[flag_col].corrwith(y_train)
corr_with_y.sort_values(ascending=False)

FLAG_DOCUMENT_3     0.043661
FLAG_DOCUMENT_2     0.006476
FLAG_DOCUMENT_21    0.004215
FLAG_DOCUMENT_20    0.000977
FLAG_DOCUMENT_5     0.000295
FLAG_DOCUMENT_12   -0.000597
FLAG_DOCUMENT_19   -0.000717
FLAG_DOCUMENT_10   -0.001336
FLAG_DOCUMENT_7    -0.001514
FLAG_DOCUMENT_4    -0.002535
FLAG_DOCUMENT_11   -0.002717
FLAG_DOCUMENT_17   -0.003083
FLAG_DOCUMENT_9    -0.003697
FLAG_DOCUMENT_15   -0.006177
FLAG_DOCUMENT_8    -0.008319
FLAG_DOCUMENT_18   -0.008480
FLAG_DOCUMENT_14   -0.008793
FLAG_DOCUMENT_16   -0.010758
FLAG_DOCUMENT_13   -0.011285
FLAG_DOCUMENT_6    -0.028417
dtype: float64

In [72]:
# Keep only the first flag of the descending (signed) correlation ranking and drop the rest.
# NOTE(review): the ranking uses the signed correlation, not its absolute value — confirm intent.
ranked_flags = corr_with_y.sort_values(ascending=False)
flag_to_drop = ranked_flags.index[1:]

X_train_encoded, X_val_encoded, app_test_encoded = (
    frame.drop(columns=flag_to_drop)
    for frame in (X_train_encoded, X_val_encoded, app_test_encoded)
)

In [73]:
# Absolute pairwise correlations between the training features
corr_matrix = X_train_encoded.corr().abs()

# Keep the strict upper triangle only, so each pair appears once and the diagonal is excluded
upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
high_corr_pairs = corr_matrix.where(upper)

# Show the column pairs whose correlation exceeds 0.85
pair_corr = high_corr_pairs.stack()
high_corr = pair_corr[pair_corr > 0.85]
print(high_corr)

FLAG_EMP_PHONE                        DAYS_EMPLOYED_ANOM                                   0.999862
                                      NAME_INCOME_TYPE_Pensioner                           0.999545
                                      ORGANIZATION_TYPE_Unknown                            0.999862
REG_REGION_NOT_WORK_REGION            LIVE_REGION_NOT_WORK_REGION                          0.861056
CNT_CHILDREN                          CNT_FAM_MEMBERS                                      0.879117
AMT_CREDIT                            AMT_GOODS_PRICE                                      0.986672
REGION_RATING_CLIENT                  REGION_RATING_CLIENT_W_CITY                          0.950630
YEARS_BEGINEXPLUATATION_AVG           YEARS_BEGINEXPLUATATION_MODE                         0.971808
                                      YEARS_BEGINEXPLUATATION_MEDI                         0.993555
FLOORSMAX_AVG                         FLOORSMAX_MODE                                       0.986532


In [74]:
# Hand-picked groups of mutually correlated features, one group per line
high_corr_feat = [
    'FLAG_EMP_PHONE', 'DAYS_EMPLOYED_ANOM', 'NAME_INCOME_TYPE_Pensioner', 'ORGANIZATION_TYPE_Unknown',
    'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
    'CNT_CHILDREN', 'CNT_FAM_MEMBERS',
    'AMT_CREDIT', 'AMT_GOODS_PRICE',
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BEGINEXPLUATATION_MEDI',
    'FLOORSMAX_AVG', 'FLOORSMAX_MODE', 'FLOORSMAX_MEDI',
    'OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    'NAME_EDUCATION_TYPE_Higher education', 'NAME_EDUCATION_TYPE_Secondary / secondary special',
]

# Correlation of each of these features with the training target, highest first
high_corr_y = X_train_encoded[high_corr_feat].corrwith(y_train)
high_corr_y.sort_values(ascending=False)

REGION_RATING_CLIENT_W_CITY                          0.060405
REGION_RATING_CLIENT                                 0.057964
NAME_EDUCATION_TYPE_Secondary / secondary special    0.049616
FLAG_EMP_PHONE                                       0.046821
DEF_30_CNT_SOCIAL_CIRCLE                             0.032636
DEF_60_CNT_SOCIAL_CIRCLE                             0.031422
CNT_CHILDREN                                         0.019697
OBS_30_CNT_SOCIAL_CIRCLE                             0.009763
OBS_60_CNT_SOCIAL_CIRCLE                             0.009652
CNT_FAM_MEMBERS                                      0.009467
REG_REGION_NOT_WORK_REGION                           0.008444
LIVE_REGION_NOT_WORK_REGION                          0.003552
YEARS_BEGINEXPLUATATION_MODE                        -0.003639
YEARS_BEGINEXPLUATATION_AVG                         -0.004517
YEARS_BEGINEXPLUATATION_MEDI                        -0.004649
AMT_CREDIT                                          -0.030762
FLOORSMA

In [75]:
# Columns removed from all three sets
columns_to_drop = [
    'FLAG_EMP_PHONE', 'AMT_GOODS_PRICE', 'FLOORSMAX_MODE', 'FLOORSMAX_MEDI',
    'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BEGINEXPLUATATION_AVG', 'REGION_RATING_CLIENT',
    'ORGANIZATION_TYPE_Unknown', 'DAYS_EMPLOYED_ANOM',
    'NAME_EDUCATION_TYPE_Secondary / secondary special',
    'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS',
    'LIVE_REGION_NOT_WORK_REGION',
]

X_train_encoded, X_val_encoded, app_test_encoded = (
    frame.drop(columns=columns_to_drop)
    for frame in (X_train_encoded, X_val_encoded, app_test_encoded)
)

In [76]:
# Shapes after feature removal. X_val_encoded is the validation split, so it is
# labelled "Validation", consistent with the shape report after encoding.
print("Train encoded shape:", X_train_encoded.shape)
print("Validation encoded shape:", X_val_encoded.shape)
print("application_test encoded shape:", app_test_encoded.shape)

Train encoded shape: (246004, 101)
test encoded shape: (61501, 101)
application_test encoded shape: (48744, 101)


### Scaling with MinMaxScaler

In [98]:
# Fit the scaler on the training split only, then apply it to validation and test.
mms = MinMaxScaler()

# The scaler returns plain arrays; rebuild the DataFrames with the original column
# names AND the original index, so the rows stay label-aligned with y_train / y_val.
X_train_scaled = pd.DataFrame(
    mms.fit_transform(X_train_encoded),
    columns=X_train_encoded.columns,
    index=X_train_encoded.index,
)
X_val_scaled = pd.DataFrame(
    mms.transform(X_val_encoded),
    columns=X_val_encoded.columns,
    index=X_val_encoded.index,
)
final_app_test = pd.DataFrame(
    mms.transform(app_test_encoded),
    columns=app_test_encoded.columns,
    index=app_test_encoded.index,
)

### SelectKBest Method

In [107]:
class FeatureSelector:
    """Thin wrapper around ``SelectKBest`` that keeps feature names and scores."""

    def __init__(self, method, k=30):
        """
        method: scoring function passed to ``SelectKBest`` as ``score_func``
        k: number of best features to select
        """
        self.method = method
        self.k = k
        self.selector = SelectKBest(score_func=self.method, k=self.k)
        # Both stay None until fit() has been called
        self.feature_scores_ = None
        self.selected_features_ = None

    def fit(self, X, y):
        """Fit the selector and store the per-feature scores.

        ``X`` must expose ``.columns`` (a DataFrame). Returns ``self``.
        """
        self.selector.fit(X, y)
        self.feature_scores_ = pd.DataFrame({
            "Features": X.columns,
            "Score": self.selector.scores_
        }).sort_values(by="Score", ascending=False)
        self.selected_features_ = X.columns[self.selector.get_support()]
        return self

    def show_top_features(self, n=10):
        """Display the ``n`` features with the highest score.

        Raises ``ValueError`` when called before ``fit``.
        """
        if self.feature_scores_ is None:
            raise ValueError("⚠️ Selector belum di-fit. Jalankan .fit(X, y) dulu.")
        print(f"Top {n} features that correlate well with target feature:")
        display(self.feature_scores_.head(n))

    def transform(self, X):
        """Return ``X`` reduced to the selected features as a DataFrame.

        The index of ``X`` is kept when it has one. Raises ``ValueError`` when
        called before ``fit``.
        """
        # self.selector is created in __init__ and is never None, so the fitted
        # state has to be detected through the attributes that fit() fills in.
        if self.selected_features_ is None:
            raise ValueError("⚠️ Selector belum di-fit. Jalankan .fit(X, y) dulu.")
        X_selected = self.selector.transform(X)
        return pd.DataFrame(
            X_selected,
            columns=self.selected_features_,
            index=getattr(X, "index", None),
        )

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the transformed ``X``."""
        self.fit(X, y)
        return self.transform(X)

#### Using chi2

In [108]:
# Select the 30 best features by chi2 score, fitted on the training split only
fs_chi2 = FeatureSelector(method=chi2, k=30)
fs_chi2.fit(X_train_scaled, y_train)

# Show the 10 best-scoring features
fs_chi2.show_top_features(n=10)

# Reduce train, validation and test to the selected features
X_train_chi2, X_val_chi2, app_test_chi2 = (
    fs_chi2.transform(frame)
    for frame in (X_train_scaled, X_val_scaled, final_app_test)
)

Top 10 features that correlate well with target feature:


Unnamed: 0,Features,Score
53,NAME_EDUCATION_TYPE_Higher education,595.293713
22,EXT_SOURCE_2,530.926053
8,REG_CITY_NOT_WORK_CITY,523.532429
37,CODE_GENDER,496.911558
7,REG_CITY_NOT_LIVE_CITY,462.802338
48,NAME_INCOME_TYPE_Pensioner,446.366594
52,NAME_INCOME_TYPE_Working,397.922596
78,OCCUPATION_TYPE_Laborers,394.546493
23,EXT_SOURCE_3,393.898856
86,OCCUPATION_TYPE_Unknown,258.9048


#### Using mutual_info_classif

In [109]:
# mutual_info_classif is stochastic; fixing random_state makes the scores and the
# selected feature set reproducible across runs.
fs_mi = FeatureSelector(method=partial(mutual_info_classif, random_state=50), k=30)

fs_mi.fit(X_train_scaled, y_train)

# Show the 10 best-scoring features
fs_mi.show_top_features(n=10)

# Transform the data (train, val, test)
X_train_mi = fs_mi.transform(X_train_scaled)
X_val_mi = fs_mi.transform(X_val_scaled)
app_test_mi = fs_mi.transform(final_app_test)

Top 10 features that correlate well with target feature:


Unnamed: 0,Features,Score
2,FLAG_CONT_MOBILE,0.048261
10,FLAG_DOCUMENT_3,0.045854
39,FLAG_OWN_REALTY,0.045131
45,NAME_TYPE_SUITE_Unaccompanied,0.044428
56,NAME_FAMILY_STATUS_Married,0.042998
60,NAME_HOUSING_TYPE_House / apartment,0.039292
52,NAME_INCOME_TYPE_Working,0.036213
0,FLAG_MOBIL,0.035024
71,EMERGENCYSTATE_MODE_Unknown,0.031298
20,REGION_RATING_CLIENT_W_CITY,0.020718


#### Oversampling to balance the Target

In [110]:
# Randomly oversample both training feature sets so the target classes are balanced
ros = RandomOverSampler(random_state=31)
X_train_upsampled_chi2, y_train_upsampled_chi2 = ros.fit_resample(X_train_chi2, y_train)
X_train_upsampled_mi, y_train_upsampled_mi = ros.fit_resample(X_train_mi, y_train)

# Report the resampled shape and class counts for each feature set
resampled_sets = (
    ('chi2', X_train_upsampled_chi2, y_train_upsampled_chi2),
    ('mi', X_train_upsampled_mi, y_train_upsampled_mi),
)
for tag, features, target in resampled_sets:
    print(f'Training Data shape ({tag}): ', features.shape)
    print(f'Target class ({tag}): ', target.value_counts())

Training Data shape (chi2):  (452288, 30)
Target class (chi2):  TARGET
0    226144
1    226144
Name: count, dtype: int64
Training Data shape (mi):  (452288, 30)
Target class (mi):  TARGET
0    226144
1    226144
Name: count, dtype: int64


In [114]:
# Save the train / validation / test artefacts as CSV files in the dataset directory.
# app_test_mi is written as well, so the mutual-information feature set has a test
# file just like the chi2 feature set.
outputs = {
    'X_train_upsampled_chi2.csv': X_train_upsampled_chi2,
    'y_train_upsampled_chi2.csv': y_train_upsampled_chi2,
    'X_train_upsampled_mi.csv': X_train_upsampled_mi,
    'y_train_upsampled_mi.csv': y_train_upsampled_mi,
    'X_val_chi2.csv': X_val_chi2,
    'X_val_mi.csv': X_val_mi,
    'y_val.csv': y_val,
    'app_test_chi2.csv': app_test_chi2,
    'app_test_mi.csv': app_test_mi,
    'app_test_id.csv': app_test_id,
}

for filename, data in outputs.items():
    data.to_csv(os.path.join(path, filename), index=False)