In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Melihat jumlah data yang memiliki target 0 dan 1 atau melihat client yang sulit melakukan pembayaran 
data=pd.read_csv("application_train.csv")
print(data['TARGET'].value_counts())

# Load dataset
df = pd.read_csv("application_test.csv")

# memilih kolom yang akan digunakan untuk prediksi
selected_columns = [
    "SK_ID_CURR", "NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "CNT_CHILDREN",
    "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE", "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "DAYS_BIRTH", "DAYS_EMPLOYED",
    "REGION_RATING_CLIENT", "REGION_RATING_CLIENT_W_CITY", "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"
]
df = df[selected_columns]


# Mengisi nilai kosong dengan median
df.fillna(df.select_dtypes(include=[np.number]).median(numeric_only=True), inplace=True)


# Mengubah data kategorikal menjadi numerik 
categorical_cols = df.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = encoder.fit_transform(df[col])

# Mendefenisikan fitur dan target 
X = df.drop(columns=["SK_ID_CURR"])
y = (df["AMT_CREDIT"] > df["AMT_INCOME_TOTAL"].median()).astype(int)  # Example target

# Membagi data Training dan Testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalisasi data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


TARGET
0    282686
1     24825
Name: count, dtype: int64
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       972
           1       1.00      1.00      1.00      8777

    accuracy                           1.00      9749
   macro avg       1.00      1.00      1.00      9749
weighted avg       1.00      1.00      1.00      9749

