In [1]:
pip install gdown

Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable


In [2]:
pip install dask[dataframe]

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import os
from dotenv import load_dotenv
import gdown
import requests
import dask.dataframe as dd


# Load environment variables from the .env file so that the os.getenv()
# lookups for the Google Drive file IDs further down can resolve them.
load_dotenv()

# Build the Google Drive download URL for a given file ID
def get_google_drive_url(file_id):
    """Return the ``https://drive.google.com/uc?id=...`` URL for ``file_id``.

    The ID is interpolated as-is: it is neither validated nor URL-escaped,
    so a ``None`` ID yields a URL ending in ``id=None``.
    """
    return "https://drive.google.com/uc?id={}".format(file_id)

# Read each dataset's Google Drive file ID from the environment.
# A variable that is not set yields None for that dataset.
file_ids = {
    dataset: os.getenv(env_var)
    for dataset, env_var in (
        ("application_train", "APPLICATION_TRAIN_ID"),
        ("application_test", "APPLICATION_TEST_ID"),
        ("bureau", "BUREAU_ID"),
        ("bureau_balance", "BUREAU_BALANCE_ID"),
        ("credit_card_balance", "CREDIT_CARD_BALANCE_ID"),
        ("installments_payments", "INSTALLMENTS_PAYMENTS_ID"),
        ("previous_application", "PREVIOUS_APPLICATION_ID"),
        ("POS_CASH_balance", "POS_CASH_BALANCE_ID"),
    )
}

# Pair every dataset name with the download URL built from its file ID
google_drive_links = dict(
    zip(file_ids, map(get_google_drive_url, file_ids.values()))
)

# Download a file with gdown unless it is already present on disk
def download_csv(file_url, output_path):
    """Fetch ``file_url`` into ``output_path`` if that path does not exist yet.

    Parameters
    ----------
    file_url : str
        URL handed to ``gdown.download``.
    output_path : str
        Local destination; an existing file at this path is never re-downloaded.

    Returns
    -------
    None
        Progress and failures are reported on stdout only. Network errors
        raised as ``requests.exceptions.RequestException`` are caught and
        printed; any other exception propagates to the caller.
    """
    if os.path.exists(output_path):
        print(f"{output_path} already exists.")
        return

    print(f"Downloading {output_path}...")
    try:
        saved_path = gdown.download(file_url, output_path, quiet=False)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {file_url}: {e}")
        return

    # gdown can give up without raising (it then returns None), so confirm
    # that a file was really produced before reporting success.
    if saved_path is None or not os.path.exists(output_path):
        print(f"Error downloading {file_url}: no file was written to {output_path}")
        return

    print(f"Downloaded {output_path}")

# Local CSV file name for every dataset (relative to the working directory)
output_paths = dict(
    application_train="application_train.csv",
    application_test="application_test.csv",
    bureau="bureau.csv",
    bureau_balance="bureau_balance.csv",
    credit_card_balance="credit_card_balance.csv",
    installments_payments="installments_payments.csv",
    previous_application="previous_application.csv",
    POS_CASH_balance="POS_CASH_balance.csv",
)

# Fetch each dataset; download_csv skips files that are already on disk
for key in google_drive_links:
    file_url = google_drive_links[key]
    download_csv(file_url, output_paths[key])

# Load datasets from local files as Dask DataFrames.
# NOTE(review): on_bad_lines='skip' drops malformed CSV rows without any
# report, so row counts can be lower than the files suggest.
try:
    app_train = dd.read_csv(output_paths["application_train"], on_bad_lines='skip')
    app_test = dd.read_csv(output_paths["application_test"], on_bad_lines='skip')
    bureau = dd.read_csv(output_paths["bureau"], on_bad_lines='skip')
    bureau_balance = dd.read_csv(output_paths["bureau_balance"], on_bad_lines='skip')
    credit_card_balance = dd.read_csv(output_paths["credit_card_balance"], on_bad_lines='skip')
    installments_payments = dd.read_csv(output_paths["installments_payments"], on_bad_lines='skip')
    previous_application = dd.read_csv(output_paths["previous_application"], on_bad_lines='skip')
    POS_CASH_balance = dd.read_csv(output_paths["POS_CASH_balance"], on_bad_lines='skip')
except Exception as e:
    # Report the failure, then re-raise: continuing would either hit a
    # NameError on the next line or silently reuse variables left over
    # from an earlier kernel run.
    print(f"Error loading CSV files: {e}")
    raise

# Example: Print the first few rows of the application_train dataset
print(app_train.head())


application_train.csv already exists.
application_test.csv already exists.
bureau.csv already exists.
bureau_balance.csv already exists.
credit_card_balance.csv already exists.
installments_payments.csv already exists.
previous_application.csv already exists.
POS_CASH_balance.csv already exists.
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    1

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Shrink numeric columns to the smallest dtype that can hold their values
def reduce_memory_usage(df):
    """Downcast the integer and float columns of ``df`` and return the frame.

    A Dask DataFrame is first materialised with ``.compute()``. A pandas
    DataFrame is modified in place and the same object is returned.
    Columns that are neither integer nor float typed are left untouched.
    """
    if isinstance(df, dd.DataFrame):
        df = df.compute()  # materialise as a pandas DataFrame

    for name in df.columns:
        dtype = df[name].dtype
        if dtype == object:
            continue  # string columns are never downcast
        if pd.api.types.is_integer_dtype(dtype):
            target = 'integer'
        elif pd.api.types.is_float_dtype(dtype):
            target = 'float'
        else:
            continue
        df[name] = pd.to_numeric(df[name], downcast=target)
    return df

# Load credit_card_balance using Dask.
# assume_missing=True makes Dask read integer-looking columns as floats so
# that partitions containing missing values do not break dtype inference.
credit_card_balance = dd.read_csv('credit_card_balance.csv', assume_missing=True)

# Reduce memory usage for app_train and app_test (also converts them to pandas)
app_train = reduce_memory_usage(app_train)
app_test = reduce_memory_usage(app_test)

# Collapse credit_card_balance to ONE row per SK_ID_CURR before merging.
# Joining the raw rows would repeat an applicant once per balance row whenever
# the table holds several rows for the same SK_ID_CURR, inflating the training
# set and letting copies of one applicant land on both sides of the
# train/test split. The aggregation runs in Dask, so only the small
# per-applicant summary is materialised.
# NOTE(review): mean balance / max days-past-due are a modelling choice — adjust if needed.
columns_to_merge = ['SK_ID_CURR', 'AMT_BALANCE', 'SK_DPD']
credit_card_balance_selected = (
    credit_card_balance[columns_to_merge]
    .groupby('SK_ID_CURR')
    .agg({'AMT_BALANCE': 'mean', 'SK_DPD': 'max'})
    .reset_index()
    .compute()
)

# Cast SK_ID_CURR to int64 to avoid type mismatches during merging
app_train['SK_ID_CURR'] = app_train['SK_ID_CURR'].astype('int64')
app_test['SK_ID_CURR'] = app_test['SK_ID_CURR'].astype('int64')
credit_card_balance_selected['SK_ID_CURR'] = credit_card_balance_selected['SK_ID_CURR'].astype('int64')

# Merge on SK_ID_CURR; validate= makes pandas raise if the right-hand side
# ever contains duplicate keys, i.e. if the join could multiply rows.
app_train = app_train.merge(
    credit_card_balance_selected, on='SK_ID_CURR', how='left', validate='many_to_one'
)
app_test = app_test.merge(
    credit_card_balance_selected, on='SK_ID_CURR', how='left', validate='many_to_one'
)

# Combine datasets for processing
merged_data = pd.concat([app_train, app_test], axis=0, ignore_index=True)

# Candidate model inputs
input_parameters = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 'CNT_CHILDREN',
    'FLAG_OWN_CAR', 'CODE_GENDER', 'DAYS_CREDIT', 'DAYS_DECISION', 'AMT_PAYMENT',
    'AMT_INSTALMENT', 'AMT_APPLICATION', 'NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE',
    'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE'
]

# Add missing input parameters with NaN so every name exists in merged_data
for col in input_parameters:
    if col not in merged_data.columns:
        merged_data[col] = np.nan

# Inputs without a single observed value carry no information. They must stay
# out of the imputer (SimpleImputer drops all-NaN columns, which makes the
# assignment back into the frame fail on a shape mismatch) and out of the
# feature matrix.
empty_inputs = [col for col in input_parameters if merged_data[col].isna().all()]
if empty_inputs:
    print(f"Input parameters with no data (excluded from the features): {empty_inputs}")

# Convert FLAG_OWN_CAR from 'Y'/'N' to 1/0
merged_data['FLAG_OWN_CAR'] = merged_data['FLAG_OWN_CAR'].map({'Y': 1, 'N': 0})

# Separate numeric and categorical columns.
# TARGET is the label: imputing it would invent fractional labels for rows
# that have none. SK_ID_CURR is an identifier, not a measurement.
not_imputed = {'TARGET', 'SK_ID_CURR'}
numeric_columns = [
    col for col in merged_data.select_dtypes(include='number').columns
    if col not in not_imputed and merged_data[col].notna().any()
]
categorical_columns = merged_data.select_dtypes(include='object').columns.tolist()

# Impute missing values
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

merged_data[numeric_columns] = numeric_imputer.fit_transform(merged_data[numeric_columns])

for col in categorical_columns:
    # fit_transform returns an (n, 1) array; flatten it for the column assignment
    merged_data[col] = categorical_imputer.fit_transform(merged_data[[col]]).ravel()

# One-hot encode categorical columns.
# get_dummies REMOVES the original columns, so remember which columns it adds;
# dtype=float keeps the feature matrix purely numeric.
categorical_columns_to_encode = ['CODE_GENDER', 'NAME_CONTRACT_TYPE', 'NAME_FAMILY_STATUS',
                                  'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE']
columns_before_encoding = set(merged_data.columns)
merged_data = pd.get_dummies(
    merged_data, columns=categorical_columns_to_encode, drop_first=True, dtype=float
)
encoded_columns = [col for col in merged_data.columns if col not in columns_before_encoding]

# Scale numerical features
numerical_columns = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY',
    'SK_DPD', 'CNT_CHILDREN', 'DAYS_CREDIT', 'DAYS_DECISION',
    'AMT_PAYMENT', 'AMT_INSTALMENT', 'AMT_APPLICATION'
]

existing_numerical_columns = [
    col for col in numerical_columns
    if col in merged_data.columns and col not in empty_inputs
]
scaler = MinMaxScaler()
merged_data[existing_numerical_columns] = scaler.fit_transform(merged_data[existing_numerical_columns])

# Final feature list: the surviving raw inputs plus the one-hot columns that
# replaced the categorical inputs.
feature_columns = [
    col for col in input_parameters
    if col in merged_data.columns and col not in empty_inputs
] + encoded_columns

# Prepare training data
if 'TARGET' not in merged_data.columns:
    raise ValueError("TARGET column is missing from the dataset.")

# Only rows that actually carry a label can be used for supervised training
# (presumably this removes the app_test rows — confirm).
labelled_rows = merged_data['TARGET'].notna()
training_data = merged_data.loc[labelled_rows, feature_columns + ['TARGET']]

X = training_data[feature_columns]
y = training_data['TARGET'].astype(int)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the first 10 rows of training data
print(training_data.head(10))


In [None]:
pip install pandas scikit-learn xgboost catboost lightgbm matplotlib seaborn tensorflow

In [None]:
# Import necessary libraries for models and preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, auc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Build the fully connected network used for binary classification
def create_dnn_model(input_dim):
    """Return a compiled Keras ``Sequential`` binary classifier.

    The network takes ``input_dim`` features, passes them through two ReLU
    layers (64 and 32 units) and ends in one sigmoid unit. It is compiled
    with the Adam optimizer, binary cross-entropy loss and an accuracy metric.
    """
    hidden_units = (64, 32)

    layers = [Input(shape=(input_dim,))]  # explicit input layer
    layers.extend(Dense(units, activation='relu') for units in hidden_units)
    layers.append(Dense(1, activation='sigmoid'))  # single probability output

    model = Sequential(layers)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap a classifier in the shared preprocessing: mean imputation, optionally
# followed by standardisation. Step names are 'imputer', 'scaler', 'classifier'.
def _with_preprocessing(classifier, scale=False):
    """Return a Pipeline of imputer -> (optional scaler) -> ``classifier``."""
    steps = [('imputer', SimpleImputer(strategy='mean'))]
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('classifier', classifier))
    return Pipeline(steps)


# Models to compare, keyed by display name. Every entry except 'DNN' is a
# scikit-learn Pipeline; scaling is applied only where the estimator needs it.
models = {
    'Neural Network': _with_preprocessing(MLPClassifier(max_iter=500), scale=True),
    'Random Forest': _with_preprocessing(RandomForestClassifier()),
    'Logistic Regression': _with_preprocessing(LogisticRegression(max_iter=500), scale=True),
    'XGBoost': _with_preprocessing(
        XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    ),
    'CatBoost': _with_preprocessing(CatBoostClassifier(silent=True)),
    'LightGBM': _with_preprocessing(LGBMClassifier()),
    # SVM performs better with scaled data; probability=True enables the
    # probability estimates needed for AUC-ROC
    'SVM': _with_preprocessing(SVC(probability=True), scale=True),
    # KNN benefits from scaling
    'K-Nearest Neighbors': _with_preprocessing(KNeighborsClassifier(), scale=True),
    # Naive Bayes does not need scaling
    'Naive Bayes': _with_preprocessing(GaussianNB()),
    'AdaBoost': _with_preprocessing(AdaBoostClassifier()),
    'Decision Tree': _with_preprocessing(DecisionTreeClassifier()),
    # Plain Keras model, not a Pipeline: it receives X_train exactly as prepared earlier
    'DNN': create_dnn_model(input_dim=X_train.shape[1]),
}

# Step 6: Model Building and Training
# For every model: fit on the training split, predict the test split and
# store accuracy, F1, confusion matrix and ROC data in `results`.
results = {}

for model_name, model in models.items():
    # Positive-class scores for the ROC curve; stays None when unavailable
    y_score = None

    if model_name == 'DNN':
        # Keras model: predict() returns an (n, 1) array of probabilities.
        # Run inference once, flatten it, and derive the class labels from it.
        model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)
        y_score = model.predict(X_test).ravel()
        y_pred = (y_score > 0.5).astype(int)  # Convert probabilities to binary class (0 or 1)
    else:
        model.fit(X_train, y_train)  # Train the model
        y_pred = model.predict(X_test)  # Make predictions
        if hasattr(model.named_steps['classifier'], 'predict_proba'):
            y_score = model.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # AUC-ROC curve (None entries mark models without probability estimates)
    if y_score is not None:
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)
    else:
        roc_auc, fpr, tpr = None, None, None

    # Store results
    results[model_name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        'confusion_matrix': cm,
        'roc_auc': roc_auc,
        'fpr': fpr,
        'tpr': tpr
    }

    # Print results
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}\n")


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Step 7: Plotting AUC-ROC Curves
# One curve per model, plus the diagonal of a random classifier for reference.
plt.figure(figsize=(10, 8))

for model_name, metrics in results.items():
    # Models without probability estimates are stored with None ROC data;
    # formatting None with ':.2f' would raise a TypeError, so skip them.
    if metrics['roc_auc'] is None:
        continue
    plt.plot(metrics['fpr'], metrics['tpr'], label=f'{model_name} (AUC = {metrics["roc_auc"]:.2f})')

plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()