<a href="https://colab.research.google.com/github/Remonah-3/Github_Assignment/blob/master/Credit_Information_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Load data
train = pd.read_csv("application_train.csv")

# Map binary categorical features to numeric
train['OWN_CAR_NUM'] = train['FLAG_OWN_CAR'].map({'N': 0, 'Y': 1})
train['OWN_REALTY_NUM'] = train['FLAG_OWN_REALTY'].map({'N': 0, 'Y': 1})

# Select numeric columns only
numeric_cols = train.select_dtypes(include='number').columns.tolist()
numeric_cols.remove('TARGET')

# Fill missing numeric values with 0
train[numeric_cols] = train[numeric_cols].fillna(0)

# Train-validation split
X = train[numeric_cols]
y = train['TARGET']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train baseline Logistic Regression model
model = LogisticRegression(solver='lbfgs', max_iter=3000, C=0.3, n_jobs=-1)
model.fit(X_train_scaled, y_train)

# Evaluate on validation set using ROC AUC
val_probs = model.predict_proba(X_val_scaled)[:, 1]
roc_auc = roc_auc_score(y_val, val_probs)
print("Baseline ROC AUC (validation):", roc_auc)


Baseline ROC AUC (validation): 0.722143184812277


In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Load data
train = pd.read_csv("application_train.csv")
test = pd.read_csv("application_test.csv")

# Map binary categorical features to numeric
for df in [train, test]:
    df['OWN_CAR_NUM'] = df['FLAG_OWN_CAR'].map({'N': 0, 'Y': 1})
    df['OWN_REALTY_NUM'] = df['FLAG_OWN_REALTY'].map({'N': 0, 'Y': 1})

# Select numeric columns only
numeric_cols = train.select_dtypes(include='number').columns.tolist()
numeric_cols.remove('TARGET')

# Fill missing numeric values with 0
train[numeric_cols] = train[numeric_cols].fillna(0)
test[numeric_cols] = test[numeric_cols].fillna(0)

# Train-validation split
X = train[numeric_cols]
y = train['TARGET']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional subsample for faster runtime in Colab
X_train, y_train = X_train[:20000], y_train[:20000]
X_val, y_val = X_val[:5000], y_val[:5000]

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test[numeric_cols])

# Train Logistic Regression
model = LogisticRegression(solver='lbfgs', max_iter=3000, C=0.3, n_jobs=-1)
model.fit(X_train_scaled, y_train)

# Evaluate on validation set
val_probs = model.predict_proba(X_val_scaled)[:, 1]
roc_auc = roc_auc_score(y_val, val_probs)
print(f"Baseline ROC AUC (validation): {roc_auc:.4f}")

# Predict on test set
test_probs = model.predict_proba(X_test_scaled)[:, 1]

# Create submission CSV
submission = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'], 'TARGET': test_probs})
submission.to_csv("submission_baseline.csv", index=False)
print("submission_baseline.csv created.")


Baseline ROC AUC (validation): 0.7139
submission_baseline.csv created.


In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score

train = pd.read_csv("application_train.csv")
test = pd.read_csv("application_test.csv")

# Preprocess
for df in [train, test]:
    df['OWN_CAR_NUM'] = df['FLAG_OWN_CAR'].map({'N': 0, 'Y': 1})
    df['OWN_REALTY_NUM'] = df['FLAG_OWN_REALTY'].map({'N': 0, 'Y': 1})
    numeric = df.select_dtypes(include='number').columns.tolist()
    df[numeric] = df[numeric].fillna(0)

# Base numeric features
numeric_cols = train.select_dtypes(include='number').columns.tolist()
if 'TARGET' in numeric_cols:
    numeric_cols.remove('TARGET')
corrs = train[numeric_cols + ['TARGET']].corr()['TARGET'].abs().drop('TARGET')
base_K = 30
base_features = corrs.sort_values(ascending=False).head(base_K).index.tolist()

for core in ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH']:
    if core in train.columns and core not in base_features:
        base_features.append(core)

patterns = {}
patterns['Pattern 1'] = base_features.copy()

# Pattern 2
train = pd.concat([train, pd.DataFrame({'DEBT_RATIO': train['AMT_CREDIT'] / (train['AMT_INCOME_TOTAL'] + 1)})], axis=1)
test = pd.concat([test, pd.DataFrame({'DEBT_RATIO': test['AMT_CREDIT'] / (test['AMT_INCOME_TOTAL'] + 1)})], axis=1)
patterns['Pattern 2'] = patterns['Pattern 1'] + ['DEBT_RATIO']

# Pattern 3
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    if col not in train.columns:
        train[col] = 0
        test[col] = 0
patterns['Pattern 3'] = patterns['Pattern 2'] + ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# Pattern 4
train = pd.concat([train, pd.DataFrame({'AGE_YEARS': train['DAYS_BIRTH'] / -365.25})], axis=1)
test = pd.concat([test, pd.DataFrame({'AGE_YEARS': test['DAYS_BIRTH'] / -365.25})], axis=1)
patterns['Pattern 4'] = patterns['Pattern 3'] + ['AGE_YEARS']

# Pattern 5
categorical = ['CODE_GENDER', 'NAME_CONTRACT_TYPE']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_ohe = pd.DataFrame(ohe.fit_transform(train[categorical]), columns=ohe.get_feature_names_out(), index=train.index)
test_ohe = pd.DataFrame(ohe.transform(test[categorical]), columns=ohe.get_feature_names_out(), index=test.index)
train = pd.concat([train.reset_index(drop=True), train_ohe.reset_index(drop=True)], axis=1)
test = pd.concat([test.reset_index(drop=True), test_ohe.reset_index(drop=True)], axis=1)
patterns['Pattern 5'] = patterns['Pattern 4'] + list(train_ohe.columns)

# Evaluate each pattern
results = {}
TRAIN_SAMPLE_SIZE = 90000
for name, feats in patterns.items():
    X = train[feats]
    y = train['TARGET']

    if len(X) > TRAIN_SAMPLE_SIZE:
        idx = train.sample(n=TRAIN_SAMPLE_SIZE, random_state=42).index
        X = X.loc[idx]
        y = y.loc[idx]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = LogisticRegression(solver='saga', penalty='l2', C=0.5, max_iter=1000, tol=1e-4, n_jobs=-1)
    model.fit(X_train_scaled, y_train)

    val_probs = model.predict_proba(X_val_scaled)[:, 1]
    roc = roc_auc_score(y_val, val_probs)
    results[name] = {'roc': roc, 'features': feats}
    print(f"{name}: ROC AUC = {roc:.4f}  (features: {len(feats)})")

# Select best pattern
best_name = max(results, key=lambda k: results[k]['roc'])
best_feats = results[best_name]['features']
print(f"\nBest pattern: {best_name}  =>  ROC AUC = {results[best_name]['roc']:.4f}")

# Train final model
FINAL_TRAIN_SIZE = 140000
X_full = train[best_feats]
y_full = train['TARGET']
if len(X_full) > FINAL_TRAIN_SIZE:
    idx = train.sample(n=FINAL_TRAIN_SIZE, random_state=7).index
    X_full = X_full.loc[idx]
    y_full = y_full.loc[idx]

scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)
X_test_final = test[best_feats].fillna(0)
X_test_scaled = scaler.transform(X_test_final)

final_model = LogisticRegression(solver='saga', penalty='l2', C=0.5, max_iter=1000, tol=1e-4, n_jobs=-1)
final_model.fit(X_full_scaled, y_full)

test_probs = final_model.predict_proba(X_test_scaled)[:, 1]
submission = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'], 'TARGET': test_probs})
submission.to_csv("submission_best_pattern.csv", index=False)
print("submission_best_pattern.csv created")




Pattern 1: ROC AUC = 0.7041  (features: 33)




Pattern 2: ROC AUC = 0.7045  (features: 34)




Pattern 3: ROC AUC = 0.7045  (features: 37)




Pattern 4: ROC AUC = 0.7045  (features: 38)




Pattern 5: ROC AUC = 0.7088  (features: 43)

Best pattern: Pattern 5  =>  ROC AUC = 0.7088
submission_best_pattern.csv created


