<a href="https://colab.research.google.com/github/Remonah-3/Github_Assignment/blob/master/Credit_Information_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

train = pd.read_csv("application_train.csv")

# Map binary categorical features to numeric
train['OWN_CAR_NUM'] = train['FLAG_OWN_CAR'].map({'N': 0, 'Y': 1})
train['OWN_REALTY_NUM'] = train['FLAG_OWN_REALTY'].map({'N': 0, 'Y': 1})

# Select numeric columns only
numeric_cols = train.select_dtypes(include='number').columns.tolist()
numeric_cols.remove('TARGET')

# Fill missing values with 0
train[numeric_cols] = train[numeric_cols].fillna(0)

# Train-validation split
X = train[numeric_cols]
y = train['TARGET']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train baseline Logistic Regression model
model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

# Evaluate on validation set using ROC AUC
val_probs = model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, val_probs)
print("Baseline ROC AUC (validation):", roc_auc)


Baseline ROC AUC (validation): 0.6274084032846746


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

train = pd.read_csv("application_train.csv")
test = pd.read_csv("application_test.csv")  # test set for submission

# Map binary categorical features to numeric
train['OWN_CAR_NUM'] = train['FLAG_OWN_CAR'].map({'N': 0, 'Y': 1})
train['OWN_REALTY_NUM'] = train['FLAG_OWN_REALTY'].map({'N': 0, 'Y': 1})
test['OWN_CAR_NUM'] = test['FLAG_OWN_CAR'].map({'N': 0, 'Y': 1})
test['OWN_REALTY_NUM'] = test['FLAG_OWN_REALTY'].map({'N': 0, 'Y': 1})

numeric_cols = train.select_dtypes(include='number').columns.tolist()
numeric_cols.remove('TARGET')

# Fill missing values with 0
train[numeric_cols] = train[numeric_cols].fillna(0)
test[numeric_cols] = test[numeric_cols].fillna(0)

# Train-validation split
X = train[numeric_cols]
y = train['TARGET']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train baseline Logistic Regression model
model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

# Evaluate on validation set (optional, just for feedback)
val_probs = model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, val_probs)
print("Baseline ROC AUC (validation):", roc_auc)

# Predict probabilities on test set
X_test = test[numeric_cols]
test_probs = model.predict_proba(X_test)[:, 1]

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Baseline ROC AUC (validation): 0.6274084032846746
Submission file created: baseline_submission.csv


In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

train = pd.read_csv("application_train.csv")
test = pd.read_csv("application_test.csv")

# preprocessing
def preprocess(df):
    df = df.copy()

    df['OWN_CAR_NUM'] = df['FLAG_OWN_CAR'].map({'N': 0, 'Y': 1})
    df['OWN_REALTY_NUM'] = df['FLAG_OWN_REALTY'].map({'N': 0, 'Y': 1})

    # Fill numeric missing values with 0
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    for col in numeric_cols:
        df[col] = df[col].fillna(0)
    return df

train = preprocess(train)
test = preprocess(test)

# feature engineering patterns
patterns = {}

# Pattern 1: Baseline numeric features
numeric_cols = train.select_dtypes(include='number').columns.tolist()
numeric_cols.remove('TARGET')
patterns['Pattern 1'] = numeric_cols

# Pattern 2: Add debt ratio (AMT_CREDIT / AMT_INCOME_TOTAL)
train['DEBT_RATIO'] = train['AMT_CREDIT'] / (train['AMT_INCOME_TOTAL'] + 1)
test['DEBT_RATIO'] = test['AMT_CREDIT'] / (test['AMT_INCOME_TOTAL'] + 1)
patterns['Pattern 2'] = numeric_cols + ['DEBT_RATIO']

# Pattern 3: Add external sources
for col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)
patterns['Pattern 3'] = patterns['Pattern 2'] + ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# Pattern 4: Add age in years
train['AGE_YEARS'] = train['DAYS_BIRTH'] / -365
test['AGE_YEARS'] = test['DAYS_BIRTH'] / -365
patterns['Pattern 4'] = patterns['Pattern 3'] + ['AGE_YEARS']

# Pattern 5: Include one-hot encoding for selected categorical features
categorical_features = ['CODE_GENDER', 'NAME_CONTRACT_TYPE']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_ohe = pd.DataFrame(ohe.fit_transform(train[categorical_features]), columns=ohe.get_feature_names_out())
test_ohe = pd.DataFrame(ohe.transform(test[categorical_features]), columns=ohe.get_feature_names_out())
train = pd.concat([train.reset_index(drop=True), train_ohe], axis=1)
test = pd.concat([test.reset_index(drop=True), test_ohe], axis=1)
patterns['Pattern 5'] = patterns['Pattern 4'] + list(train_ohe.columns)

# Train and validate each pattern
results = {}
for name, features in patterns.items():
    X = train[features]
    y = train['TARGET']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression(max_iter=5000)
    model.fit(X_train, y_train)

    val_probs = model.predict_proba(X_val)[:, 1]
    roc_auc = roc_auc_score(y_val, val_probs)
    results[name] = roc_auc
    print(f"{name} - Validation ROC AUC: {roc_auc:.4f}")

# Select best pattern
best_pattern = max(results, key=results.get)
print(f"\nBest performing pattern: {best_pattern} with ROC AUC: {results[best_pattern]:.4f}")
best_features = patterns[best_pattern]

# Train model on full training set using best pattern
final_model = LogisticRegression(max_iter=1000)
final_model.fit(train[best_features], train['TARGET'])

# Predict on test set
test_probs = final_model.predict_proba(test[best_features])[:, 1]

  train['DEBT_RATIO'] = train['AMT_CREDIT'] / (train['AMT_INCOME_TOTAL'] + 1)
  test['DEBT_RATIO'] = test['AMT_CREDIT'] / (test['AMT_INCOME_TOTAL'] + 1)
  train['AGE_YEARS'] = train['DAYS_BIRTH'] / -365
  test['AGE_YEARS'] = test['DAYS_BIRTH'] / -365
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pattern 1 - Validation ROC AUC: 0.6274


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pattern 2 - Validation ROC AUC: 0.6274


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pattern 3 - Validation ROC AUC: 0.6279


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pattern 4 - Validation ROC AUC: 0.6280


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pattern 5 - Validation ROC AUC: 0.6282

Best performing pattern: Pattern 5 with ROC AUC: 0.6282


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Submission file created: feature_engineered_submission.csv
