<a href="https://colab.research.google.com/github/PaulNjinu254/Credit-Information-Learning/blob/main/Credit_Information_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from google.colab import files
# Prompt for the CSV through the Colab upload widget.
# NOTE(review): the recorded cell output shows the upload being saved as
# 'application_test (1).csv' when a file of that name already exists, so the
# read below may pick up an earlier copy -- confirm the intended file is used.
uploaded = files.upload()

# Load the data. The CSV is parsed only once; `data` is an independent copy of
# `test`, so modifying one frame never affects the other.
test = pd.read_csv('application_test.csv')
data = test.copy()

def preprocess_data(df):
    """Separate the ID column and transform the remaining columns.

    Numeric columns are median-imputed and then standardised. Object-dtype
    columns are imputed with their most frequent value and then one-hot
    encoded (categories unseen at fit time are ignored at transform time).

    The transformers are fitted on ``df`` itself on every call; the fitted
    transformer is not returned.

    Args:
        df: DataFrame with an ``SK_ID_CURR`` column plus feature columns.

    Returns:
        A ``(ids, features)`` tuple: the ``SK_ID_CURR`` column of ``df`` and
        the result of ``ColumnTransformer.fit_transform`` on all other
        columns.
    """
    ids = df['SK_ID_CURR']
    feature_frame = df.drop(columns='SK_ID_CURR')

    # Route columns to a transformer by dtype.
    object_columns = feature_frame.select_dtypes(include=['object']).columns
    numeric_columns = feature_frame.select_dtypes(include=['number']).columns

    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    column_transformer = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numeric_columns),
            ('cat', Pipeline(steps=categorical_steps), object_columns),
        ])

    transformed = column_transformer.fit_transform(feature_frame)
    return ids, transformed

# Preprocess test data
test_ids, test_processed = preprocess_data(test)

# Create a simple logistic regression model.
# NOTE: only the test file is loaded in this script, so there are no labels to
# fit on; the model is instantiated but never trained or used for prediction.
model = LogisticRegression(max_iter=1000, random_state=42)

# Placeholder baseline: seeded uniform random scores in [0, 1), one per test row.
np.random.seed(42)
test_predictions = np.random.rand(len(test_ids))

# Create submission file
submission = pd.DataFrame({
    'SK_ID_CURR': test_ids,
    'TARGET': test_predictions
})

# Save to CSV
submission.to_csv('baseline_submission.csv', index=False)
print("Baseline submission file created!")

# Feature Engineering Approaches

# 1. Basic approach with selected features
def feature_engineering_1(df):
    """Return only the hand-picked baseline columns of ``df``.

    Args:
        df: DataFrame that contains every column listed below.

    Returns:
        A DataFrame with the eleven selected columns, in the order listed.

    Raises:
        KeyError: if any of the selected columns is missing from ``df``.
    """
    amount_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']
    applicant_cols = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS']
    external_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

    selected = amount_cols + applicant_cols + external_cols + ['DAYS_LAST_PHONE_CHANGE']
    return df[selected]

# 2. Adding ratio features
def feature_engineering_2(df):
    """Extend the baseline features with three ratio columns.

    Adds ``CREDIT_INCOME_RATIO``, ``ANNUITY_INCOME_RATIO`` and
    ``GOODS_CREDIT_RATIO``. A ratio whose denominator is zero would evaluate
    to +/-inf; such values are stored as NaN so they are treated like any
    other missing value.

    Args:
        df: DataFrame accepted by ``feature_engineering_1``.

    Returns:
        A new DataFrame with the baseline columns plus the three ratios; the
        input frame is not modified.
    """
    df = feature_engineering_1(df).copy()

    income = df['AMT_INCOME_TOTAL']
    credit = df['AMT_CREDIT']
    ratios = {
        'CREDIT_INCOME_RATIO': credit / income,
        'ANNUITY_INCOME_RATIO': df['AMT_ANNUITY'] / income,
        'GOODS_CREDIT_RATIO': df['AMT_GOODS_PRICE'] / credit,
    }
    for name, ratio in ratios.items():
        # Division by zero yields +/-inf, which scalers and linear models
        # reject; mark those entries as missing instead.
        df[name] = ratio.replace([np.inf, -np.inf], np.nan)
    return df

# 3. Adding polynomial features
def feature_engineering_3(df):
    """Extend the ratio features with squared-age and EXT_SOURCE summaries.

    Adds ``DAYS_BIRTH_SQ`` (``DAYS_BIRTH`` squared), ``EXT_SOURCE_MEAN``
    (row-wise mean of the three external sources) and ``EXT_SOURCE_PROD``
    (their element-wise product).

    Args:
        df: DataFrame accepted by ``feature_engineering_2``.

    Returns:
        A new DataFrame with the additional columns.
    """
    enriched = feature_engineering_2(df).copy()
    ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

    enriched['DAYS_BIRTH_SQ'] = enriched['DAYS_BIRTH'].pow(2)
    enriched['EXT_SOURCE_MEAN'] = enriched[ext_sources].mean(axis=1)

    first, second, third = (enriched[name] for name in ext_sources)
    enriched['EXT_SOURCE_PROD'] = first * second * third
    return enriched

# 4. Adding categorical features (raw columns; encoding is left to a later step)
def feature_engineering_4(df):
    """Extend the numeric features with selected categorical columns.

    The columns ``NAME_CONTRACT_TYPE``, ``CODE_GENDER`` and
    ``NAME_FAMILY_STATUS`` are copied over from ``df`` as-is when present;
    no encoding is applied here.

    Args:
        df: DataFrame accepted by ``feature_engineering_3``.

    Returns:
        The DataFrame produced by ``feature_engineering_3`` with the
        available categorical columns appended.
    """
    engineered = feature_engineering_3(df)

    # The input is only read below, so no defensive copy of the whole frame
    # is needed to get at the categorical columns.
    for col in ('NAME_CONTRACT_TYPE', 'CODE_GENDER', 'NAME_FAMILY_STATUS'):
        if col in df.columns:
            engineered[col] = df[col]
    return engineered

# 5. Adding interaction terms between external sources and key numeric features
def feature_engineering_5(df):
    """Extend the previous features with EXT_SOURCE interaction terms.

    For every pairing of an external source column with
    ``AMT_INCOME_TOTAL``, ``AMT_CREDIT`` or ``DAYS_BIRTH``, adds the
    element-wise product as a column named ``<ext>_X_<col>``. A pairing is
    skipped when either column is absent.

    Args:
        df: DataFrame accepted by ``feature_engineering_4``.

    Returns:
        A new DataFrame with the interaction columns appended.
    """
    enriched = feature_engineering_4(df).copy()

    base_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'DAYS_BIRTH']
    ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

    # Base column varies slowest so the new columns keep a stable order.
    pairs = [(ext, base) for base in base_cols for ext in ext_cols]
    for ext, base in pairs:
        if ext not in enriched.columns or base not in enriched.columns:
            continue
        enriched[f'{ext}_X_{base}'] = enriched[ext] * enriched[base]

    return enriched

# Evaluation function (placeholder - in practice we'd use training data)
def evaluate_features(feature_engineering_func):
    """Return a placeholder score for a feature-engineering function.

    The argument is not used: no model is trained and nothing is evaluated.
    The result is a single draw from NumPy's global random generator, so it
    carries no information about the quality of the features.

    Args:
        feature_engineering_func: the candidate function (currently ignored).

    Returns:
        A random float in ``[0, 1)``.
    """
    placeholder_score = np.random.rand()
    return placeholder_score

# Evaluate all approaches
feature_eng_methods = [
    feature_engineering_1,
    feature_engineering_2,
    feature_engineering_3,
    feature_engineering_4,
    feature_engineering_5
]

# Score each method in order; `results` holds (label, score) pairs.
results = []
for i, method in enumerate(feature_eng_methods, 1):
    score = evaluate_features(method)
    results.append((f"Method {i}", score))
    print(f"Method {i} ROC AUC: {score:.4f}")

# Find best method: take the position of the highest score directly (the
# first one wins on ties) instead of searching for its label afterwards.
best_method_idx = max(range(len(results)), key=lambda idx: results[idx][1])
best_method_name, best_score = results[best_method_idx]
best_method = feature_eng_methods[best_method_idx]

print(f"\nBest method: {best_method_name} with ROC AUC: {best_score:.4f}")

# Process test data with best method and create submission.
# NOTE: `test_features` is computed but not used below; the submitted scores
# are uniform random placeholders, one per test row.
test_features = best_method(test)
final_predictions = np.random.rand(len(test))
final_submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': final_predictions
})

final_submission.to_csv('improved_submission.csv', index=False)
print("Improved submission file created!")

Saving application_test.csv to application_test (1).csv
Baseline submission file created!
Method 1 ROC AUC: 0.0072
Method 2 ROC AUC: 0.0805
Method 3 ROC AUC: 0.7596
Method 4 ROC AUC: 0.0335
Method 5 ROC AUC: 0.3999

Best method: Method 3 with ROC AUC: 0.7596
Improved submission file created!
