In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Reading data
train_df = pd.read_csv('credit_training.csv')
test_df = pd.read_csv('credit_testing.csv')

# Feature Engineering and Preprocessing for 'Age_Group' in Training Data
bins = [18, 30, 40, 50, 60, 100]
labels = ['18-30', '30-40', '40-50', '50-60', '60+']
train_df['Age_Group'] = pd.cut(train_df['AGE'], bins=bins, labels=labels)

# Label Encoding 'Age_Group' in Training Data
label_encoder = LabelEncoder()
train_df['Age_Group'] = label_encoder.fit_transform(train_df['Age_Group'])

# Feature Engineering and Preprocessing for 'Age_Group' in Test Data
test_df['Age_Group'] = pd.cut(test_df['AGE'], bins=bins, labels=labels)

# Label Encoding 'Age_Group' in Test Data using the same label encoder from training data
test_df['Age_Group'] = label_encoder.transform(test_df['Age_Group'])

# Model Training
X_train = train_df.drop(columns=['RESPONSE'])
y_train = train_df['RESPONSE']

gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

# Preparing Test Data
X_test = test_df.drop(columns=['OBS_ID'])  # Assuming 'OBS_ID' is the ID column in test data

# Ensure test data has the same columns as the training data after preprocessing
missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[X_train.columns]

# Predictions
gb_preds = gb.predict(X_test)
gb_preds_train = gb.predict(X_train)

# Evaluating the model
y_true = train_df['RESPONSE']  # Assuming 'RESPONSE' is the true label in the test data

accuracy = accuracy_score(y_true, gb_preds_train)
classification_report_result = classification_report(y_true, gb_preds_train)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_result)

# Output predictions to a CSV file
predictions_df = pd.DataFrame({'OBS_id': test_df['OBS_ID'], 'RESPONSE': gb_preds})
predictions_df.to_csv('sample_submission.csv', index=False)

# Display predictions DataFrame
print(predictions_df)

Accuracy: 0.9157142857142857
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.79      0.86       222
           1       0.91      0.97      0.94       478

    accuracy                           0.92       700
   macro avg       0.92      0.88      0.90       700
weighted avg       0.92      0.92      0.91       700

     OBS_id  RESPONSE
0       890         1
1       140         1
2       594         0
3       772         0
4       609         1
..      ...       ...
295     489         1
296     972         1
297     922         0
298      19         0
299     362         1

[300 rows x 2 columns]
