In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the raw credit records into a DataFrame. The notebook expects a file
# named 'credit_data.csv' to be reachable from the current working directory.
credit_data = pd.read_csv(filepath_or_buffer='credit_data.csv')


In [None]:
# Handle missing values (if any).
# Rebinding (instead of inplace=True) keeps the step chain-friendly and avoids
# mutating a frame that an earlier cell may already have displayed.
credit_data = credit_data.dropna()

# Feature engineering: debt-to-income ratio.
# A zero Income produces +/-inf (or NaN for 0/0). StandardScaler raises on
# infinite values, so non-finite ratios are converted to NaN and those rows are
# dropped rather than letting the notebook fail further down.
debt_income_ratio = (
    credit_data['Outstanding_Debts'] / credit_data['Income']
).replace([float('inf'), float('-inf')], float('nan'))
credit_data = (
    credit_data
    .assign(Debt_Income_Ratio=debt_income_ratio)
    .dropna(subset=['Debt_Income_Ratio'])
)

# Select features and target variable
features = credit_data[['Age', 'Income', 'Credit_Score', 'Debt_Income_Ratio']]
target = credit_data['Default']

# Split the data into training and testing sets BEFORE scaling, so that the
# scaler's mean/variance are estimated from training rows only. Fitting the
# scaler on the full dataset would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Standardize features: fit on the training split, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-derived statistics; kept under
# its original name because later cells consume `features_scaled`.
features_scaled = scaler.transform(features)


In [None]:
# Train a Random Forest with default hyperparameters (seeded so results are
# repeatable). fit() returns the estimator, so construction and training chain.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out split and score those predictions.
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
print('Accuracy:', accuracy)
print('Classification Report:', classification_report(y_test, predictions), sep='\n')


In [None]:
# Feature Importance
# Pair each importance value from the fitted forest with its column name.
feature_importances = pd.Series(clf.feature_importances_, index=features.columns)

# Horizontal bar chart of the (up to) ten largest importances.
# The title is set on the Axes object returned by pandas: `plt` is not imported
# in this notebook, so the previous plt.title()/plt.show() calls raised
# NameError. The notebook's inline backend renders the figure when the cell
# finishes; the trailing ';' suppresses the text repr of the last expression.
ax = feature_importances.nlargest(10).plot(kind='barh')
ax.set_title('Top 10 Important Features for Default Prediction');


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled feature matrix with degree-2 polynomial terms.
# The transformer is fitted first and then applied to the same matrix, which
# is equivalent to a single fit_transform call; `poly` stays available for reuse.
poly = PolynomialFeatures(degree=2).fit(features_scaled)
X_poly = poly.transform(features_scaled)


In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE for oversampling -- on the TRAINING split only.
# Resampling the full dataset would synthesize minority samples from rows that
# also end up in the test set, leaking test information into training and
# inflating evaluation scores. The test split must keep its original class
# distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates: 3 x 3 x 3 = 27 combinations in total.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid, scoring each combination with 5-fold
# cross-validation on the training split.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(base_forest, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and the settings that produced it.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Base learners for the ensemble, all seeded identically. The SVC is built with
# probability=True so it can supply class probabilities for soft voting.
rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
svc = SVC(probability=True, random_state=42)

# Soft-voting ensemble over the three learners, trained on the training split.
# fit() returns the estimator, so building and fitting are chained.
base_estimators = [('rf', rf), ('lr', lr), ('svc', svc)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft').fit(X_train, y_train)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the Random Forest over the full scaled feature
# matrix; one score is returned per fold.
cv_scores = cross_val_score(estimator=clf, X=features_scaled, y=target, cv=5)

# Show the individual fold scores and their average.
print('Cross-Validation Scores:', cv_scores)
print('Mean CV Score:', cv_scores.mean())
