In [131]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file path found beneath the Kaggle input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/debt-default-prediction/X_test.csv
/kaggle/input/debt-default-prediction/DataDictionary.xlsx
/kaggle/input/debt-default-prediction/valid.csv
/kaggle/input/debt-default-prediction/train.csv


# The training dataset is read, and rows with a missing target value (`loan_status`) are removed.

In [None]:
# Load the training split and discard rows whose target (`loan_status`) is
# missing. The index is reset after the drop so that the features and the
# target share a clean 0..n-1 index regardless of which rows were removed.
train_labeled = (
    pd.read_csv('/kaggle/input/debt-default-prediction/train.csv')
    .dropna(axis=0, subset=['loan_status'])
    .reset_index(drop=True)
)

# Target vector.
y_train = train_labeled['loan_status']

# Feature matrix: every column except the target. `drop` returns a new frame,
# so `train_labeled` itself is left untouched.
X_train = train_labeled.drop(columns=['loan_status'])


In [None]:
# Show the frame dimensions, then the per-column null counts restricted to
# the columns that actually contain nulls.
print(X_train.shape)
missing_val_count_by_column = X_train.isna().sum()
print(missing_val_count_by_column.loc[lambda counts: counts > 0])

In [None]:
# Number of columns whose null count is greater than 50,000.
num_cols_with_missing = missing_val_count_by_column.gt(50000).sum()
num_cols_with_missing

In [None]:
# Flag every column whose null count exceeds 50,000.
too_sparse = missing_val_count_by_column.gt(50000)

# Keep only the flagged names that are still present in X_train, so running
# this cell a second time does not fail on already-removed columns.
cols_to_drop = [
    name
    for name in missing_val_count_by_column.index[too_sparse.to_numpy()]
    if name in X_train.columns
]

# Remove the flagged columns from X_train in place and report the new shape.
X_train.drop(columns=cols_to_drop, inplace=True)
print(X_train.shape)

In [None]:
# Load the validation split and discard rows whose target (`loan_status`) is
# missing. The index is reset after the drop so that the features and the
# target share a clean 0..n-1 index regardless of which rows were removed.
valid_labeled = (
    pd.read_csv('/kaggle/input/debt-default-prediction/valid.csv')
    .dropna(axis=0, subset=['loan_status'])
    .reset_index(drop=True)
)

# Target vector for validation.
y_valid = valid_labeled['loan_status']

# Restrict the validation features to exactly the columns (and column order)
# currently kept in X_train. The target is not among those columns, so this
# selection also removes it. A missing column raises KeyError.
train_columns = X_train.columns
X_valid = valid_labeled[train_columns]
X_valid.shape

In [None]:
# Read the unlabeled test split and keep only the columns retained for
# training, in the same order.
X_test = pd.read_csv(
    '/kaggle/input/debt-default-prediction/X_test.csv'
).loc[:, train_columns]
X_test.shape

* **Let us handle the remaining missing values with imputation: numerical columns are filled with the column mean, and object (categorical) columns with the most frequent value.**

In [None]:
# Number of distinct non-null values per training column (NaN is not counted).
unique_values_counts = X_train.nunique()

# A column with at most one distinct non-null value carries no usable signal:
#   * exactly one value -> constant once missing entries are imputed;
#   * zero values       -> the column is entirely NaN in the training data,
#                          so there is nothing to learn an imputation from.
# Using `<= 1` (rather than `== 1`) also removes the all-NaN case.
columns_with_same_value = unique_values_counts[unique_values_counts <= 1].index.tolist()

# Drop those columns from all three splits so they keep identical schemas,
# then report the resulting shapes.
X_train = X_train.drop(columns=columns_with_same_value)
X_valid = X_valid.drop(columns=columns_with_same_value)
X_test = X_test.drop(columns=columns_with_same_value)
print(X_train.shape, X_valid.shape, X_test.shape)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Split the feature columns by dtype, judged on the training frame.
# NOTE(review): columns whose dtype is neither int64/float64 nor object
# (e.g. bool) end up in neither list and are therefore absent from the
# frames rebuilt below - confirm that this is intended.
numerical_cols = [cname for cname in X_train.columns if
                  X_train[cname].dtype in ['int64', 'float64']]

categorical_cols = [cname for cname in X_train.columns if
                    X_train[cname].dtype == 'object']


def _impute_columns(imputer, columns, train_frame, *other_frames):
    """Fit ``imputer`` on the training columns and impute every frame.

    Parameters
    ----------
    imputer : object with ``fit`` / ``transform`` (e.g. ``SimpleImputer``)
        Fitted in place on ``train_frame[columns]`` only, so statistics are
        learned from the training data and merely applied to the others.
    columns : list of str
        Columns to impute; must exist in every frame.
    train_frame : pandas.DataFrame
        Frame used for fitting (also transformed).
    *other_frames : pandas.DataFrame
        Additional frames transformed with the fitted imputer.

    Returns
    -------
    list of pandas.DataFrame
        One imputed frame per input (training frame first), each restricted
        to ``columns`` and carrying the row index of its source frame.
    """
    frames = (train_frame, *other_frames)
    if not columns:
        # Nothing to impute: return column-less frames that still carry the
        # right index so a later column-wise concat lines up.
        return [pd.DataFrame(index=frame.index) for frame in frames]
    imputer.fit(train_frame[columns])
    return [
        # The imputer returns a bare array; re-attach column names and the
        # original index so rows stay label-aligned with the targets.
        pd.DataFrame(imputer.transform(frame[columns]),
                     columns=columns, index=frame.index)
        for frame in frames
    ]


# Numerical columns: fill gaps with the training-set column mean.
numerical_imputer = SimpleImputer(strategy='mean')
X_train_numerical, X_valid_numerical, X_test_numerical = _impute_columns(
    numerical_imputer, numerical_cols, X_train, X_valid, X_test
)

# Categorical columns: fill gaps with the most frequent training-set value.
categorical_imputer = SimpleImputer(strategy='most_frequent')
X_train_categorical, X_valid_categorical, X_test_categorical = _impute_columns(
    categorical_imputer, categorical_cols, X_train, X_valid, X_test
)

# Recombine: numerical columns first, then categorical columns.
X_train = pd.concat([X_train_numerical, X_train_categorical], axis=1)
X_valid = pd.concat([X_valid_numerical, X_valid_categorical], axis=1)
X_test = pd.concat([X_test_numerical, X_test_categorical], axis=1)



In [None]:
# Re-check null counts on the training and validation frames; each print
# lists only columns that still contain nulls. Finish with both shapes.
missing_val_count_by_column = X_train.isna().sum()
print(missing_val_count_by_column.loc[lambda counts: counts > 0])
missing_val_count_by_column_valid = X_valid.isna().sum()
print(missing_val_count_by_column_valid.loc[lambda counts: counts > 0])
print(X_train.shape, X_valid.shape)

In [None]:
# Preview the first rows of the training columns listed in categorical_cols.
X_train[categorical_cols].head()

In [None]:
#X_train['emp_length'].unique()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order for sub_grade: A1..A5, B1..B5, ..., G1..G5.
# The encoder assigns integer codes following the position in this list.
custom_order_subgrade = [
    f"{letter}{level}" for letter in "ABCDEFG" for level in range(1, 6)
]
ordinal_encoder = OrdinalEncoder(categories=[custom_order_subgrade])

# Fit on the training column and overwrite it with its encoded values.
X_train['sub_grade'] = ordinal_encoder.fit_transform(X_train[['sub_grade']])


In [None]:
# Encode sub_grade in the validation and test frames with the existing
# `ordinal_encoder` (transform only - no re-fitting).
for frame in (X_valid, X_test):
    frame['sub_grade'] = ordinal_encoder.transform(frame[['sub_grade']])

# custom_order_emp_length = ['< 1 year', '1 year', '2 years', '3 years','4 years',  '5 years','6 years', '7 years','8 years','9 years','10+ years']
# ordinal_encoder2 = OrdinalEncoder(categories=[custom_order_emp_length])
# X_train['emp_length'] = ordinal_encoder2.fit_transform(X_train[['emp_length']])
# X_valid['emp_length'] = ordinal_encoder2.transform(X_valid[['emp_length']])
# X_test['emp_length'] = ordinal_encoder2.transform(X_test[['emp_length']])
print('done')

In [None]:
# Preview the first values of the validation sub_grade column.
X_valid['sub_grade'].head()

In [None]:

# Build copies of the three splits without the `grade` column.
# NOTE(review): presumably dropped because `sub_grade` (kept, already
# encoded) carries the same information at finer granularity - confirm.
X_train_modified, X_valid_modified, X_test_modified = (
    frame.drop(columns=['grade']) for frame in (X_train, X_valid, X_test)
)


In [None]:
# Display the (rows, columns) shape of the validation frame after `grade` was dropped.
X_valid_modified.shape

In [None]:
# Collects every categorical column in which the validation split contains
# at least one value that never occurs in the training split.
columns_with_different_unique_values = []

# Object-dtype columns of the training frame are treated as categorical.
categorical_cols = [
    name for name, dtype in X_train_modified.dtypes.items() if dtype == 'object'
]

for col in categorical_cols:
    train_levels = set(X_train_modified[col].unique())
    valid_levels = set(X_valid_modified[col].unique())

    # Nothing to report when every validation value is known from training.
    if valid_levels.issubset(train_levels):
        continue

    columns_with_different_unique_values.append(col)
    # Column name followed by the distinct-value counts in train and valid.
    print(col, X_train_modified[col].nunique(), X_valid_modified[col].nunique())


# Summary of the flagged columns.
print("Features with unequal unique values between X_train and X_valid:")
print(columns_with_different_unique_values)

In [None]:
def _only_known_labels(col):
    """Return True when every value of ``col`` in the validation and test
    frames also occurs in the training frame.

    The training label set is built once and reused for both comparisons.
    """
    known = set(X_train_modified[col])
    return (set(X_valid_modified[col]).issubset(known)
            and set(X_test_modified[col]).issubset(known))


# Columns that are safe to encode with an encoder fitted on the training data.
good_label_cols = [col for col in categorical_cols if _only_known_labels(col)]

# Remaining categorical columns. Built with a comprehension (not a set
# difference) so the order follows `categorical_cols` and is identical on
# every run.
bad_label_cols = [col for col in categorical_cols if col not in good_label_cols]

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

The "bad" label columns are dropped because, in the validation or test set, they contain labels that never appear in the training set, so an encoder fitted on the training data could not transform them.

In [None]:
# Remove the columns listed in `bad_label_cols` from each split, producing
# the frames that go into categorical encoding.
X_train_pre_encoding, X_valid_pre_encoding, X_test_pre_encoding = (
    frame.drop(columns=bad_label_cols)
    for frame in (X_train_modified, X_valid_modified, X_test_modified)
)

In [None]:
# Sanity check: True when train and validation contain exactly the same set of `term` values.
set(X_train_pre_encoding['term'].unique()) == set(X_valid_pre_encoding['term'].unique())

In [None]:
# Distinct `term` values present in the validation frame.
X_valid_pre_encoding['term'].unique()

In [None]:
# Distinct `term` values present in the training frame.
X_train_pre_encoding['term'].unique()

In [None]:
# Distinct `term` values present in the test frame.
X_test_pre_encoding['term'].unique()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# A dedicated encoder for the remaining categorical columns. It deliberately
# does NOT reuse the name `ordinal_encoder`, which holds the sub_grade encoder
# with its custom category order; rebinding that name would make the
# sub_grade transform cell fail (or mis-encode) if it were run again.
categorical_encoder = OrdinalEncoder()

# Work on copies so the pre-encoding frames stay available for inspection.
X_train_encoded = X_train_pre_encoding.copy()
X_valid_encoded = X_valid_pre_encoding.copy()
X_test_encoded = X_test_pre_encoding.copy()

# Sanity check: do train and validation share the same set of `term` values?
print(set(X_train_encoded['term'].unique()) == set(X_valid_encoded['term'].unique()))

# Fit on the training data only, then apply the same mapping to validation
# and test. Skipped when there is nothing to encode, because fitting an
# encoder on zero columns raises an error.
if good_label_cols:
    X_train_encoded[good_label_cols] = categorical_encoder.fit_transform(X_train_pre_encoding[good_label_cols])
    X_valid_encoded[good_label_cols] = categorical_encoder.transform(X_valid_pre_encoding[good_label_cols])
    X_test_encoded[good_label_cols] = categorical_encoder.transform(X_test_pre_encoding[good_label_cols])



In [None]:
# List the columns of the encoded training frame that are still object dtype.
categorical_cols = [
    name for name, dtype in X_train_encoded.dtypes.items() if dtype == 'object'
]
categorical_cols

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score, KFold

# Aliases for the encoded splits used from here on.
X_train_new, X_valid_new, X_test_new = X_train_encoded, X_valid_encoded, X_test_encoded


def _fit_and_report(train_features, valid_features, label):
    """Train a default XGBClassifier and print validation accuracy/precision.

    The model is fitted on ``train_features`` against the global ``y_train``
    and scored on ``valid_features`` against the global ``y_valid``.
    ``label`` is interpolated into the two printed lines.

    Returns the tuple ``(model, predictions, accuracy, precision)``.
    """
    classifier = xgb.XGBClassifier()
    classifier.fit(train_features, y_train)
    predictions = classifier.predict(valid_features)
    accuracy_value = accuracy_score(y_valid, predictions)
    precision_value = precision_score(y_valid, predictions)
    print(f"Accuracy with {label} features: {accuracy_value}")
    print(f"precision with {label} features: {precision_value}")
    return classifier, predictions, accuracy_value, precision_value


# Baseline: every encoded feature.
model, y_pred_initial, accuracy_initial, precision_initial = _fit_and_report(
    X_train_new, X_valid_new, "initial"
)

# Keep the features whose importance in the baseline model exceeds 0.00005.
feature_importances = model.feature_importances_
selected_features = [
    name
    for name, score in zip(X_train_new.columns, feature_importances)
    if score > 0.00005
]

# Restrict all three splits to the retained features.
X_train_selected = X_train_new[selected_features]
X_valid_selected = X_valid_new[selected_features]
X_test_selected = X_test_new[selected_features]

# Re-train and re-score using the retained features only.
model_selected, y_pred_selected, accuracy_selected, precision = _fit_and_report(
    X_train_selected, X_valid_selected, "selected"
)

# 5-fold shuffled cross-validation (accuracy) on the training data.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    model_selected, X_train_selected, y_train, cv=kf, scoring='accuracy'
)

print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())


In [None]:
# Score the selected-feature model on the data it was trained on.
y_pred_train = model_selected.predict(X_train_selected)
accuracy_train, precision_train = (
    metric(y_train, y_pred_train) for metric in (accuracy_score, precision_score)
)
print(f"Accuracy with selected features for training data: {accuracy_train}")
print(f"precision with selected features for training data: {precision_train}")


In [None]:
# Display the (rows, columns) shape of the encoded training frame.
X_train_new.shape

Now let us try principal component analysis (PCA).

In [None]:
# Print the shape of the encoded test frame, then display the shape of the
# encoded validation frame as the cell output.
print(X_test_new.shape)
X_valid_new.shape

In [None]:
# Preview the first rows of the encoded test frame.
X_test_new.head()

In [None]:
# Preview the first rows of the encoded validation frame.
X_valid_new.head()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# PCA picks directions of maximum variance, so features measured on large
# scales would dominate the components. Standardise first; the scaler is
# fitted on the training data only and then applied to validation.
pca_scaler = StandardScaler()
X_train_scaled = pca_scaler.fit_transform(X_train_selected)
X_valid_scaled = pca_scaler.transform(X_valid_selected)

# Ask for 20 components, but never more than the data can provide
# (PCA raises if n_components exceeds min(n_samples, n_features)).
n_pca_components = min(20, *X_train_selected.shape)

# random_state makes the result repeatable when scikit-learn selects its
# randomized SVD solver.
pca = PCA(n_components=n_pca_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)

# Wrap the projected arrays in DataFrames with columns PC1, PC2, ...
component_names = [f"PC{i+1}" for i in range(X_train_pca.shape[1])]
X_train_pca = pd.DataFrame(X_train_pca, columns=component_names)
X_valid_pca = pd.DataFrame(pca.transform(X_valid_scaled), columns=component_names)
X_valid_pca.head()

In [None]:
# Loadings matrix: one row per input feature, one column per principal
# component. `pca.components_` is (n_components, n_features), hence the
# transpose. Rows are labelled with the names of the features the PCA was
# fitted on so each loading can be traced back to its source column.
loadings = pd.DataFrame(
    pca.components_.T,
    columns=component_names,
    index=X_train_selected.columns,
)
loadings

In [None]:
# Train a default XGBoost classifier on the principal components and score
# it on the validation components.
model_with_pca = xgb.XGBClassifier()
model_with_pca.fit(X_train_pca, y_train)
y_pred_pca = model_with_pca.predict(X_valid_pca)
accuracy_pca = accuracy_score(y_valid, y_pred_pca)
precision_pca = precision_score(y_valid, y_pred_pca)

# Label the output as PCA results so it cannot be mistaken for the
# selected-feature model's metrics.
print(f"Accuracy with PCA features: {accuracy_pca}")
print(f"precision with PCA features: {precision_pca}")

In [None]:
# Predict labels for the test split with the selected-feature model.
# Despite its name, `y_test` holds model predictions, not ground-truth labels.
y_test = model_selected.predict(X_test_selected)



In [None]:
# Predictions are positional (row i of y_test belongs to row i of
# X_test_selected), while pd.concat(axis=1) aligns on index labels. Giving
# the prediction frame the same index makes the alignment explicit and keeps
# rows paired even if X_test_selected does not have a default 0..n-1 index.
y_test_df = pd.DataFrame({'predicted_label': y_test}, index=X_test_selected.index)

# Prediction column first, followed by the features used for the prediction.
combined_df = pd.concat([y_test_df, X_test_selected], axis=1)

# Save the combined DataFrame to a CSV file (row index omitted).
combined_df.to_csv('210670N.csv', index=False)

In [None]:
# Preview the first predicted labels.
y_test_df.head()

In [None]:
# Write the combined predictions/features frame to the Kaggle working
# directory, omitting the row index.
# NOTE(review): an earlier cell already writes the same frame to
# '210670N.csv' relative to the current directory; if that directory is
# /kaggle/working this is a duplicate write - confirm.
combined_df.to_csv('/kaggle/working/210670N.csv', index=False)

In [None]:
# Render a link to the saved CSV in the notebook output; the path is
# relative to the notebook's current working directory.
from IPython.display import FileLink
FileLink(r'210670N.csv')

In [None]:
# Display the (rows, columns) shape of the training frame restricted to the selected features.
X_train_selected.shape