In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Path of the loan export, given relative to the notebook's working directory.
file_name = 'LoanExport.csv'
# low_memory=False has pandas parse each column in one pass instead of in
# chunks, which avoids mixed-dtype inference within a single column.
loan_data = pd.read_csv(filepath_or_buffer=file_name, low_memory=False)

In [3]:
# SellerName is excluded from the feature set. errors='ignore' keeps this cell
# idempotent: because the result is assigned back to `loan_data`, re-running
# the cell after the column is already gone would otherwise raise a KeyError.
loan_data = loan_data.drop(columns=['SellerName'], errors='ignore')

In [4]:
# Partition the columns by dtype: int64/float64 columns are treated as
# numeric, object and bool columns as categorical. Columns of any other dtype
# end up in neither list.
numerical_features = list(loan_data.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(loan_data.select_dtypes(include=['object', 'bool']).columns)


In [5]:
# The target must not be passed to the model as a feature: remove it from
# whichever dtype bucket it landed in. A plain `if` is used rather than a
# conditional expression because the removal is done purely for its side effect.
for _feature_list in (numerical_features, categorical_features):
    if 'EverDelinquent' in _feature_list:
        _feature_list.remove('EverDelinquent')


In [6]:
# Numeric columns that are routed through the binning pipeline rather than
# the plain scaling pipeline.
discrete_columns = [
    'CreditScore',
    'DTI',
    'OrigLoanTerm',
]

In [7]:
# Continuous numeric columns: fill missing values with the column mean, then
# standardise to zero mean / unit variance.
_numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=_numeric_steps)


In [8]:
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories that were not
# seen during fit pass through transform instead of raising an error.
_categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=_categorical_steps)


In [9]:

# Binned numeric columns: mean-impute, cut into 10 equal-width bins encoded as
# ordinal integers, then standardise the resulting bin indices.
_discrete_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('binning', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')),
    ('scaler', StandardScaler()),
]
discrete_transformer = Pipeline(steps=_discrete_steps)

In [10]:
# Numeric columns that are not explicitly binned go through the plain
# impute-and-scale pipeline.
continuous_features = [col for col in numerical_features if col not in discrete_columns]

# Route each column group to its own sub-pipeline. No `remainder` is given, so
# columns that appear in none of the three groups are dropped
# (ColumnTransformer's default).
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, continuous_features),
    ('discrete', discrete_transformer, discrete_columns),
    ('cat', categorical_transformer, categorical_features),
])


In [11]:
# Separate the target from the predictors.
y = loan_data['EverDelinquent']
X = loan_data.drop(columns=['EverDelinquent'])

# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split reproducible.
# NOTE(review): the split is not stratified although cross-validation uses
# StratifiedKFold; consider stratify=y if class balance matters -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# End-to-end estimator: preprocessing followed by an L2-penalised logistic
# regression. C is the inverse regularisation strength, so C=0.01 is a
# comparatively strong penalty; max_iter is raised to 2000.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(penalty='l2', C=0.01, max_iter=2000)),
])


In [13]:
# 10-fold cross-validation on the training split only; StratifiedKFold keeps
# the class proportions roughly equal across folds. No `scoring` is passed, so
# the estimator's default scorer is used.
cv = StratifiedKFold(n_splits=10)
cross_val_scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=cv)

print("Cross-validation scores:", cross_val_scores)
print("Mean CV score:", cross_val_scores.mean())


Cross-validation scores: [0.9251158  0.92267113 0.92400069 0.92438669 0.92340024 0.92494424
 0.92421513 0.92593069 0.92352891 0.92378624]
Mean CV score: 0.9241979756390462


In [14]:
# Fit on the full training split, then predict the held-out test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Keep the text report under its own name. Assigning it to
# `classification_report` would rebind the imported sklearn function to a
# string, so running this cell a second time would fail with
# "'str' object is not callable".
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:\n", report)
print("Accuracy:", accuracy)

Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     46767
           1       1.00      0.63      0.77     11524

    accuracy                           0.93     58291
   macro avg       0.96      0.81      0.86     58291
weighted avg       0.93      0.93      0.92     58291

Accuracy: 0.9260606268549175


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
file_name = 'LoanExport.csv'
loan_data = pd.read_csv(file_name, low_memory=False)

# Drop the SellerName column so it is not used as a feature
loan_data = loan_data.drop(columns=['SellerName'])

# Identify numerical and categorical columns by dtype
numerical_features = loan_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = loan_data.select_dtypes(include=['object', 'bool']).columns.tolist()

# Remove the target variable from whichever feature list it landed in.
# A plain `if` is used instead of a conditional expression because the
# removal is done purely for its side effect.
for _feature_list in (numerical_features, categorical_features):
    if 'EverDelinquent' in _feature_list:
        _feature_list.remove('EverDelinquent')

# Numeric columns that are binned rather than scaled directly
discrete_columns = ['CreditScore', 'DTI', 'OrigLoanTerm']

# Continuous numeric columns: mean-impute, then standardise
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: mode-impute, then one-hot encode; categories not seen
# during fit are ignored at transform time instead of raising
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Binned numeric columns: mean-impute, cut into 10 equal-width ordinal bins,
# then standardise the bin indices
discrete_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('binning', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')),
    ('scaler', StandardScaler())
])

# Bundle preprocessing for numerical, discrete, and categorical data.
# Columns that appear in none of the three groups are dropped
# (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, [col for col in numerical_features if col not in discrete_columns]),
        ('discrete', discrete_transformer, discrete_columns),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the data into training and testing sets (80/20, fixed seed)
X = loan_data.drop(columns=['EverDelinquent'])
y = loan_data['EverDelinquent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the SVM model pipeline.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples and suggests LinearSVC or
# SGDClassifier for large datasets. With 10-fold CV plus a final fit this cell
# may run for a very long time on this data -- confirm before running.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', SVC(kernel='linear', C=1.0))])

# Use StratifiedKFold for cross-validation to maintain class distribution
cv = StratifiedKFold(n_splits=10)
cross_val_scores = cross_val_score(model, X_train, y_train, cv=cv)
print("Cross-validation scores:", cross_val_scores)
print("Mean CV score:", cross_val_scores.mean())

# Train the model on the full training split and evaluate on the test split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Keep the text report under its own name. Assigning it to
# `classification_report` would rebind the imported sklearn function to a
# string, so any later call in the same kernel would fail with
# "'str' object is not callable".
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:\n", report)
print("Accuracy:", accuracy)
