In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

# Load data from CSV file
file_path = r"C:\Users\HAI\Downloads\Telegram Desktop\loan_data_nov2023.csv"  # Replace with the actual path to your CSV file
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('default', axis=1)
y = df['default']

# Define LabelEncoderTransformer
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.label_encoders = {}

    def fit(self, X, y=None):
        for column in self.columns:
            le = LabelEncoder()
            le.fit(X[column])
            self.label_encoders[column] = le
        return self

    def transform(self, X):
        X = X.copy()
        for column, le in self.label_encoders.items():
            X[column] = le.transform(X[column])
        return X

# Define preprocessing pipeline
label_encoder_columns = ['grade', 'ownership']
one_hot_columns = ['grade', 'ownership']
preprocessor_label = ColumnTransformer(
    transformers=[
        ('label', LabelEncoderTransformer(columns=label_encoder_columns), label_encoder_columns),
        ('onehot', OneHotEncoder(drop='first', sparse=False), one_hot_columns),
        ('scaler', StandardScaler(), ['amount', 'interest', 'years', 'income', 'age'])
    ],
    remainder='passthrough'
)

# Define the model pipeline
model_label = Pipeline([
    ('preprocessor', preprocessor_label),
    ('regressor', LinearRegression())
])

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_label.fit(X_train, y_train)

# Make predictions
y_pred_label = model_label.predict(X_test)

# Evaluate the model
mse_label = mean_squared_error(y_test, y_pred_label)
print(f'Mean Squared Error: {mse_label}')


Mean Squared Error: 0.09837718916221985




In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Load data from CSV file
file_path = r"C:\Users\HAI\Downloads\Telegram Desktop\loan_data_nov2023.csv"  # Replace with the actual path to your CSV file
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('default', axis=1)
y = df['default']

# Define preprocessing pipeline with OneHotEncoder
one_hot_columns = ['grade', 'ownership']
preprocessor_onehot = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', sparse=False), one_hot_columns),
        ('scaler', StandardScaler(), ['amount', 'interest', 'years', 'income', 'age'])
    ],
    remainder='passthrough'
)

# Define the model pipeline
model_onehot = Pipeline([
    ('preprocessor', preprocessor_onehot),
    ('regressor', LinearRegression())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_onehot.fit(X_train, y_train)

# Make predictions
y_pred_onehot = model_onehot.predict(X_test)

# Evaluate the model
mse_onehot = mean_squared_error(y_test, y_pred_onehot)
print(f'Mean Squared Error: {mse_onehot}')


Mean Squared Error: 0.09837260012503019


