In [1]:
# Mount Google Drive; the dataset files read later live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# **Importing Libraries**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# **BUILDING MODEL**

In [3]:
class Model:
    """Base pipeline for a classifier that predicts the 'loan_status' column.

    The class owns the data preparation and evaluation steps; subclasses only
    implement ``train`` and must store the fitted estimator in ``self.model``.

    Args:
        train_data: DataFrame containing the features and a 'loan_status' column.
        test_data: Optional DataFrame with the same raw layout, evaluated
            separately from the hold-out split taken from ``train_data``.
    """

    def __init__(self, train_data, test_data=None):
        self.train_data = train_data
        self.test_data = test_data
        # Populated by preprocess(): an 80/20 split of the training frame.
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # Populated by train() in subclasses.
        self.model = None
        self.scaler = StandardScaler()

    def load(self):
        """Report which datasets are available.

        The frames are supplied to the constructor, so nothing is read here;
        the method only prints a confirmation for each frame that is present.
        """
        print("Training data loaded successfully.")
        if self.test_data is not None:
            print("Test data loaded successfully.")

    def preprocess(self):
        """Encode, align, split and scale the data.

        Steps: expand datetime columns into year/month/day, one-hot encode
        object columns, align the test frame to the training columns, split
        the training frame 80/20, and standardize float64/int64 feature
        columns with a scaler fitted on the training split only.

        Missing values are not handled here. The frames passed to the
        constructor are copied first and are never modified.
        """
        print("Preprocessing data...")

        # Work on private copies: several Model instances may share the same
        # input frames, and the datetime step below edits its frame in place.
        self.train_data = self.train_data.copy()
        if self.test_data is not None:
            self.test_data = self.test_data.copy()

        # Handle datetime columns for both train and test data
        self._handle_datetime_columns(self.train_data)
        if self.test_data is not None:
            self._handle_datetime_columns(self.test_data)

        # One-hot encode categorical columns in training and test data
        self.train_data = self._handle_categorical_columns(self.train_data)
        if self.test_data is not None:
            self.test_data = self._handle_categorical_columns(self.test_data)

        # Align columns between train and test data
        if self.test_data is not None:
            self.test_data = self._align_columns(self.train_data, self.test_data)

        # Split the training dataset into features (X) and target (y)
        X = self.train_data.drop(columns='loan_status')  # 'loan_status' is the target
        y = self.train_data['loan_status']

        # Hold out 20% of the training data for evaluation
        X_train, X_holdout, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        # Copy the splits so the scaled values are written to independent
        # frames rather than to slices of X.
        self.X_train = X_train.copy()
        self.X_test = X_holdout.copy()

        # Feature scaling: only float64/int64 columns; fit on the training split
        numerical_columns = self.X_train.select_dtypes(include=['float64', 'int64']).columns
        if len(numerical_columns) > 0:
            self.X_train[numerical_columns] = self.scaler.fit_transform(self.X_train[numerical_columns])
            self.X_test[numerical_columns] = self.scaler.transform(self.X_test[numerical_columns])

            # Apply the same fitted scaling to the separate test data if it exists
            if self.test_data is not None:
                X_test_data = self.test_data.drop(columns='loan_status')
                self.test_data[numerical_columns] = self.scaler.transform(X_test_data[numerical_columns])

        print("Preprocessing complete.")

    def _handle_datetime_columns(self, data):
        """Replace each datetime column with <col>_year/_month/_day columns.

        ``data`` is modified in place and nothing is returned.
        """
        datetime_columns = data.select_dtypes(include=['datetime']).columns
        if len(datetime_columns) > 0:
            print(f"Found datetime columns: {datetime_columns}")
            for col in datetime_columns:
                timestamps = pd.to_datetime(data[col])  # No-op if already datetime
                data[f'{col}_year'] = timestamps.dt.year
                data[f'{col}_month'] = timestamps.dt.month
                data[f'{col}_day'] = timestamps.dt.day
                data.drop(columns=[col], inplace=True)  # Drop the original datetime column

    def _handle_categorical_columns(self, data):
        """Return a copy of ``data`` with object columns one-hot encoded.

        The first level of every encoded column is dropped (``drop_first=True``).
        """
        categorical_columns = data.select_dtypes(include=['object']).columns
        print(f"Found categorical columns: {categorical_columns}")
        return pd.get_dummies(data, columns=categorical_columns, drop_first=True)

    def _align_columns(self, train_data, test_data):
        """Return ``test_data`` restricted and ordered to the training columns.

        Columns present in ``train_data`` but absent from ``test_data`` are
        created and filled with 0; columns only present in ``test_data`` are
        dropped. Neither input frame is modified.

        Note: ``train_data`` still contains the target at this point, so a test
        frame without 'loan_status' receives an all-zero target column.
        """
        return test_data.reindex(columns=train_data.columns, fill_value=0)

    def train(self):
        """Train the model. Subclasses must override this and set ``self.model``."""
        raise NotImplementedError("Each model should implement its own training method.")

    def test(self):
        """Print accuracy and a classification report for the fitted model.

        The first report is labelled "Training" but is computed on the 20%
        hold-out split (``self.X_test`` / ``self.y_test``), not on the rows
        the model was fitted on. The second report, printed only when test
        data was supplied, uses the separate test frame.
        """
        print("Testing the model on training data...")
        predictions_train = self.model.predict(self.X_test)
        print("Training Accuracy Score: ", accuracy_score(self.y_test, predictions_train))
        print("Training Classification Report: ")
        print(classification_report(self.y_test, predictions_train))

        if self.test_data is not None:
            # Make predictions on the separate test data
            print("Testing the model on test data...")
            X_test_data = self.test_data.drop(columns='loan_status')
            y_test_data = self.test_data['loan_status']
            predictions_test = self.model.predict(X_test_data)
            print("Test Accuracy Score: ", accuracy_score(y_test_data, predictions_test))
            print("Test Classification Report: ")
            print(classification_report(y_test_data, predictions_test))

    def predict(self, new_data):
        """Return the fitted model's predictions for ``new_data``.

        ``new_data`` is passed to the estimator unchanged; no encoding,
        alignment or scaling is applied here.
        """
        return self.model.predict(new_data)

In [4]:
class LogisticRegressionModel(Model):
    """Model pipeline whose estimator is scikit-learn's LogisticRegression."""

    def __init__(self, train_data, test_data=None):
        super().__init__(train_data=train_data, test_data=test_data)

    def train(self):
        """Fit a logistic regression on the training split and keep it in ``self.model``."""
        print("Training Logistic Regression model...")
        # max_iter is raised above the library default to give the solver
        # more iterations to converge.
        classifier = LogisticRegression(max_iter=1000)
        self.model = classifier
        classifier.fit(self.X_train, self.y_train)
        print("Training complete.")

In [5]:
class RandomForestModel(Model):
    """Model pipeline whose estimator is scikit-learn's RandomForestClassifier."""

    def __init__(self, train_data, test_data=None):
        super().__init__(train_data=train_data, test_data=test_data)

    def train(self):
        """Fit a 100-tree random forest on the training split and keep it in ``self.model``."""
        print("Training Random Forest model...")
        # Fixed random_state keeps the fitted forest reproducible between runs.
        forest = RandomForestClassifier(n_estimators=100, random_state=42)
        self.model = forest
        forest.fit(self.X_train, self.y_train)
        print("Training complete.")

In [7]:
if __name__ == "__main__":

    # Read the raw training and test sets from the mounted Drive folder.
    train_df = pd.read_excel('/content/drive/MyDrive/NBFC_Loan_Dataset/train_data.xlsx')
    test_df = pd.read_excel('/content/drive/MyDrive/NBFC_Loan_Dataset/test_data.xlsx')

    logreg_model = LogisticRegressionModel(train_df, test_df)
    rf_model = RandomForestModel(train_df, test_df)

    # Run the same pipeline for each model: load, preprocess, train, evaluate.
    for pipeline in (logreg_model, rf_model):
        pipeline.load()
        pipeline.preprocess()
        pipeline.train()
        pipeline.test()

    # Collect the fitted estimators only after training. `.model` is None
    # until train() has run, so building this dict earlier would store None
    # for both entries and the serialization cell would pickle None.
    clf_models = {
        'Logistic Regression': logreg_model.model,
        'Random Forest': rf_model.model
    }

Training data loaded successfully.
Test data loaded successfully.
Preprocessing data...
Found datetime columns: Index(['transaction_date'], dtype='object')
Found datetime columns: Index(['transaction_date'], dtype='object')
Found categorical columns: Index(['sub_grade', 'term', 'home_ownership', 'purpose', 'application_type',
       'verification_status'],
      dtype='object')
Found categorical columns: Index(['sub_grade', 'term', 'home_ownership', 'purpose', 'application_type',
       'verification_status'],
      dtype='object')
Preprocessing complete.
Training Logistic Regression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training complete.
Testing the model on training data...
Training Accuracy Score:  0.7650938832944901
Training Classification Report: 
              precision    recall  f1-score   support

           0       0.61      0.26      0.37      5917
           1       0.78      0.94      0.86     16824

    accuracy                           0.77     22741
   macro avg       0.70      0.60      0.61     22741
weighted avg       0.74      0.77      0.73     22741

Testing the model on test data...
Test Accuracy Score:  0.6785334121821407
Test Classification Report: 
              precision    recall  f1-score   support

           0       0.59      0.35      0.44      3055
           1       0.70      0.86      0.77      5400

    accuracy                           0.68      8455
   macro avg       0.65      0.61      0.61      8455
weighted avg       0.66      0.68      0.65      8455

Training data loaded successfully.
Test data loaded successfully.
Preprocessing data...
Found categorical c

In [16]:
import pickle # Import the pickle module

# Refuse to write files for entries that hold no estimator: pickling None
# succeeds silently and yields a .pkl file that cannot be used for inference.
untrained = [name for name, model in clf_models.items() if model is None]
if untrained:
    raise ValueError(
        "Cannot serialize untrained models: " + ", ".join(untrained)
        + ". Build clf_models after calling train() on each model."
    )

# Write one file per model, e.g. 'Logistic Regression' -> logistic_regression_model.pkl
for name, model in clf_models.items():
    with open(f"{name.replace(' ', '_').lower()}_model.pkl", "wb") as f:
        pickle.dump(model, f)

The training pipeline, implemented in the `model_.py` script, follows a structured, class-based design to build and manage two models: Logistic Regression and Random Forest. The workflow includes:

- **Data loading:** the training and test datasets are read into pandas DataFrames and passed to each model.
- **Preprocessing:** steps such as feature scaling, categorical encoding, and handling missing values are applied.
- **Training:** each model is trained on the training split and optimized for performance.
- **Testing:** performance is evaluated on a hold-out test set using accuracy, precision, recall, and F1-score.
- **Prediction:** a prediction function is provided for inference on new data.

Both trained models are serialized as `.pkl` files so they can be reused without retraining, which simplifies deployment.