# Loan Prediction Analysis

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3c896cb1",
   "metadata": {},
   "source": [
    "# Loan Prediction Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f38c67b",
   "metadata": {},
   "source": [
    "### Dataset Information :"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19d29427",
   "metadata": {},
   "source": [
    "Dream Housing Finance company deals in all home loans. They have a presence across all urban, semi-urban and rural areas. A customer first applies for a home loan, after which the company validates the customer's eligibility for the loan. The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have posed the problem of identifying the customer segments that are eligible for a loan, so that they can specifically target these customers.\n",
    "\n",
    "This is a standard supervised classification task: we have to predict whether a loan would be approved or not. Below are the dataset attributes with their descriptions."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d18c028",
   "metadata": {},
   "source": [
    "### Import modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc9dfba3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "from matplotlib import pyplot as plt\n",
    "import matplotlib\n",
    "%matplotlib inline\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, r2_score, mean_absolute_error, mean_squared_error, roc_curve\n",
     "from sklearn.linear_model import LogisticRegression, LinearRegression\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d42f1ca6",
   "metadata": {},
   "source": [
    "### Load the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "527f3dc7",
   "metadata": {},
   "outputs": [],
    "source": [
     "try:\n",
     "    df = pd.read_csv('training_set.csv')\n",
     "except FileNotFoundError as err:\n",
     "    # Fail loudly with guidance. Calling exit() here would stop the notebook kernel\n",
     "    # instead of surfacing an error in this cell.\n",
     "    raise FileNotFoundError(\n",
     "        \"The dataset file 'training_set.csv' was not found. Please provide the correct path.\"\n",
     "    ) from err\n",
     "df.head()\n"
    ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e27b7803",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df76297a",
   "metadata": {},
   "outputs": [
       {
            "name": "stdout",
            "output_type": "stream",
            "text": [
                "<class 'pandas.core.frame.DataFrame'>\\n",
                "RangeIndex: 614 entries, 0 to 613\\n",
                "Data columns (total 13 columns):\\n",
                " #   Column             Non-Null Count  Dtype  \\n",
                "---  ------             --------------  -----  \\n",
                " 0   Loan_ID            614 non-null    object \\n",
                " 1   Gender             601 non-null    object \\n",
                " 2   Married            611 non-null    object \\n",
                " 3   Dependents         599 non-null    object \\n",
                " 4   Education          614 non-null    object \\n",
                " 5   Self_Employed      582 non-null    object \\n",
                " 6   ApplicantIncome    614 non-null    int64  \\n",
                " 7   CoapplicantIncome  614 non-null    float64\\n",
                " 8   LoanAmount         592 non-null    float64\\n",
                " 9   Loan_Amount_Term   600 non-null    float64\\n",
                " 10  Credit_History     564 non-null    float64\\n",
                " 11  property_Area      614 non-null    object \\n",
                " 12  loan_status        614 non-null    object \\n",
                "dtypes: float64(4), int64(1), object(8)\\n",
                "memory usage: 62.5+ KB\\n"
            ]
        }
    ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be5605cf",
   "metadata": {},
   "outputs": [
       {
            "data": {
                "text/html": [
                    "<div>\\n",
                    "<style scoped>\\n",
                    "    .dataframe tbody tr th:only-of-type {\\n",
                    "        vertical-align: middle;\\n",
                    "    }\\n",
                    "\\n",
                    "    .dataframe tbody tr th {\\n",
                    "        vertical-align: top;\\n",
                    "    }\\n",
                    "\\n",
                    "    .dataframe thead th {\\n",
                    "        text-align: right;\\n",
                    "    }\\n",
                    "</style>\\n",
                    "<table border=\\"1\\" class=\\"dataframe\\">\\n",
                    "  <thead>\\n",
                    "    <tr style=\\"text-align: right;\\">\\n",
                    "      <th></th>\\n",
                    "      <th>applicantincome</th>\\n",
                    "      <th>coapplicantincome</th>\\n",
                    "      <th>loanamount</th>\\n",
                    "      <th>loan_amount_term</th>\\n",
                    "      <th>credit_history</th>\\n",
                    "    </tr>\\n",
                    "  </thead>\\n",
                    "  <tbody>\\n",
                    "    <tr>\\n",
                    "      <th>count</th>\\n",
                    "      <td>614.000000</td>\\n",
                    "      <td>614.000000</td>\\n",
                    "      <td>592.000000</td>\\n",
                    "      <td>600.00000</td>\\n",
                    "      <td>564.000000</td>\\n",
                    "    </tr>\\n",
                    "    <tr>\\n",
                    "      <th>mean</th>\\n",
                    "      <td>5403.459283</td>\\n",
                    "      <td>1621.245798</td>\\n",
                    "      <td>146.412162</td>\\n",
                    "      <td>342.00000</td>\\n",
                    "      <td>0.842199</td>\\n",
                    "    </tr>\\n",
                    "    <tr>\\n",
                    "      <th>std</th>\\n",
                    "      <td>6109.041673</td>\\n",
                    "      <td>2926.248369</td>\\n",
                    "      <td>85.587325</td>\\n",
                    "      <td>65.12041</td>\\n",
                    "      <td>0.364878</td>\\n",
                    "    </tr>\\n",
                    "    <tr>\\n",
                    "      <th>min</th>\\n",
                    "      <td>150.000000</td>\\n",
                    "      <td>0.000000</td>\\n",
                    "      <td>9.000000</td>\\n",
                    "      <td>12.00000</td>\\n",
                    "      <td>0.000000</td>\\n",
                    "    </tr>\\n",
                    "    <tr>\\n",
                    "      <th>25%</th>\\n",
                    "      <td>2877.500000</td>\\n",
                    "      <td>0.000000</td>\\n",
                    "      <td>100.000000</td>\\n",
                    "      <td>360.00000</td>\\n",
                    "      <td>1.000000</td>\\n",
                    "    </tr>\\n",
                    "    <tr>\\n",
                    "      <th>50%</th>\\n",
                    "      <td>3812.500000</td>\\n",
                    "      <td>1188.500000</td>\\n",
                    "      <td>128.000000</td>\\n",
                    "      <td>360.00000</td>\\n",
                    "      <td>1.000000</td>\\n",
                    "    </tr>\\n",
                    "    <tr>\\n",
                    "      <th>75%</th>\\n",
                    "      <td>5795.000000</td>\\n",
                    "      <td>2297.250000</td>\\n",
                    "      <td>168.000000</td>\\n",
                    "      <td>360.00000</td>\\n",
                    "      <td>1.000000</td>\\n",
                    "    </tr>\\n",
                    "    <tr>\\n",
                    "      <th>max</th>\\n",
                    "      <td>81000.000000</td>\\n",
                    "      <td>41667.000000</td>\\n",
                    "      <td>700.000000</td>\\n",
                    "      <td>480.00000</td>\\n",
                    "      <td>1.000000</td>\\n",
                    "    </tr>\\n",
                    "  </tbody>\\n",
                    "</table>\\n",
                    "</div>"
                ],
                "text/plain": [
                    "       applicantincome  coapplicantincome  loanamount  loan_amount_term  \\\n",
                    "count      614.000000         614.000000  592.000000         600.00000   \n",
                    "mean      5403.459283        1621.245798  146.412162         342.00000   \n",
                    "std       6109.041673        2926.248369   85.587325          65.12041   \n",
                    "min        150.000000           0.000000    9.000000          12.00000   \n",
                    "25%       2877.500000           0.000000  100.000000         360.00000   \n",
                    "50%       3812.500000        1188.500000  128.000000         360.00000   \n",
                    "75%       5795.000000        2297.250000  168.000000         360.00000   \n",
                    "max      81000.000000       41667.000000  700.000000         480.00000   \n",
                    "\n",
                    "       credit_history  \n",
                    "count      564.000000  \n",
                    "mean         0.842199  \n",
                    "std          0.364878  \n",
                    "min          0.000000  \n",
                    "25%          1.000000  \n",
                    "50%          1.000000  \n",
                    "75%          1.000000  \n",
                    "max          1.000000  "
                ]
            },
            "execution_count": 5,
            "metadata": {},
            "output_type": "execute_result"
        }
    ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "425e50e3",
   "metadata": {},
   "source": [
    "## Preprocessing the dataset"
   ]
  },
   {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d62c653",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "loan_id              0\n",
       "gender               13\n",
       "married               3\n",
       "dependents           15\n",
       "education             0\n",
       "self_employed        32\n",
       "applicantincome       0\n",
       "coapplicantincome     0\n",
       "loanamount           22\n",
       "loan_amount_term     14\n",
       "credit_history       50\n",
       "property_Area        0\n",
       "loan_status           0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# finding null values\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a49ed3fa",
   "metadata": {},
   "outputs": [],
   "source": [
     "def handle_missing_values(data):\n",
    "    # Replace '3+' with 3 and convert to numeric, replacing NaNs with 0\n",
    "    if 'Dependents' in data.columns:\n",
    "        data['Dependents'] = data['Dependents'].replace('3+', '3').fillna(0).astype(int)\n",
    "\n",
    "    # Impute missing values for categorical features with the most frequent value (mode)\n",
    "    categorical_columns = ['Self_Employed', 'Gender', 'Married', 'Education', 'property_Area']\n",
    "    for col in categorical_columns:\n",
    "        if col in data.columns:\n",
    "            data[col] = data[col].fillna(data[col].mode()[0])\n",
    "\n",
    "    # Impute missing values for numerical features with the median\n",
    "    numerical_columns = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']\n",
    "    for col in numerical_columns:\n",
    "       if col in data.columns:\n",
    "          data[col] = data[col].fillna(data[col].median())\n",
    "    return data\n",
    "\n",
    "df = handle_missing_values(df)"
   ]
  },
    {
   "cell_type": "code",
   "execution_count": null,
   "id": "893fe650",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "loan_id              0\n",
       "gender               0\n",
       "married              0\n",
       "dependents           0\n",
       "education            0\n",
       "self_employed        0\n",
       "applicantincome       0\n",
       "coapplicantincome     0\n",
       "loanamount           0\n",
       "loan_amount_term     0\n",
       "credit_history       0\n",
       "property_Area        0\n",
       "loan_status          0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum()"
   ]
  },
   {
   "cell_type": "code",
   "execution_count": null,
   "id": "2218c970",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column name follows the raw header casing used by handle_missing_values ('Dependents')\n",
    "df['Dependents'] = df['Dependents'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "810bb41a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>loan_id</th>\n",
       "      <th>gender</th>\n",
       "      <th>married</th>\n",
       "      <th>dependents</th>\n",
       "      <th>education</th>\n",
       "      <th>self_employed</th>\n",
       "      <th>applicantincome</th>\n",
       "      <th>coapplicantincome</th>\n",
       "      <th>loanamount</th>\n",
       "      <th>loan_amount_term</th>\n",
       "      <th>credit_history</th>\n",
       "      <th>property_area</th>\n",
       "      <th>loan_status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LP001002</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>5849</td>\n",
       "      <td>0.0</td>\n",
       "      <td>146.412162</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LP001003</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>1</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>4583</td>\n",
       "      <td>1508.0</td>\n",
       "      <td>128.000000</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Rural</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LP001005</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>Yes</td>\n",
       "      <td>3000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>66.000000</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LP001006</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>Not Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>2583</td>\n",
       "      <td>2358.0</td>\n",
       "      <td>120.000000</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LP001008</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>6000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>141.000000</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>y</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    loan_id gender married  dependents     education self_employed  \\\n",
       "0  LP001002   Male      No           0      Graduate            No   \n",
       "1  LP001003   Male     Yes           1      Graduate            No   \n",
       "2  LP001005   Male     Yes           0      Graduate           Yes   \n",
       "3  LP001006   Male     Yes           0  Not Graduate            No   \n",
       "4  LP001008   Male      No           0      Graduate            No   \n",
       "\n",
       "   applicantincome  coapplicantincome  loanamount  loan_amount_term  \\\n",
       "0             5849                0.0  146.412162             360.0   \n",
       "1             4583             1508.0  128.000000             360.0   \n",
       "2             3000                0.0   66.000000             360.0   \n",
       "3             2583             2358.0  120.000000             360.0   \n",
       "4             6000                0.0  141.000000             360.0   \n",
       "\n",
       "   credit_history property_Area loan_status  \n",
       "0             1.0         Urban           y  \n",
       "1             1.0         Rural           n  \n",
       "2             1.0         Urban           y  \n",
       "3             1.0         Urban           y  \n",
       "4             1.0         Urban           y  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b95515f8",
   "metadata": {},
   "source": [
    "### Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f33a4171",
   "metadata": {},
   "outputs": [],
   "source": [
        "# Source series for each log-transformed feature, in the order the columns are added\n",
        "log_inputs = {\n",
        "    'applicantincome_log': df['ApplicantIncome'],\n",
        "    'loanamount_log': df['LoanAmount'],\n",
        "    'loan_amount_term_log': df['Loan_Amount_Term'],\n",
        "    'total_income_log': df['ApplicantIncome'] + df['CoapplicantIncome'],\n",
        "}\n",
        "# log(x + 1) keeps zero-valued entries finite\n",
        "for new_col, values in log_inputs.items():\n",
        "    df[new_col] = np.log(values + 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e74ed00f",
   "metadata": {},
   "outputs": [],
   "source": [
        "# Drop the raw columns superseded by their log versions, plus the row identifier.\n",
        "# 'total_income' is not listed: only 'total_income_log' is ever created.\n",
        "# Names are matched case-insensitively because the raw headers mix cases (e.g. 'Loan_ID', 'loan_status').\n",
        "drop_names = {'applicantincome', 'coapplicantincome', 'loanamount', 'loan_amount_term', 'loan_id'}\n",
        "cols = [c for c in df.columns if c.lower() in drop_names]\n",
        "df = df.drop(columns=cols)"
   ]
  },
   {
   "cell_type": "code",
   "execution_count": null,
   "id": "080b10af",
   "metadata": {},
   "outputs": [
     {
        "data": {
            "text/html": [
               "<div>\\n",
                "<style scoped>\\n",
                "    .dataframe tbody tr th:only-of-type {\\n",
                "        vertical-align: middle;\\n",
               "    }\\n",
                "\\n",
                "    .dataframe tbody tr th {\\n",
                "        vertical-align: top;\\n",
                "    }\\n",
                "\\n",
                "    .dataframe thead th {\\n",
                "        text-align: right;\\n",
                "    }\\n",
                "</style>\\n",
                "<table border=\\"1\\" class=\\"dataframe\\">\\n",
                "  <thead>\\n",
               "    <tr style=\\"text-align: right;\\">\\n",
                "      <th></th>\\n",
               "      <th>gender</th>\\n",
               "      <th>married</th>\\n",
                "      <th>dependents</th>\\n",
                "      <th>education</th>\\n",
                "      <th>self_employed</th>\\n",
                "      <th>credit_history</th>\\n",
                "      <th>property_Area</th>\\n",
                "      <th>loan_status</th>\\n",
               "      <th>applicantincome_log</th>\\n",
               "      <th>loanamount_log</th>\\n",
               "      <th>loan_amount_term_log</th>\\n",
                "      <th>total_income_log</th>\\n",
                "    </tr>\\n",
                "  </thead>\\n",
                "  <tbody>\\n",
               "    <tr>\\n",
               "      <th>0</th>\\n",
                "      <td>Male</td>\\n",
                "      <td>No</td>\\n",
                "      <td>0</td>\\n",
               "      <td>Graduate</td>\\n",
                "      <td>No</td>\\n",
                "      <td>1.0</td>\\n",
                "      <td>Urban</td>\\n",
                "      <td>y</td>\\n",
                "      <td>8.674197</td>\\n",
                "      <td>4.993232</td>\\n",
                "      <td>5.888878</td>\\n",
               "      <td>8.674197</td>\\n",
                "    </tr>\\n",
                "    <tr>\\n",
                "      <th>1</th>\\n",
                "      <td>Male</td>\\n",
                "      <td>Yes</td>\\n",
                "      <td>1</td>\\n",
                "      <td>Graduate</td>\\n",
                "      <td>No</td>\\n",
               "      <td>1.0</td>\\n",
                "      <td>Rural</td>\\n",
                "      <td>n</td>\\n",
                "      <td>8.430327</td>\\n",
                "      <td>4.859812</td>\\n",
                "      <td>5.888878</td>\\n",
               "      <td>8.714732</td>\\n",
                "    </tr>\\n",
               "    <tr>\\n",
                "      <th>2</th>\\n",
                "      <td>Male</td>\\n",
                "      <td>Yes</td>\\n",
                "      <td>0</td>\\n",
               "      <td>Graduate</td>\\n",
                "      <td>Yes</td>\\n",
                "      <td>1.0</td>\\n",
                "      <td>Urban</td>\\n",
                "      <td>y</td>\\n",
                "      <td>8.006701</td>\\n",
                "      <td>4.204693</td>\\n",
                "      <td>5.888878</td>\\n",
               "      <td>8.006701</td>\\n",
                "    </tr>\\n",
               "    <tr>\\n",
                "      <th>3</th>\\n",
               "      <td>Male</td>\\n",
                "      <td>Yes</td>\\n",
                "      <td>0</td>\\n",
                "      <td>Not Graduate</td>\\n",
                "      <td>No</td>\\n",
                "      <td>1.0</td>\\n",
               "      <td>Rural</td>\\n",
                "      <td>y</td>\\n",
               "      <td>7.857094</td>\\n",
                "      <td>4.795791</td>\\n",
                "      <td>5.888878</td>\\n",
                "      <td>8.505525</td>\\n",
               "    </tr>\\n",
                "    <tr>\\n",
                "      <th>4</th>\\n",
                "      <td>Male</td>\\n",
               "      <td>No</td>\\n",
                "      <td>0</td>\\n",
                "      <td>Graduate</td>\\n",
               "      <td>No</td>\\n",
                "      <td>1.0</td>\\n",
               "      <td>Urban</td>\\n",
                "      <td>y</td>\\n",
                "      <td>8.699681</td>\\n",
               "      <td>4.955827</td>\\n",
                "      <td>5.888878</td>\\n",
               "      <td>8.699681</td>\\n",
                "    </tr>\\n",
                "  </tbody>\\n",
                "</table>\\n",
                "</div>"
            ],
            "text/plain": [
               "  gender married  dependents     education self_employed  credit_history  \\\n",
               "0   Male      No           0      Graduate            No             1.0   \n",
                "

  gender married  dependents     education self_employed  credit_history  
0   Male      No           0      Graduate            No             1.0   
1   Male     Yes           1      Graduate            No             1.0   
2   Male     Yes           0      Graduate           Yes             1.0   
3   Male     Yes           0  Not Graduate            No             1.0   
4   Male      No           0      Graduate            No             1.0   

  property_Area loan_status  applicantincome_log  loanamount_log  loan_amount_term_log  total_income_log  
0         Urban           y             8.674197        4.993232              5.888878          8.674197  
1         Rural           n             8.430327        4.859812              5.888878          8.714732  
2         Urban           y             8.006701        4.204693              5.888878          8.006701  
3         Urban           y             7.857094        4.795791              5.888878          8.505525  
4         Urban           y             8.699681        4.955827              5.888878          8.699681

Label Encoding:

Add a code cell to perform label encoding on the categorical features. This transforms your categorical data into numerical form that machine learning models can use.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode each categorical column as integers so the models can consume it.
# Names are matched case-insensitively because the raw headers mix cases
# (e.g. 'Gender' but 'loan_status'); an exact lower-case lookup raises KeyError.
cols = ['gender','married','education','self_employed','property_Area','loan_status']
wanted = {name.lower() for name in cols}
for col in [c for c in df.columns if c.lower() in wanted]:
    # A fresh encoder per column keeps each column's fitted classes separate
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

Check Data Info:

Add a code cell to display the .info() of your dataframe. This will show the data types of your features after encoding.

df.info()

Model Building (Classification):

Split Data: Create a code cell to split your data into training and testing sets (e.g., 75% for training and 25% for testing):



In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors, then hold out 25% of the rows for testing.
Y = df['loan_status']
X = df.drop(columns=['loan_status'])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.25)

In [None]:
*   **Classification Function:** Create a code cell that defines a function to evaluate classification models using accuracy, precision, recall, F1-score, and ROC-AUC. Include cross-validation for better evaluation.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

def classify(model, X, Y):
    """Fit `model` on a 75/25 split and report its classification performance.

    Prints accuracy, precision, recall and F1 (as percentages) on the held-out
    rows, then the mean 5-fold cross-validation score on the full data, and
    finally draws the ROC curve from the predicted probabilities of column 1.
    Nothing is returned.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    holdout_metrics = (
        ('Accuracy Score : ', accuracy_score),
        ('Precision Score : ', precision_score),
        ('Recall Score : ', recall_score),
        ('F1 Score : ', f1_score),
    )
    for label, metric in holdout_metrics:
        print(label, metric(Y_test, Y_pred) * 100)

    # cross validation - used for better validation of the model
    cv_scores = cross_val_score(model, X, Y, cv=5)
    print("Cross validation is, ", np.mean(cv_scores) * 100)

    # ROC curve built from the probabilities in column 1 of predict_proba
    positive_proba = model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(Y_test, positive_proba)
    auc_value = roc_auc_score(Y_test, positive_proba)

    plt.plot(false_pos_rate, true_pos_rate, label=f'ROC curve (area = {auc_value:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

In [None]:
 *   **Model Selection, Training & Evaluation:** Add code cells to apply the classification function to at least four different models:
    *   Logistic Regression
    *   Decision Tree
    *   Random Forest
    *   Extra Trees Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
# Warnings are silenced notebook-wide, so a solver that stops before converging
# would go unnoticed; allow more iterations than the default.
model = LogisticRegression(max_iter=1000)
classify(model, X, Y)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seeded so the reported scores are reproducible across runs (same seed as the data splits)
model = DecisionTreeClassifier(random_state=42)
classify(model, X, Y)

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# Seeded so the reported scores are reproducible across runs (same seed as the data splits)
model = RandomForestClassifier(random_state=42)
classify(model, X, Y)

In [None]:
# Seeded so the reported scores are reproducible across runs (same seed as the data splits)
model = ExtraTreesClassifier(random_state=42)
classify(model, X, Y)

Model Building (Regression):

Data Preparation: First, we prepare the data for the regression tasks by keeping only the ineligible customers (`loan_status == 0`), and then build the feature and target sets for predicting the loan amount and the loan duration.

In [None]:
# After feature engineering the raw amount/term columns and the loan id are gone;
# only the log-transformed 'loanamount_log' and 'loan_amount_term_log' remain,
# so both regression targets are on the log scale.
df_ineligible = df[df['loan_status'] == 0]

# Loan amount: the target must not also appear among the features.
X_reg_amount = df_ineligible.drop(columns=['loan_status', 'loanamount_log'])
Y_reg_amount = df_ineligible['loanamount_log']

# Loan duration: drop the target and, as before, the loan amount.
X_reg_duration = df_ineligible.drop(columns=['loan_status', 'loanamount_log', 'loan_amount_term_log'])
Y_reg_duration = df_ineligible['loan_amount_term_log']

In [None]:
*   **Regression Function:** Add a code cell to define a function that will evaluate regression models using R², MAE, MSE, and RMSE. Also, add residuals and actual vs. predicted values to have complete model evaluation.

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

def regression(model, X, Y):
    """Fit `model` on an 80/20 split and report hold-out regression diagnostics.

    Prints R^2, MAE, MSE and RMSE for the held-out rows, then shows two scatter
    plots: residuals against predictions, and predictions against actual values.
    Nothing is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    report = {
        'R^2': r2_score(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),  # root of the MSE
    }
    for name, value in report.items():
        print(f'{name}: {value:.2f}')

    # Residual plot: points should scatter evenly around the zero line
    errors = y_test - y_pred
    plt.figure(figsize=(10, 5))
    plt.scatter(y_pred, errors)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.title('Residuals vs. Predicted values')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.show()

    # Actual vs. predicted values plot
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test, y_pred)
    plt.title('Actual vs. Predicted values')
    plt.xlabel('Actual values')
    plt.ylabel('Predicted values')
    plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression on the loan-amount feature/target sets
print("Evaluating Loan Amount Regression")
model = LinearRegression()
regression(model, X_reg_amount, Y_reg_amount)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seeded so the reported metrics are reproducible across runs (same seed as the data splits)
model = RandomForestRegressor(random_state=42)
print("Evaluating Minimum Loan Duration Regression")
regression(model, X_reg_duration, Y_reg_duration)

Hyperparameter Tuning:

Add code cells to perform hyperparameter tuning for each model using GridSearchCV. Do this separately for classification and regression tasks. These cells should also include model training and evaluations.

In [None]:
from sklearn.model_selection import GridSearchCV

def hyperparameter_tuning_classification(X, Y):
    """Grid-search a RandomForestClassifier and evaluate the best estimator.

    The data is split 80/20 (``random_state=42``). A 3-fold ``GridSearchCV``
    scored on accuracy is run on the training part over ``n_estimators``,
    ``max_depth``, ``min_samples_leaf`` and ``min_samples_split``. The best
    estimator is then scored on the held-out part: the best parameters,
    accuracy, precision, recall, F1 and ROC AUC are printed and the ROC curve
    is plotted.

    Parameters
    ----------
    X : features accepted by ``train_test_split`` and the classifier
    Y : class labels aligned with ``X``; the metrics are called with their
        defaults, so a binary target is assumed

    Returns
    -------
    The best estimator found by the search (``grid_search.best_estimator_``).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 8, 10],
        'min_samples_leaf': [1, 3, 5],
        'min_samples_split': [2, 4, 6],
    }

    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    # The second predict_proba column feeds both the AUC and the ROC curve, so
    # compute it once instead of running the whole forest over X_test twice.
    y_score = best_model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    print("Best Parameters:", grid_search.best_params_)
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'ROC AUC: {roc_auc:.2f}')

    # thresholds are returned by roc_curve but not needed for the plot.
    fpr, tpr, _ = roc_curve(y_test, y_score)
    plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

    return best_model
# Run the classification grid search. The grid has 3 values for each of 4
# hyperparameters (81 combinations) evaluated with 3-fold CV, so this call can
# take a while. The returned estimator is reused by the output-file cell.
print("Classification Model hyperparameter tuning")
best_classification_model = hyperparameter_tuning_classification(X, Y)

In [None]:
from sklearn.model_selection import GridSearchCV

def hyperparameter_tuning_regression(X,Y):
    """Tune a RandomForestRegressor with GridSearchCV and report hold-out results.

    The data is split 80/20 (``random_state=42``). A 3-fold grid search scored
    on negative mean squared error is run on the training part. The best
    estimator is used to predict the held-out part; the best parameters, R^2,
    MAE, MSE and RMSE are printed, and a residual plot plus an
    actual-vs-predicted plot are shown.

    Parameters
    ----------
    X : features accepted by ``train_test_split`` and the regressor
    Y : target values aligned with ``X``

    Returns
    -------
    The best estimator found by the search.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )

    # Every combination of these values is tried by the search.
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 8, 10],
        'min_samples_leaf': [1, 3, 5],
        'min_samples_split': [2, 4, 6],
    }
    searcher = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=search_space,
        cv=3,
        scoring='neg_mean_squared_error',
    )
    searcher.fit(features_train, target_train)

    tuned_model = searcher.best_estimator_
    predictions = tuned_model.predict(features_test)

    r_squared = r2_score(target_test, predictions)
    abs_error = mean_absolute_error(target_test, predictions)
    sq_error = mean_squared_error(target_test, predictions)
    root_sq_error = np.sqrt(sq_error)  # RMSE is derived from the MSE above

    print("Best Parameters:", searcher.best_params_)
    print(f'R^2: {r_squared:.2f}')
    print(f'MAE: {abs_error:.2f}')
    print(f'MSE: {sq_error:.2f}')
    print(f'RMSE: {root_sq_error:.2f}')

    # Residuals (actual - predicted) against predictions; the dashed red line
    # marks zero error.
    errors = target_test - predictions
    _, resid_ax = plt.subplots(figsize=(10, 5))
    resid_ax.scatter(predictions, errors)
    resid_ax.axhline(y=0, color='r', linestyle='--')
    resid_ax.set_xlabel('Predicted values')
    resid_ax.set_ylabel('Residuals')
    resid_ax.set_title('Residuals vs. Predicted values')
    plt.show()

    # Predictions against the actual hold-out values.
    _, fit_ax = plt.subplots(figsize=(10, 5))
    fit_ax.scatter(target_test, predictions)
    fit_ax.set_xlabel('Actual values')
    fit_ax.set_ylabel('Predicted values')
    fit_ax.set_title('Actual vs. Predicted values')
    plt.show()
    return tuned_model

# Run the regression grid search once per target. Each call evaluates 81
# hyperparameter combinations with 3-fold CV, so this cell can take a while.
# Both returned estimators are reused by the output-file cell.
print("Regression Model hyperparameter tuning for loan amount")
best_loan_amount_model = hyperparameter_tuning_regression(X_reg_amount, Y_reg_amount)
print("Regression Model hyperparameter tuning for loan duration")
best_loan_duration_model = hyperparameter_tuning_regression(X_reg_duration, Y_reg_duration)

Output files:

Add a code cell to create and save the output files, as mentioned in the instructions. Use the best-performing models from above to make the predictions.

In [None]:
# Make predictions on the testing set with the three tuned models and write
# the results to two CSV files.
# NOTE(review): `lem` must already be bound by an earlier cell (it is not
# imported in this part of the notebook) — confirm.
testing_data_path = 'testing_set.csv'
testing_data = lem.load_data(testing_data_path)
# Presumably load_data returns None when the file cannot be read; stop early
# rather than failing later on a None frame.
if testing_data is None:
        raise FileNotFoundError("Unable to load testing data")
# NOTE(review): this first-row call keeps the whole return value in one name and
# the result is not used below, while the loop unpacks the same call into two
# values — confirm what predict_loan_eligibility returns and whether this
# extra call is still needed.
sample_eligibility_pred = lem.predict_loan_eligibility(best_classification_model, testing_data.iloc[0].to_dict())
# One entry per row of testing_data, appended in row order.
predicted_eligibilities = []
predicted_max_amounts = []
predicted_min_durations = []
for i in range(len(testing_data)):
     # Each row is handed to the helpers as a plain {column: value} dict.
     input_data = testing_data.iloc[i].to_dict()
     # NOTE(review): predicted_max_amount from this call is never used; the
     # amount written to the CSV comes from predict_max_loan_amount below.
     predicted_eligibility, predicted_max_amount  = lem.predict_loan_eligibility(best_classification_model, input_data)
     max_loan_amount_pred = lem.predict_max_loan_amount(best_loan_amount_model, input_data)
     min_loan_duration_pred = lem.predict_min_loan_duration(best_loan_duration_model, input_data)

     predicted_eligibilities.append(predicted_eligibility)
     predicted_max_amounts.append(max_loan_amount_pred)
     predicted_min_durations.append(min_loan_duration_pred)

# Eligibility output: Loan_ID alongside the predicted eligibility, written
# without the DataFrame index.
eligibility_output_df = pd.DataFrame({'loan_id':testing_data['Loan_ID'],
                                  'predicted_eligibility': predicted_eligibilities})
eligibility_output_df.to_csv("eligibility_predictions.csv",index=False)

# Loan output: Loan_ID alongside the predicted maximum amount and minimum
# duration, written without the DataFrame index.
loan_output_df = pd.DataFrame({'loan_id': testing_data['Loan_ID'],
                               'predicted_max_amount': predicted_max_amounts,
                               'predicted_min_duration': predicted_min_durations})
loan_output_df.to_csv("loan_predictions.csv", index=False)