In [61]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

test_copy = pd.read_csv("CPC_Assignment files/test_with_predictions.csv")

# Encode categorical variables
label_encoders = {}
for col in ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'property_Area']:
    le = LabelEncoder()
    test_copy[col] = le.fit_transform(test_copy[col].astype(str))
    label_encoders[col] = le

# Split data into features and target for training the regressor
features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
            'ApplicantIncome', 'CoapplicantIncome', "LoanAmount", 'Loan_Amount_Term', 'Credit_History', 'property_Area', 'LoanAmount_log']

loan_status_0 = test_copy[test_copy['Loan_Status'] == 0]

# Features for the regression model (excluding the target variable)
X_loan_status_0 = loan_status_0[features]

# Target variable (LoanAmount)
y_loan_status_0 = loan_status_0['LoanAmount']

X_train, X_test, y_train, y_test = train_test_split(X_loan_status_0, y_loan_status_0, test_size=0.2, random_state=42)

best_gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
best_gb_model.fit(X_train, y_train)

y_pred = best_gb_model.predict(X_test)

# Calculate the Mean Squared Error (MSE) or any other metric to evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Data: {mse}")

# Now make predictions for the full Loan_Status = 0 data
y_pred_full_loan_amount = best_gb_model.predict(X_loan_status_0)

# Add the predicted loan amounts to the dataset
loan_status_0['Predicted_LoanAmount'] = y_pred_full_loan_amount

loan_status_0['Predicted_LoanAmount'] = loan_status_0['Predicted_LoanAmount'].round(2)

print("Predicted Loan Amounts for Loan_Status = 0 :")
print(loan_status_0[['Predicted_LoanAmount']])

# Create a new column for the minimum eligible loan term for each individual
loan_status_0['Min_Eligible_Loan_Term'] = None


# Iterate through each individual to find their minimum loan duration (if term ≤ 20 years)
for index, row in loan_status_0.iterrows():
    # Filter rows for this individual with term ≤ 240 months
    eligible_terms = loan_status_0[
        (loan_status_0['Loan_Amount_Term'] <= 240) &
        (loan_status_0['Predicted_LoanAmount'] > 0) &
        (loan_status_0.index == index)  # Ensure it's for this specific person
    ]
    
    # Determine the minimum eligible term, if any
    if not eligible_terms.empty:
        loan_status_0.at[index, 'Min_Eligible_Loan_Term'] = eligible_terms['Loan_Amount_Term'].min()
    else:
        loan_status_0.at[index, 'Min_Eligible_Loan_Term'] = 'None'
    print(f"Index: {index}, Eligible Terms: {eligible_terms}")

loan_status_0 = loan_status_0.reset_index(drop=True)
test_copy.loc[test_copy['Loan_Status'] == 0, 'Min_Eligible_Loan_Term'] = loan_status_0['Min_Eligible_Loan_Term']


# Update the original dataset with the predicted loan amounts
test_copy.loc[test_copy['Loan_Status'] == 0, 'Predicted_LoanAmount'] = y_pred_full_loan_amount.round(2)

# Update the original dataset with the new column
test_copy.loc[test_copy['Loan_Status'] == 0, 'Min_Eligible_Loan_Term'] = loan_status_0['Min_Eligible_Loan_Term']


# Save the updated dataset to a new file
output_file = "CPC_Assignment files/test_with_predictions.csv"
test_copy.to_csv(output_file, index=False)
print(f"\nUpdated file written to {output_file}")

print(test_copy[test_copy['Loan_Status'] == 0][['Loan_Amount_Term', 'Min_Eligible_Loan_Term']])
    


Mean Squared Error on Test Data: 49.79994021750834
Predicted Loan Amounts for Loan_Status = 0 :
     Predicted_LoanAmount
7                  148.12
13                 165.98
25                 147.99
35                 175.99
55                 130.08
..                    ...
317                 67.07
325                 95.05
339                162.00
346                134.89
354                157.98

[61 rows x 1 columns]
Index: 7, Eligible Terms: Empty DataFrame
Columns: [Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, property_Area, LoanAmount_log, Loan_Status, Predicted_LoanAmount, Min_Eligible_Loan_Term]
Index: []
Index: 13, Eligible Terms: Empty DataFrame
Columns: [Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, property_Area, LoanAmount_log, Loan_Status, Predicted_LoanAmount, Min_Eligible_Loan_Term]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_status_0['Predicted_LoanAmount'] = y_pred_full_loan_amount
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_status_0['Predicted_LoanAmount'] = loan_status_0['Predicted_LoanAmount'].round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_status_0['Min_Eligible_Loan_Term'] = None
 nan na