In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the datasets
train_file_path = r'C:\Users\vrhso\Downloads\new task\Assignment_Train.csv'
test_file_path = r'C:\Users\vrhso\Downloads\new task\Assignment_Test.csv'

train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# --- 1. Handle Missing Data ---
categorical_cols = ['HDB BRANCH NAME', 'HDB BRANCH STATE', 'MIDDLE NAME', 'LAST NAME', 'DEALER NAME', 
                    'MARITAL STATUS', 'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER NAME', 'EMPLOYER TYPE', 'Pan Name']

numerical_cols = ['Cibil Score', 'TOTAL ASSET COST', 'ASSET CTG', 'APPLIED AMOUNT']

train_data[categorical_cols] = train_data[categorical_cols].fillna('Unknown')
test_data[categorical_cols] = test_data[categorical_cols].fillna('Unknown')

train_data[numerical_cols] = train_data[numerical_cols].apply(pd.to_numeric, errors='coerce')
test_data[numerical_cols] = test_data[numerical_cols].apply(pd.to_numeric, errors='coerce')

train_data = train_data.dropna(axis=1, how='all')
test_data = test_data.dropna(axis=1, how='all')

numerical_cols = train_data.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols = [col for col in numerical_cols if col in test_data.columns]

imputer = SimpleImputer(strategy='median')
train_data[numerical_cols] = imputer.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = imputer.transform(test_data[numerical_cols])

# --- 2. Handle Date Columns (Remove or Convert) ---
date_cols = ['DATE OF BIRTH']
train_data = train_data.drop(columns=date_cols, errors='ignore')
test_data = test_data.drop(columns=date_cols, errors='ignore')

# --- 3. Align Features Between Train and Test Datasets ---
uid_column = None
if 'UID' in test_data.columns:
    uid_column = test_data['UID'].copy()
    test_data = test_data.drop(columns=['UID'])

missing_cols_in_test = set(train_data.columns) - set(test_data.columns)
missing_cols_in_train = set(test_data.columns) - set(train_data.columns)

for col in missing_cols_in_test:
    test_data[col] = np.nan
test_data = test_data[train_data.columns]

for col in missing_cols_in_train:
    train_data[col] = np.nan
train_data = train_data[test_data.columns]

# --- 4. Encode Categorical Variables ---
encode_cols = ['HDB BRANCH STATE', 'AADHAR VERIFIED', 'MOBILE VERIFICATION', 'MARITAL STATUS', 'GENDER']
label_encoder = LabelEncoder()

for col in encode_cols:
    train_data[col] = label_encoder.fit_transform(train_data[col].astype(str))
    test_data[col] = label_encoder.transform(test_data[col].astype(str))

# --- 5. Prepare Data for Modeling ---
X = train_data.drop(columns=['Application Status'])
y = train_data['Application Status']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 6. Handle NaNs and Infinities Before Scaling ---
X_train_numeric = X_train[numerical_cols]
X_val_numeric = X_val[numerical_cols]
test_data_numeric = test_data[numerical_cols]

X_train_numeric = X_train_numeric.fillna(X_train_numeric.median())
X_val_numeric = X_val_numeric.fillna(X_val_numeric.median())
test_data_numeric = test_data_numeric.fillna(test_data_numeric.median())

X_train_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)
X_val_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)
test_data_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)

X_train_numeric = imputer.fit_transform(X_train_numeric)
X_val_numeric = imputer.transform(X_val_numeric)
test_data_numeric = imputer.transform(test_data_numeric)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_numeric)
X_val_scaled = scaler.transform(X_val_numeric)
X_test_scaled = scaler.transform(test_data_numeric)

# --- 7. Train the Model ---
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_

# --- 8. Evaluate the Model on Validation Set ---
y_val_pred = best_model.predict(X_val_scaled)

print("Best cross-validation accuracy:", grid_search.best_score_)
print("Validation accuracy:", accuracy_score(y_val, y_val_pred))
print("Classification Report on validation set:\n", classification_report(y_val, y_val_pred))

# --- 9. Make Predictions on the Test Set ---
test_predictions = best_model.predict(X_test_scaled)

# --- 10. Save Predictions to CSV ---
if uid_column is not None:
    output_df = pd.DataFrame({
        'UID': uid_column,
        'Prediction': test_predictions
    })
else:
    output_df = pd.DataFrame({
        'Prediction': test_predictions
    })

output_df.to_csv('predictions.csv', index=False)

print("Predictions saved to 'predictions.csv'.")


Best cross-validation accuracy: 0.8779999999999999
Validation accuracy: 0.8685
Classification Report on validation set:
               precision    recall  f1-score   support

    APPROVED       0.88      0.93      0.90      1327
    DECLINED       0.84      0.75      0.79       673

    accuracy                           0.87      2000
   macro avg       0.86      0.84      0.85      2000
weighted avg       0.87      0.87      0.87      2000

Predictions saved to 'predictions.csv'.
