<a href="https://colab.research.google.com/github/TheBarmaEffect/Assignment-Interview-/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Read the train and test CSVs, then the feature dictionary workbook.
# All three files must already be uploaded to the Colab session under /content.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/Assignment_Train.csv', '/content/Assignment_Test.csv')
)
feature_dict = pd.read_excel('/content/Assignment_FeatureDictionary.xlsx')

# Print a heading followed by the first rows of the feature dictionary.
print("Feature Dictionary:", feature_dict.head(), sep="\n")

# Lookup of column name -> free-text description, taken from the dictionary.
feature_mapping = dict(zip(feature_dict['COLUMN NAME'], feature_dict['COLUMN DESCRIPTION']))

# Keywords searched for (case-insensitively) inside each description.
# The categorical keywords are checked first, so they win over numeric ones.
categorical_keywords = ('categorical', 'category')
numerical_keywords = ('numeric', 'number', 'score')

# Buckets that drive the preprocessing steps further down.
categorical_features = []
numerical_features = []
special_handling_features = []

for name, raw_description in feature_mapping.items():
    # A missing description is NaN; str() turns it into 'nan', which matches
    # none of the keywords and therefore lands in the special-handling bucket.
    text = str(raw_description).lower()
    if any(keyword in text for keyword in categorical_keywords):
        bucket = categorical_features
    elif any(keyword in text for keyword in numerical_keywords):
        bucket = numerical_features
    else:
        bucket = special_handling_features
    bucket.append(name)

print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)
print("Special Handling Features:", special_handling_features)

# Columns reported as 100% missing in the test set; they are removed from
# both frames so train and test keep the same set of candidate features.
columns_to_drop = [
    'Phone Social Premium.a23games',
    'Phone Social Premium.my11',
    'Phone Social Premium.rummycircle',
    'Phone Social Premium.yatra',
]
# drop() returns a new frame, so the raw train_df / test_df stay untouched.
train_df_cleaned = train_df.drop(columns_to_drop, axis=1)
test_df_cleaned = test_df.drop(columns_to_drop, axis=1)

# Force every dictionary-flagged numeric column to a numeric dtype in both
# frames; any value that cannot be parsed as a number becomes NaN.
for frame in (train_df_cleaned, test_df_cleaned):
    for column in numerical_features:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

# Numeric columns that are entirely NaN in either frame (train is checked
# first, then test) are no longer treated as numeric features.
cols_to_remove = [
    col
    for frame in (train_df_cleaned, test_df_cleaned)
    for col in numerical_features
    if frame[col].isna().all()
]
numerical_features = [col for col in numerical_features if col not in cols_to_remove]

# Imputers: column median for numeric features, most frequent value for
# categorical features.
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values from the training frame only and then
# applies those same values to the test frame.
for imputer, feature_group in (
    (imputer_num, numerical_features),
    (imputer_cat, categorical_features),
):
    train_df_cleaned[feature_group] = imputer.fit_transform(train_df_cleaned[feature_group])
    test_df_cleaned[feature_group] = imputer.transform(test_df_cleaned[feature_group])

# Any column that still holds a NaN (i.e. one that was not imputed above) is
# dropped, independently for each frame.
train_df_cleaned = train_df_cleaned.dropna(axis=1)
test_df_cleaned = test_df_cleaned.dropna(axis=1)

# Standardise the numeric features. The scaler is fitted on the training
# frame only and the same fitted scaler is then applied to both frames.
scaler = StandardScaler()
scaler.fit(train_df_cleaned[numerical_features])
for frame in (train_df_cleaned, test_df_cleaned):
    frame[numerical_features] = scaler.transform(frame[numerical_features])

# Split the cleaned training frame into the feature matrix and the target.
X = train_df_cleaned.drop(columns=['Application Status'])
y = train_df_cleaned['Application Status']

# Integer-encode the categorical columns. Each encoder is fitted on the
# training values only and stored in `label_encoders`, keyed by column name.
label_encoders = {}
for column in categorical_features:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    # LabelEncoder.transform raises ValueError on a label it was not fitted
    # on, so a category that occurs only in the test set would abort the run.
    # Look the codes up explicitly instead and send unseen labels to -1; for
    # labels the encoder knows, these are the same codes le.transform returns.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_df_cleaned[column] = (
        test_df_cleaned[column].map(code_by_label).fillna(-1).astype(int)
    )
    label_encoders[column] = le

# One-hot encode the remaining non-numeric training columns. The first level
# of each encoded column is dropped and acts as the implicit baseline.
X = pd.get_dummies(X, drop_first=True)

# Encode the test set WITHOUT drop_first. Dropping the first level
# independently on the test set can remove a level that training kept (for
# example when the training baseline level is absent from the test data);
# reindex would then zero-fill that column and silently encode those rows as
# the training baseline. Keeping every test level avoids that.
test_df_encoded = pd.get_dummies(test_df_cleaned)

# Align the test columns to the training columns: levels the model never saw
# (including the training baseline) are discarded, and training columns that
# are missing from the test set are added as all-zero.
test_df_encoded = test_df_encoded.reindex(columns=X.columns, fill_value=0)

# The classifier is only trained when the feature matrix is free of NaN;
# otherwise a warning is printed and nothing else in this step runs.
if X.isnull().values.any():
    print("Warning: There are still NaN values in X after preprocessing.")
else:
    # Train a RandomForestClassifier (fixed seed for reproducible runs)
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X, y)

    # Evaluate the model on the training data.
    # NOTE(review): these predictions are for the same rows the model was
    # fitted on, so the metrics below describe training fit, not performance
    # on unseen data.
    train_predictions = rf_model.predict(X)

    # 'APPROVED' is treated as the positive class for the binary metrics.
    accuracy = accuracy_score(y, train_predictions)
    precision = precision_score(y, train_predictions, pos_label='APPROVED')
    recall = recall_score(y, train_predictions, pos_label='APPROVED')
    f1 = f1_score(y, train_predictions, pos_label='APPROVED')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y, train_predictions))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y, train_predictions))

    # Make predictions on the test data
    test_predictions = rf_model.predict(test_df_encoded)

    # Prepare output file: one row per test record, identified by the UID
    # column of the original (uncleaned) test frame.
    output_df = pd.DataFrame({
        'UID': test_df['UID'],
        'Prediction': test_predictions
    })

    # Save the predictions to a CSV file
    output_df.to_csv('/content/predictions.csv', index=False)
    print("Predictions have been saved to 'predictions.csv'")

    # Download the file if running on Colab. The google.colab package only
    # exists inside a Colab runtime, so guard the import: elsewhere the CSV
    # has already been written and the run should not end with ImportError.
    try:
        from google.colab import files
    except ImportError:
        print("google.colab is not available; skipping the browser download.")
    else:
        files.download('/content/predictions.csv')


Feature Dictionary:
              COLUMN NAME                 COLUMN DESCRIPTION
0               DEALER ID    Unique identifier of the dealer
1  APPLICATION LOGIN DATE  Date of submission of application
2         HDB BRANCH NAME                Bank branch details
3        HDB BRANCH STATE                                NaN
4              FIRST NAME                  Submitted Details
Categorical Features: ['ASSET CTG']
Numerical Features: ['Cibil Score', 'MOBILE VERIFICATION', 'ASSET MODEL NO', 'Primary Asset Model No', 'phone_digitalage', 'phone_phoneFootprintStrengthOverall']
Special Handling Features: ['DEALER ID', 'APPLICATION LOGIN DATE', 'HDB BRANCH NAME', 'HDB BRANCH STATE', 'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'mobile', 'AADHAR VERIFIED', 'DEALER NAME', 'TOTAL ASSET COST', 'LOS ID', 'APPLIED AMOUNT', 'PRIMARY ASSET MAKE', 'Personal Email Address', 'MARITAL STATUS', 'GENDER', 'DOB', 'AGE', 'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER NAME', 'EMPLOYER TYPE', 'Pan Name', '

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>