In [24]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import time

In [25]:
print("Loading datasets...")
train_df = pd.read_csv('train.csv', low_memory=False)
test_df = pd.read_csv('test.csv', low_memory=False)
print("Loading of Dataset Completed!!")

Loading datasets...
Loading of Dataset Completed!!


In [26]:
def clean_data(df):
    print("Cleaning data...")
    if 'Age' in df.columns:
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df['Age'] = df['Age'].apply(lambda x: np.nan if x < 0 else x)
    
    if 'Credit_History_Age' in df.columns:
        df['Credit_History_Age'] = df['Credit_History_Age'].astype(str).str.extract(r'(\d+)')[0].astype(float)
        df['Credit_History_Age'].fillna(df['Credit_History_Age'].mode()[0], inplace=True)
    
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    imputer = SimpleImputer(strategy='mean')
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    
    unnecessary_cols = ['ID', 'Name', 'SSN', 'Customer_ID', 'Month']
    df.drop(columns=[col for col in unnecessary_cols if col in df.columns], inplace=True)
    
    return df


In [27]:
train_df = clean_data(train_df)
print("Cleaning of Training Dataset Completed!!")
test_df = clean_data(test_df)
print("Cleaning of Testing Dataset Completed!!")

Cleaning data...
Cleaning of Training Dataset Completed!!
Cleaning data...
Cleaning of Testing Dataset Completed!!


In [28]:
test_df = test_df[train_df.columns.drop('Credit_Score')]

In [29]:
X = train_df.drop(columns=['Credit_Score'])
y = train_df['Credit_Score']

In [30]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [31]:
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=[np.number]).columns

In [32]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [33]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])

In [34]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])

print("Splitting data into training and validation sets...")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print("Done with Splitting data into training and validation sets!!")

print("Training the model...")
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

print("Model training completed.")
print(f"Model Training Time: {end_time - start_time} seconds")

Splitting data into training and validation sets...
Done with Splitting data into training and validation sets!!
Training the model...
Model training completed.
Model Training Time: 500.34860825538635 seconds


In [35]:
print("Predicting on validation set...")
y_pred = model.predict(X_val)

Predicting on validation set...


In [36]:
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)

print(f"Validation Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Validation Accuracy: 0.77385
Classification Report:
              precision    recall  f1-score   support

        Good       0.76      0.69      0.72      3527
        Poor       0.76      0.77      0.77      5874
    Standard       0.78      0.80      0.79     10599

    accuracy                           0.77     20000
   macro avg       0.77      0.75      0.76     20000
weighted avg       0.77      0.77      0.77     20000



In [37]:
print("Predicting on test set...")
test_predictions = model.predict(test_df)

Predicting on test set...


In [38]:
test_predictions = label_encoder.inverse_transform(test_predictions)

In [39]:
test_df_with_ids = pd.read_csv('test.csv', usecols=['ID'])
test_results = pd.DataFrame({
    'ID': test_df_with_ids['ID'],  
    'Credit_Score': test_predictions
})

In [40]:
print("Saving results to CSV file...")
test_results.to_csv('test_predictions.csv', index=False)
print("Results saved successfully.")

Saving results to CSV file...
Results saved successfully.


In [41]:
from sklearn import set_config
set_config(display='diagram')
model