In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.pipeline import make_pipeline

import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore")

# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the Random Forest Classifier with class weights
rfc = RandomForestClassifier(random_state=2, class_weight={1:1, 2:2, 3:3})

# Train the model
rfc.fit(X_train, y_train)

# Predict on the test set
y_pred = rfc.predict(X_test)

# Calculate and print accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy of Random Forest with class weights and standardization: {acc:.4f}")


Accuracy of Random Forest with class weights and standardization: 0.7983


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the Random Forest Classifier with class weights
rfc = RandomForestClassifier(random_state=2, class_weight={1:1, 2:2, 3:3})

# Train the model
rfc.fit(X_train, y_train)

# Predict on the test set
y_pred = rfc.predict(X_test)

# Calculate and print accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy of Random Forest with class weights and standardization: {acc:.4f}")


Accuracy of Random Forest with class weights and standardization: 0.7983


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from lightgbm import LGBMClassifier

# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the base models
rfc = RandomForestClassifier(random_state=10, class_weight={0: 1, 1: 2, 2: 3})
knn = KNeighborsClassifier(n_neighbors=5)
brf = BalancedRandomForestClassifier()
bb = BalancedBaggingClassifier()
lgbm = LGBMClassifier()

# Define the StackingClassifier with the base models
model = StackingClassifier(
    estimators=[
        ("rfc", rfc),
        ("knn", knn),
        ("brf", brf),
        ("bb", bb),
        ("lgbm", lgbm),
    ],
    final_estimator=RandomForestClassifier(random_state=10, class_weight={0: 1, 1: 2, 2: 3}),
)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy of StackingClassifier with standardization: {acc:.4f}")
print("Test Score:",model.score(X_test, y_test))


Accuracy of StackingClassifier with standardization: 0.8064
Test Score: 0.80635


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # enable the HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier

# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Standardize the data
scaler = PowerTransformer(method='yeo-johnson', standardize=True)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the base models
bagging = BaggingClassifier(n_jobs=-1)
extraTrees = ExtraTreesClassifier(max_depth=10, n_jobs=-1)
randomForest = RandomForestClassifier(n_jobs=-1)
histGradientBoosting = HistGradientBoostingClassifier()
XGB = XGBClassifier(n_jobs=-1)

# Define the StackingClassifier with the base models
model = StackingClassifier(
    estimators=[
        ("bagging", bagging),
        ("extraTrees", extraTrees),
        ("randomForest", randomForest),
        ("histGradientBoosting", histGradientBoosting),
        ("XGB", XGB),
    ],
    
)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy of StackingClassifier with standardization: {acc:.4f}")


Accuracy of StackingClassifier with standardization: 0.7972


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Load the dataset
df = pd.read_csv("v2_train.csv")


# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the base models
rfc = RandomForestClassifier(random_state=10, class_weight={0: 1, 1: 2, 2: 3})
knn = KNeighborsClassifier(n_neighbors=5)
dtc = DecisionTreeClassifier()
nn = MLPClassifier(hidden_layer_sizes=(100,))
xgb = XGBClassifier()
cat = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=False)
lgbm = LGBMClassifier()

# Define the StackingClassifier with the base models
model = StackingClassifier(
    estimators=[
        ("rfc", rfc),
        ("knn", knn),
        ("dtc", dtc),
        ("nn", nn),
        ("xgb", xgb),
        ("cat", cat),
        ("lgbm", lgbm),
    ],
    final_estimator=RandomForestClassifier(random_state=10, class_weight={0: 1, 1: 2, 2: 3}),
)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy of StackingClassifier with standardization: {acc:.4f}")


Learning rate set to 0.098942
0:	learn: 1.0370432	total: 157ms	remaining: 2m 37s
1:	learn: 0.9893051	total: 176ms	remaining: 1m 27s
2:	learn: 0.9489309	total: 195ms	remaining: 1m 4s
3:	learn: 0.9156802	total: 219ms	remaining: 54.4s
4:	learn: 0.8871635	total: 245ms	remaining: 48.8s
5:	learn: 0.8632830	total: 271ms	remaining: 44.9s
6:	learn: 0.8437032	total: 304ms	remaining: 43.1s
7:	learn: 0.8252578	total: 332ms	remaining: 41.2s
8:	learn: 0.8092290	total: 361ms	remaining: 39.7s
9:	learn: 0.7956158	total: 390ms	remaining: 38.6s
10:	learn: 0.7839986	total: 422ms	remaining: 37.9s
11:	learn: 0.7733737	total: 450ms	remaining: 37s
12:	learn: 0.7644073	total: 478ms	remaining: 36.3s
13:	learn: 0.7568547	total: 504ms	remaining: 35.5s
14:	learn: 0.7496377	total: 528ms	remaining: 34.7s
15:	learn: 0.7431277	total: 555ms	remaining: 34.1s
16:	learn: 0.7379431	total: 575ms	remaining: 33.3s
17:	learn: 0.7331701	total: 598ms	remaining: 32.6s
18:	learn: 0.7290428	total: 619ms	remaining: 32s
19:	learn: 0.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier

# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the base models
rfc = RandomForestClassifier(random_state=10, class_weight={0: 1, 1: 2, 2: 3})
knn = KNeighborsClassifier(n_neighbors=5)
gbc = GradientBoostingClassifier()

# Define the VotingClassifier with the base models
model = VotingClassifier(
    estimators=[
        ("rfc", rfc),
        ("knn", knn),
        ("gbc", gbc),
    ],
    voting="soft",
)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy of VotingClassifier with standardization: {acc:.4f}")


Accuracy of VotingClassifier with standardization: 0.7808


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the base estimator
base_estimator = DecisionTreeClassifier(max_depth=3)

# Define the AdaBoostClassifier with the base estimator
model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, learning_rate=0.1, random_state=10)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate and print accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy of AdaBoostClassifier with DecisionTreeClassifier: {acc:.4f}")


Accuracy of AdaBoostClassifier with DecisionTreeClassifier: 0.6903


In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the data into features (X) and target (y)
X = df.drop('Credit_Score', axis=1)
y = df['Credit_Score']

# Create a random forest classifier with 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the data
rf.fit(X, y)

# Get the feature importances
importances = rf.feature_importances_

# Get the feature names
feature_names = X.columns

# Create a DataFrame of feature importances
fi = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Sort the DataFrame by importance in descending order
fi = fi.sort_values('importance', ascending=False)

fi

NameError: name 'RandomForestClassifier' is not defined

In [None]:
# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Gradient Boosting Machines", GradientBoostingClassifier()),   
    ("LightGBM", LGBMClassifier()),
    ("Decision Trees", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
]
accuracy_dict={}
# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name]=acc

# Print the accuracy dictionary sorted by value in descending order
sorted_accuracy = sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True)
for name, acc in sorted_accuracy:
    print(f"{name} Accuracy: {acc:.4f}")

Random Forest Accuracy: 0.7941
LightGBM Accuracy: 0.7258
Gradient Boosting Machines Accuracy: 0.6966
Decision Trees Accuracy: 0.6931
K-Nearest Neighbors Accuracy: 0.6520
Naive Bayes Accuracy: 0.6312
Logistic Regression Accuracy: 0.5802
Neural Network Accuracy: 0.5253


In [None]:
# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Gradient Boosting Machines", GradientBoostingClassifier()),   
    ("LightGBM", LGBMClassifier()),
    ("Decision Trees", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
]

accuracy_dict={}
# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name]=acc

# Print the accuracy dictionary sorted by value in descending order
sorted_accuracy = sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True)
for name, acc in sorted_accuracy:
    print(f"{name} Accuracy: {acc:.4f}")

Random Forest Accuracy: 0.7943
LightGBM Accuracy: 0.7262
K-Nearest Neighbors Accuracy: 0.6994
Gradient Boosting Machines Accuracy: 0.6965
Decision Trees Accuracy: 0.6924
Neural Network Accuracy: 0.6914
Logistic Regression Accuracy: 0.6353
Naive Bayes Accuracy: 0.6318


In [None]:
# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Gradient Boosting Machines", GradientBoostingClassifier()),   
    ("LightGBM", LGBMClassifier()),
    ("Decision Trees", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
]

accuracy_dict={}
# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name]=acc

# Print the accuracy dictionary sorted by value in descending order
sorted_accuracy = sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True)
for name, acc in sorted_accuracy:
    print(f"{name} Accuracy: {acc:.4f}")


Random Forest Accuracy: 0.7935
LightGBM Accuracy: 0.7286
Gradient Boosting Machines Accuracy: 0.6971
Decision Trees Accuracy: 0.6954
K-Nearest Neighbors Accuracy: 0.6520
Naive Bayes Accuracy: 0.6308
Neural Network Accuracy: 0.6036
Logistic Regression Accuracy: 0.5774


In [None]:
# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Gradient Boosting Machines", GradientBoostingClassifier()),   
    ("LightGBM", LGBMClassifier()),
    ("Decision Trees", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
]

accuracy_dict={}
# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name]=acc

# Print the accuracy dictionary sorted by value in descending order
sorted_accuracy = sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True)
for name, acc in sorted_accuracy:
    print(f"{name} Accuracy: {acc:.4f}")

Random Forest Accuracy: 0.7953
LightGBM Accuracy: 0.7284
K-Nearest Neighbors Accuracy: 0.7278
Decision Trees Accuracy: 0.6987
Gradient Boosting Machines Accuracy: 0.6972
Neural Network Accuracy: 0.6902
Logistic Regression Accuracy: 0.6341
Naive Bayes Accuracy: 0.6310


In [None]:
# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Define the models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Gradient Boosting Machines", GradientBoostingClassifier()),   
    ("LightGBM", LGBMClassifier()),
    ("Decision Trees", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
]

# Train and evaluate each model
accuracy_dict = {}
for name, model in models:
    model.fit(X_train_scaled, y_train_resampled)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name] = acc

# Print the model accuracies sorted by accuracy
sorted_models = sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True)
for model, acc in sorted_models:
    print(f"{model}: {acc:.4f}")


Random Forest: 0.7757
LightGBM: 0.7169
Gradient Boosting Machines: 0.6810
Decision Trees: 0.6740
K-Nearest Neighbors: 0.6714
Neural Network: 0.6691
Logistic Regression: 0.6238
Naive Bayes: 0.5981


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the random_state values to try
random_states = [2,3,5,10, 20, 30, 40, 50]

# Train and evaluate the model with different random_state values
for rs in random_states:
    # print(f"Training Random Forest with random_state={rs}...")
    rf = RandomForestClassifier(random_state=rs)
    rf.fit(X_train_scaled, y_train)
    y_pred = rf.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f"Random Forest Accuracy with random_state={rs}: {acc:.4f}")


Random Forest Accuracy with random_state=2: 0.7974
Random Forest Accuracy with random_state=3: 0.7931
Random Forest Accuracy with random_state=5: 0.7931
Random Forest Accuracy with random_state=10: 0.7966
Random Forest Accuracy with random_state=20: 0.7939
Random Forest Accuracy with random_state=30: 0.7944
Random Forest Accuracy with random_state=40: 0.7944
Random Forest Accuracy with random_state=50: 0.7934


In [None]:

# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define class weights for the RandomForestClassifier
class_weights = {1: 1, 2: 2, 3: 3}

# Define the models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Gradient Boosting Machines", GradientBoostingClassifier()),   
    ("LightGBM", LGBMClassifier()),
    ("Decision Trees", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier(class_weight=class_weights)),
]

# Train and evaluate each model
accuracy_dict = {}
for name, model in models:
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name] = acc
    

# Print the accuracy of each model in descending order
print("Accuracy of each model:")
for name, acc in sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: {acc:.4f}")


Accuracy of each model:
Random Forest: 0.7944
LightGBM: 0.7262
K-Nearest Neighbors: 0.6994
Decision Trees: 0.6967
Gradient Boosting Machines: 0.6966
Neural Network: 0.6874
Logistic Regression: 0.6353
Naive Bayes: 0.6318


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the models
models = [      ("Balanced Bagging", BalancedBaggingClassifier()),    
                ("Balanced Random Forest", BalancedRandomForestClassifier()),    
                ("Easy Ensemble", EasyEnsembleClassifier()),     
              ]

accuracy_dict = {}
# Train and evaluate the models
for name, model in models:
    if name == "Random Forest":
        model.set_params(class_weight=class_weights)
    if name == "K-Nearest Neighbors":
        model.set_params(n_jobs=-1)
    if name == "Neural Network":
        model.set_params(hidden_layer_sizes=(100,), max_iter=500)
    if name == "Gradient Boosting Machines":
        model.set_params(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
    if name == "LightGBM":
        model.set_params(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name] = acc

print("Accuracy of each model:")
for name, acc in sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: {acc:.4f}")


Accuracy of each model:
Balanced Bagging: 0.7639
Balanced Random Forest: 0.7461
Easy Ensemble: 0.6587


In [None]:
from imblearn.under_sampling import ClusterCentroids
from sklearn.linear_model import LogisticRegression

# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Undersample the majority class using ClusterCentroids
cc = ClusterCentroids(random_state=42)
X_train_resampled, y_train_resampled = cc.fit_resample(X_train, y_train)

# Train a logistic regression classifier on the resampled training set
lr = LogisticRegression()
lr.fit(X_train_resampled, y_train_resampled)

# Evaluate the classifier on the test set
y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")


Accuracy: 0.4577


In [None]:
lr=RandomForestClassifier()
lr.fit(X_train_resampled, y_train_resampled)

# Evaluate the classifier on the test set
y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.6290


In [None]:
from imblearn.over_sampling import KMeansSMOTE

# Load the dataset
df = pd.read_csv("v1_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Generate synthetic samples using KMeansSMOTE
kmeans_smote = KMeansSMOTE()
X_train_resampled, y_train_resampled = kmeans_smote.fit_resample(X_train_scaled, y_train)

# Define the models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("LightGBM", LGBMClassifier()),
    ("Neural Network", MLPClassifier(max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Balanced Bagging", BalancedBaggingClassifier()),    
    ("Balanced Random Forest", BalancedRandomForestClassifier()),    
    ("Easy Ensemble", EasyEnsembleClassifier()), 
]

# Train and evaluate the models
accuracy_dict = {}
for name, model in models:
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name] = acc

# Print the accuracy of each model
print("Accuracy of each model:")
for name, acc in sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: {acc:.4f}")


Accuracy of each model:
Balanced Random Forest: 0.7831
Random Forest: 0.7812
Balanced Bagging: 0.7620
LightGBM: 0.7215
K-Nearest Neighbors: 0.6928
Decision Tree: 0.6886
Neural Network: 0.6805
Easy Ensemble: 0.6532
Logistic Regression: 0.6512
Gaussian Naive Bayes: 0.6283


In [None]:
# Load the dataset
df = pd.read_csv("v2_train.csv")

# Split the dataset into training and test sets
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data using Yeo-Johnson transformation
scaler = PowerTransformer(method='yeo-johnson', standardize=True)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Gradient Boosting Machines", GradientBoostingClassifier()),   
    ("LightGBM", LGBMClassifier()),
    ("Decision Trees", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
]

accuracy_dict={}
# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_dict[name]=acc

# Print the accuracy dictionary sorted by value in descending order
sorted_accuracy = sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True)
for name, acc in sorted_accuracy:
    print(f"{name} Accuracy: {acc:.4f}")


Random Forest Accuracy: 0.7940
LightGBM Accuracy: 0.7285
K-Nearest Neighbors Accuracy: 0.7231
Decision Trees Accuracy: 0.6976
Gradient Boosting Machines Accuracy: 0.6972
Neural Network Accuracy: 0.6963
Logistic Regression Accuracy: 0.6375
Naive Bayes Accuracy: 0.6369


<br>
<br>

## Juan Part

In [132]:
# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelEncoder
from datasist.structdata import detect_outliers
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import category_encoders as ce
import re 
import pandas as pd

# Modeling and evaluation 
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier
)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report


df = pd.read_csv("v2_train.csv")
df=df.drop('Payment_of_Min_Amount',axis=1)
df=df.drop('Num_of_Loan',axis=1)
# df=df.drop('Occupation Number',axis=1)
# df=df.drop('Num_Bank_Accounts',axis=1)

### Data Spliting

Try Resampling

In [133]:
# define dataset
X, y = df.drop("Credit_Score",axis=1).values , df["Credit_Score"] 
y.value_counts(normalize=True)

2    0.53174
1    0.28998
3    0.17828
Name: Credit_Score, dtype: float64

### Apply oversampling


In [134]:
y.value_counts(normalize=True)

2    0.53174
1    0.28998
3    0.17828
Name: Credit_Score, dtype: float64

In [135]:
from imblearn.over_sampling import SMOTE
rus = SMOTE(sampling_strategy='auto')
X_data_rus, y_data_rus = rus.fit_resample(X, y)

In [136]:
y_data_rus.value_counts(normalize=True)

3    0.333333
2    0.333333
1    0.333333
Name: Credit_Score, dtype: float64

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X_data_rus, y_data_rus, test_size=0.2, random_state=42,stratify=y_data_rus)

### Handling Numerical

In [138]:
scalar = PowerTransformer(method='yeo-johnson', standardize=True).fit(X_train)

In [139]:
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

## Modeling and Evaluation

Model Building

In [140]:
bagging = BaggingClassifier(n_jobs=-1)
extraTrees = ExtraTreesClassifier(max_depth=10, n_jobs=-1)
randomForest = RandomForestClassifier(n_jobs=-1,random_state=10, class_weight={0: 1, 1: 2, 2: 3})
histGradientBoosting = HistGradientBoostingClassifier()
XGB = XGBClassifier(n_jobs=-1)

model = StackingClassifier(
    estimators=[
    ('bagging', bagging),
    ('extraTress', extraTrees),
    ('randomforest', randomForest),
    ('histGradientBoosting', histGradientBoosting),
    ('XGB', XGB)
], n_jobs=-1,final_estimator=RandomForestClassifier(random_state=10, class_weight={0: 1, 1: 2, 2: 3}) )



In [141]:
model.fit(X_train, y_train)

In [142]:
print("Train Score: ",model.score(X_train, y_train))

Train Score:  0.9807627510441399


In [143]:
print("Test Score:",model.score(X_test, y_test))

Test Score: 0.8672621846105626


In [144]:
y_pred = model.predict(X_test)

In [145]:
acc = accuracy_score(y_test, y_pred)
acc

0.8672621846105626

In [146]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           1       0.91      0.85      0.88     11444
           2       0.78      0.86      0.81      9631
           3       0.91      0.90      0.90     10830

    accuracy                           0.87     31905
   macro avg       0.87      0.87      0.87     31905
weighted avg       0.87      0.87      0.87     31905

