In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("UCI_Credit_Card.csv")

# Let's display the data
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


# 2. Data Cleaning

In [2]:
# Check for missing values
print(df.isna().sum())

# Handle missing values (if any)
# For now, let's drop rows with missing values if there are any (can be modified based on strategy)
df_cleaned = df.dropna()

# Display the cleaned data
df_cleaned.info()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------          

# 2. Feature Selection
- You can use `SelectKBest` from `sklearn.feature_selection` to choose the best features based on some statistical test, like `f_classif`.

In [3]:
from sklearn.feature_selection import SelectKBest, f_classif

# Separating the target variable (default payment) and features
X = df_cleaned.drop(columns=['default.payment.next.month', 'ID'])
y = df_cleaned['default.payment.next.month']

# Applying SelectKBest to pick the top 10 features
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)

# Display the selected features
selected_features = selector.get_support(indices=True)
print(X.columns[selected_features])

Index(['LIMIT_BAL', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
       'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT4'],
      dtype='object')


# 3. Scaling
Since the features have different scales, it’s important to scale them using `StandardScaler`

In [4]:
from sklearn.preprocessing import StandardScaler

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

# Display the scaled data
print(X_scaled[:5])

[[-1.13672015  1.79456386  1.78234817 -0.69666346 -0.66659873 -1.53004603
  -1.48604076 -0.34194162 -0.22708564 -0.30806256]
 [-0.3659805  -0.87499115  1.78234817  0.1388648   0.18874609  0.23491652
   1.99231551 -0.34194162 -0.21358766 -0.24422965]
 [-0.59720239  0.01486052  0.1117361   0.1388648   0.18874609  0.23491652
   0.25313738 -0.25029158 -0.19188673 -0.24422965]
 [-0.90549825  0.01486052  0.1117361   0.1388648   0.18874609  0.23491652
   0.25313738 -0.22119058 -0.16936116 -0.23784635]
 [-0.90549825 -0.87499115  0.1117361  -0.69666346  0.18874609  0.23491652
   0.25313738 -0.22119058  1.33503416  0.26643369]]


# . Model Building
You can try different models like Decision Tree, KNN, SVM, and Logistic Regression.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Assuming X_scaled and y are already defined

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Dictionary to store results of different models
results = {}

# List of models to try
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC()
}

# Function to evaluate each model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Return the metrics as a dictionary
    return {
        "Precision": precision,
        "Recall": recall,
        "Accuracy": accuracy,
        "F1-Score": f1
    }

# Evaluate each model
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    results[model_name] = evaluate_model(model, X_train, X_test, y_train, y_test)

# Display results for all models
for model_name, metrics in results.items():
    print(f"\n{model_name} Results:")
    for metric_name, score in metrics.items():
        print(f"{metric_name}: {score:.4f}")

# Find the best model based on F1-Score
best_model = max(results, key=lambda x: results[x]['F1-Score'])
print(f"\nBest model based on F1-Score: {best_model}")

Evaluating Logistic Regression...
Evaluating Decision Tree...
Evaluating KNN...
Evaluating SVM...

Logistic Regression Results:
Precision: 0.6901
Recall: 0.2204
Accuracy: 0.8087
F1-Score: 0.3341

Decision Tree Results:
Precision: 0.3742
Recall: 0.3750
Accuracy: 0.7273
F1-Score: 0.3746

KNN Results:
Precision: 0.5569
Recall: 0.3571
Accuracy: 0.7981
F1-Score: 0.4352

SVM Results:
Precision: 0.6705
Recall: 0.3531
Accuracy: 0.8213
F1-Score: 0.4626

Best model based on F1-Score: SVM
