## Introduction

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


## Loading the Dataset

In [None]:
# Read the training data from 'train.csv' (path is relative to the working directory).
df = pd.read_csv('train.csv')
# Bare expression on the last line so the notebook renders the DataFrame.
df

## Basic Checks

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Print column dtypes, non-null counts and memory usage.
df.info()

## Checking Target Distribution

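We check how many customers are labeled as 0 (no transaction) and 1 (will make a transaction). The class proportions are computed with `value_counts` further below, after the descriptive statistics.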


## Descriptive Statistics

Displays basic summary stats for each feature to understand the data range and distribution.


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Share of each target class; normalize=True returns fractions instead of raw counts.
df['target'].value_counts(normalize=True)

In [None]:
# Remove the 'ID_code' column (presumably a row identifier, not a feature) so it
# is not fed to the models. errors='ignore' keeps the cell safe to re-run:
# without it, a second execution raises KeyError because the column is
# already gone.
df.drop(columns=['ID_code'], inplace=True, errors='ignore')

In [None]:
# Display the DataFrame again to inspect it after the column drop.
df

## Data Preprocessing

In [None]:
# Label vector: the 'target' column.
y = df['target']
# Feature matrix: every remaining column.
X = df.drop(columns=['target'])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


## Preprocessing: Feature Scaling

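StandardScaler standardizes each feature (zero mean, unit variance) before model training. It is fit on the training split only and then applied to both the training and test splits.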


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same fitted scaler to both splits so that no test-set
# statistics influence the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Checking class imbalance

In [None]:
# Bar chart of the number of rows in each target class.
sns.countplot(x=y).set_title("Target Class Distribution")
plt.show()


## Handling Class Imbalance with SMOTE

SMOTE is applied to the scaled training data only, so that both target classes have equal representation during training. The test set is left untouched.

In [None]:

from imblearn.over_sampling import SMOTE

# Oversample the scaled training data only; the test split is not resampled.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# Show the per-class row counts after resampling.
balanced_counts = pd.Series(y_train_bal).value_counts()
print("Balanced training class distribution:\n", balanced_counts)


## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Fit a logistic-regression classifier on the SMOTE-balanced training data.
log_reg = LogisticRegression(
    random_state=42,
    max_iter=100,
)
log_reg.fit(X_train_bal, y_train_bal)



In [None]:
# Making Predictions
# Predict class labels for the scaled held-out test features.

y_pred_lr = log_reg.predict(X_test_scaled)


In [None]:
# Report the logistic-regression scores on the held-out test set.

print("Logistic Regression Performance:")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred_lr))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_lr))


## Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Fit a 10-tree random forest on the SMOTE-balanced training data.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=10,
)
rf_model.fit(X_train_bal, y_train_bal)


In [None]:
# Making predictions
# Predict class labels for the scaled held-out test features.

y_pred_rf = rf_model.predict(X_test_scaled)


In [None]:
# Report the random-forest scores on the held-out test set.

print("Random Forest Performance:")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred_rf))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_rf))

In [None]:
!pip install xgboost lightgbm

## XGBoost Model

In [None]:
import xgboost as xgb

# Train a 50-round XGBoost classifier on the SMOTE-balanced training data.
xgb_model = xgb.XGBClassifier(
    n_estimators=50,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
xgb_model.fit(X_train_bal, y_train_bal)

# Predict class labels for the scaled held-out test features.
y_pred_xgb = xgb_model.predict(X_test_scaled)

print("XGBoost Performance:")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred_xgb))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_xgb))



## LightGBM Model

In [None]:
import lightgbm as lgb

# Train a 50-round LightGBM classifier on the SMOTE-balanced training data.
lgb_model = lgb.LGBMClassifier(n_estimators=50, random_state=42)
lgb_model.fit(X_train_bal, y_train_bal)

# Predict class labels for the scaled held-out test features.
y_pred_lgb = lgb_model.predict(X_test_scaled)

print("LightGBM Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_lgb))
print("Precision:", precision_score(y_test, y_pred_lgb))
print("Recall:", recall_score(y_test, y_pred_lgb))
print("F1 Score:", f1_score(y_test, y_pred_lgb))
# Fixed: this previously printed the confusion matrix of the XGBoost
# predictions (y_pred_xgb) under the LightGBM heading.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_lgb))


## Model Performance Comparison

We compare all models using accuracy, precision, recall, and F1-score to select the best one.


In [None]:
# Build a comparison table with one row per model and one column per metric,
# all scored against the same held-out test labels.
preds_by_model = {
    'Logistic Regression': y_pred_lr,
    'Random Forest': y_pred_rf,
    'XGBoost': y_pred_xgb,
    'LightGBM': y_pred_lgb,
}
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

model_results = pd.DataFrame({
    'Model': list(preds_by_model),
    **{
        metric_name: [scorer(y_test, pred) for pred in preds_by_model.values()]
        for metric_name, scorer in scorers.items()
    },
})

# Rank the models from best to worst F1 score and renumber the rows from 0.
model_results = (
    model_results
    .sort_values(by='F1 Score', ascending=False)
    .reset_index(drop=True)
)
model_results


## Selecting the best model

In [None]:
# Based on F1 score and recall.
# NOTE: the choice is hard-coded here, not derived from model_results;
# update this line by hand if the comparison table changes.
best_model = log_reg

## Final model evaluation

In [None]:
#  using roc(receiver operating characteristic) curve and auc(area under curve) score

from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Predicting probabilities
# predict_proba returns one column per class; column index 1 is kept below.

y_probs = best_model.predict_proba(X_test_scaled)[:, 1]  # probability for class 1


## ROC Curve and AUC Score

The best-performing model is evaluated using ROC curve and AUC to measure classification performance.


In [None]:
# Area under the ROC curve, computed from the predicted class-1 probabilities.
auc_score = roc_auc_score(y_test, y_probs)

# False-positive / true-positive rates at each decision threshold, for plotting.
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

print("AUC Score:", auc_score)


In [None]:
# Draw the ROC curve of the selected model with the AUC shown in the legend.

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f"AUC = {auc_score:.3f}")
# Dashed diagonal: the line where true-positive rate equals false-positive rate.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_title("ROC Curve - Logistic Regression")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid(True)
plt.show()


## Challenges Report

### Challenges Faced During the Project

- The dataset had anonymized features, so meaningful EDA was limited.
- The target classes were imbalanced (fewer 1s than 0s).
- Model training was slow (especially for Random Forest and XGBoost).
- Choosing the best model was tricky due to metric trade-offs (accuracy vs. recall vs. F1).

### Techniques Used to Solve Challenges

- Kept EDA minimal, as allowed by the project guidelines, and focused on modeling.
- Used SMOTE to balance class distribution.
- Reduced n_estimators in models to speed up training.
- Used F1-score and recall instead of just accuracy to choose the best model.


## Final Summary and Conclusion

### Summary and Conclusion

This project aimed to build a model that predicts whether a customer will make a transaction in the future. The dataset contained 200 anonymized features and a binary target column. Due to the lack of feature names, we followed the project guideline to keep EDA minimal and focused on modeling.

We trained and evaluated four models: Logistic Regression, Random Forest, XGBoost, and LightGBM. After comparing their performance, we selected Logistic Regression as the final model based on its high recall and F1-score, which are important for identifying potential customers who will make a transaction.
                                                                                                                                                                                                                                                  
The final model was evaluated using the ROC curve and AUC score, confirming its good classification ability. Overall, the project successfully met its objectives, and the selected model can help in identifying target customers for future campaigns.


### Possible Future Improvements:
- Using domain-specific data with named features for deeper analysis.
- Trying advanced techniques like feature selection or ensemble stacking.
- Deploying the model into a live environment for real-time predictions.
                                             