<a href="https://colab.research.google.com/github/Roja0230/MachineLearning-CodSoft/blob/main/Credit_Card_Fraud_Detection_ipynd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install & download dataset from Kaggle
# ======================
!pip install kagglehub --upgrade
import kagglehub
import pandas as pd




In [None]:
# Download dataset
# kagglehub returns the local directory that holds the dataset files.
path = kagglehub.dataset_download("kartik2112/fraud-detection")
print("Dataset path:", path)

# Load CSV (choose smaller for speed in Colab)
df = pd.read_csv(f"{path}/fraudTest.csv")
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() (that appended a stray "None" line to the output).
df.info()

Dataset path: /kaggle/input/fraud-detection
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F

In [None]:
# 2. Data preparation
# ======================
# Keep only numeric columns for ML. The filtered frame is built under a local
# name and assigned back to `df` once, at the end, so this cell can be re-run.
numeric_df = df.select_dtypes(include=['number'])

# Check target variable. After a previous run of this cell the target is
# already called 'Class'; accept either name instead of raising KeyError.
target_col = 'is_fraud' if 'is_fraud' in numeric_df.columns else 'Class'
print(numeric_df[target_col].value_counts())

# Rename target for consistency (a no-op when it is already 'Class').
# rename() returns a new frame, so no in-place mutation is needed.
df = numeric_df.rename(columns={'is_fraud': 'Class'})

# NOTE(review): the numeric columns kept here include 'Unnamed: 0' and 'cc_num'
# (both visible in the df.head() output). They look like a CSV row index and a
# card identifier rather than predictive features; consider dropping them. TODO confirm.


is_fraud
0    553574
1      2145
Name: count, dtype: int64


In [None]:
# 3. Handle imbalance (undersample for speed)
# ======================
# Split the rows by label, then draw as many legitimate rows as there are
# fraudulent ones so both classes end up the same size.
class_labels = df['Class']
fraud = df.loc[class_labels.eq(1)]
legit = df.loc[class_labels.eq(0)].sample(n=fraud.shape[0], random_state=42)

# Stack the two groups and shuffle the rows (frac=1 keeps every row).
# NOTE(review): balancing happens before the train/test split below, so the
# held-out set is balanced as well and does not reflect the original class ratio.
df_balanced = pd.concat([fraud, legit]).sample(frac=1, random_state=42)

# Target vector and feature matrix.
y = df_balanced['Class']
X = df_balanced.drop(columns='Class')


In [None]:
# 4. Train/Test split
# ======================
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; stratifying on y keeps the class proportions
# the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# 5. Train models
# ======================
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

models = {
    # Logistic regression is sensitive to feature scale; the recorded output of
    # the unscaled model shows it predicting a single class (accuracy 0.5), so
    # standardise the features inside a pipeline that is fit on the training
    # data only.
    "Logistic Regression": make_pipeline(
        StandardScaler(), LogisticRegression(max_iter=2000)
    ),
    # Fixed seeds so a Restart & Run All reproduces the same trees and scores.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

best_model = None
best_name = None
# Start below any achievable score so the first fitted model is always kept.
best_auc = float("-inf")

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # ROC-AUC must be computed from continuous scores. With hard 0/1 labels it
    # collapses to balanced accuracy, which is why it used to equal the
    # accuracy line exactly. Column 1 is the probability of class 1.
    fraud_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, fraud_scores)
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, preds))
    print("ROC-AUC:", auc)
    print(classification_report(y_test, preds, zero_division=0))

    if auc > best_auc:
        best_auc = auc
        best_model = model
        best_name = name

# Report the display name: the class name of a pipeline would just be "Pipeline".
print(f"\n✅ Best model: {best_name} with ROC-AUC {best_auc:.4f}")



=== Logistic Regression ===
Accuracy: 0.5
ROC-AUC: 0.5
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       429
           1       0.00      0.00      0.00       429

    accuracy                           0.50       858
   macro avg       0.25      0.50      0.33       858
weighted avg       0.25      0.50      0.33       858


=== Decision Tree ===
Accuracy: 0.8939393939393939
ROC-AUC: 0.8939393939393939
              precision    recall  f1-score   support

           0       0.90      0.89      0.89       429
           1       0.89      0.90      0.89       429

    accuracy                           0.89       858
   macro avg       0.89      0.89      0.89       858
weighted avg       0.89      0.89      0.89       858


=== Random Forest ===
Accuracy: 0.9324009324009324
ROC-AUC: 0.9324009324009324
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       429
           1       0.

In [None]:
# 6. Save model
# ======================
import joblib

MODEL_PATH = "credit_fraud_model.joblib"

# Persist the selected model to the working directory.
joblib.dump(best_model, MODEL_PATH)

# google.colab is only importable inside Colab. Outside Colab the file has
# already been written to disk, so skip the browser download instead of
# failing the cell with ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"Model saved to {MODEL_PATH} (browser download skipped: not running in Colab)")
else:
    files.download(MODEL_PATH)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 7. Predict a new transaction
# ======================
def predict_transaction(transaction_data, model=None, feature_names=None):
    """Classify a single transaction (0 = legit, 1 = fraud).

    Parameters
    ----------
    transaction_data : sequence or dict
        Feature values for one transaction. A sequence must list the values in
        the order of ``feature_names``; a dict is matched to ``feature_names``
        by key.
    model : object with a ``predict`` method, optional
        Fitted classifier to use. Defaults to the notebook-level ``best_model``.
    feature_names : iterable of str, optional
        Column names the model was trained on. Defaults to ``X.columns``.

    Returns
    -------
    The first (and only) element of ``model.predict`` for the one-row frame.
    """
    # Fall back to the notebook globals only when the caller did not supply
    # explicit values, so existing one-argument calls keep working.
    if model is None:
        model = best_model
    if feature_names is None:
        feature_names = X.columns

    # Build a one-row frame with named columns for the model.
    transaction_df = pd.DataFrame([transaction_data], columns=list(feature_names))
    return model.predict(transaction_df)[0]

# Example prediction (using first row from test set)
# Take the first held-out row, flatten it to a plain list of feature values,
# and run it through the prediction helper.
first_test_row = X_test.iloc[0]
sample_data = list(first_test_row)
example_prediction = predict_transaction(sample_data)
print("Prediction (0=legit, 1=fraud):", example_prediction)


Prediction (0=legit, 1=fraud): 0
