In [1]:
import pandas as pd
from sklearn.datasets import make_classification

# Build a synthetic binary-classification table laid out like a card-fraud
# dataset: 30 numeric feature columns (V1..V28, Amount, Time) plus a Class label,
# then persist it to CSV and show a quick summary.
#
# NOTE(review): with weights=[0.2, 0.8] the label 1 ("fraud") is the ~80% majority.
# Real fraud data is normally heavily skewed the other way — confirm that this
# balance is what downstream cells expect.
X, y = make_classification(
    random_state=42,       # fixed seed so the cell is reproducible on re-run
    n_samples=10000,       # number of rows
    n_classes=2,           # binary target
    weights=[0.2, 0.8],    # requested share of class 0 / class 1
    class_sep=1.0,         # class separability (larger = easier problem)
    n_features=30,         # must equal the number of column names built below
    n_informative=10,      # features that carry signal
    n_redundant=10,        # linear combinations of the informative features
    n_repeated=0,          # no duplicated features; the remaining 10 are noise
)

# Column labels: V1..V28 followed by the two named columns (30 in total).
columns = [*(f"V{i}" for i in range(1, 29)), "Amount", "Time"]

# Wrap the feature matrix and append the target as the last column.
df = pd.DataFrame(X, columns=columns)
df.insert(len(df.columns), "Class", y)

# Persist without the row index so the file holds only features and the label.
df.to_csv("synthetic_fraud_dataset.csv", index=False)

print("✅ Synthetic dataset created: synthetic_fraud_dataset.csv")
# Class balance in percent. It will not be exactly 20/80 — presumably because
# make_classification's default label noise (flip_y) reassigns a small fraction
# of labels; verify against the scikit-learn docs.
print(df["Class"].value_counts(normalize=True).mul(100))
df.head(n=5)


✅ Synthetic dataset created: synthetic_fraud_dataset.csv
Class
1    79.65
0    20.35
Name: proportion, dtype: float64


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Amount,Time,Class
0,-2.30521,4.347656,-0.32936,0.298234,-3.507178,0.497877,2.447967,-0.101366,-3.131417,0.250371,...,3.281737,0.479747,-2.766327,2.954446,0.923357,-0.969324,-3.334091,0.779586,-2.001245,0
1,-2.038082,-2.42995,-0.273349,-0.02841,-2.975891,-0.446027,1.947866,1.19573,2.874835,1.630704,...,-1.683151,1.898047,1.983256,-0.51577,-0.687491,0.2238,0.785463,-0.503743,-0.624541,0
2,-0.530558,-1.606448,1.502739,-1.697422,-0.59649,-0.888969,-1.027933,1.567307,-0.632665,-1.61367,...,1.595802,-3.502459,-2.328606,3.585416,0.619704,0.029836,0.660407,1.945396,-1.688003,1
3,-0.360259,2.6797,-1.746379,-1.019945,0.407306,-0.475029,-6.054911,1.31807,-1.706405,-0.651991,...,-7.516264,-2.076363,0.965986,-1.256344,-1.802935,-0.325044,3.094407,2.109252,2.79897,0
4,-2.221488,-1.123726,0.450599,0.125272,-0.920813,-1.83977,-1.405678,1.806567,-0.224325,0.862431,...,-0.923898,3.685151,0.916399,-1.709956,-1.131552,0.478284,-2.332403,2.632292,-1.540528,0
