# FEATURE ENGINEERING

In [3]:
# Reload the dataset in 50,000-row chunks so per-row feature engineering is
# done piecewise instead of on one large frame.
CreditCardData = pd.read_csv('../data/creditcard.csv', chunksize=50000)

X_chunks = []
y_chunks = []

for chunk in CreditCardData:
    # Hour-of-day bucket; treats 'Time' as a count of seconds.
    chunk['Hour'] = (chunk['Time'] // 3600) % 24
    # 'Amount' is deliberately kept raw here: standardising inside the loop
    # would give every chunk its own mean/std and make the column inconsistent.
    chunk = chunk.drop(columns=['Time'])

    X_chunks.append(chunk.drop(columns=['Class']))
    y_chunks.append(chunk['Class'])

X = pd.concat(X_chunks, ignore_index=True)
y = pd.concat(y_chunks, ignore_index=True)

# Split before scaling. The stratified split depends only on y and the seed,
# so the rows in each split are the same as when scaling was done first.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Fit the scaler on the training rows only so that no statistics from the
# held-out rows leak into the features, then reuse that single fitted scaler
# for every frame.
scaler = StandardScaler()
scaler.fit(X_train[['Amount']])


def _with_scaled_amount(frame):
    """Return a copy of *frame* with 'Amount' replaced by 'Amount_Scaled'.

    Uses the module-level ``scaler`` fitted on the training split. ``assign``
    and ``drop`` return new frames, so the split outputs are never modified
    in place.
    """
    scaled = scaler.transform(frame[['Amount']]).ravel()
    return frame.assign(Amount_Scaled=scaled).drop(columns=['Amount'])


X = _with_scaled_amount(X)
X_train = _with_scaled_amount(X_train)
X_test = _with_scaled_amount(X_test)

X_train.shape, X_test.shape, y_train.value_counts(), y_test.value_counts()

((199364, 30),
 (85443, 30),
 0    199020
 1       344
 Name: Class, dtype: int64,
 0    85295
 1      148
 Name: Class, dtype: int64)

# SVM and Random Forest Models

# Importing SMOTE for handling class imbalance

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE using the training split only; the test split is not
# passed to the resampler.
smote = SMOTE(random_state=42)
(X_train_smote, y_train_smote) = smote.fit_resample(
    X_train,
    y_train,
)

# Per-class row counts after resampling, displayed as the cell output.
balanced_class_distribution = y_train_smote.value_counts()
balanced_class_distribution

In [None]:

# Additional feature engineering examples
#
# An earlier cell rebinds CreditCardData to a chunked reader
# (pd.read_csv(..., chunksize=...)), which supports neither column assignment
# nor groupby. Materialise a full DataFrame in that case.
if not isinstance(CreditCardData, pd.DataFrame):
    CreditCardData = pd.read_csv('../data/creditcard.csv')

# Hour-of-day bucket; treats 'Time' as a count of seconds.
CreditCardData['Hour'] = (CreditCardData['Time'] // 3600) % 24

# NOTE(review): no other cell references a 'UserID' column and the feature
# count seen elsewhere suggests the CSV does not contain one — confirm against
# the data. The per-user features are only built when the column exists.
if 'UserID' in CreditCardData.columns:
    # Number of transactions per user, broadcast back onto every row.
    CreditCardData['Transaction_Frequency'] = CreditCardData.groupby('UserID')['Time'].transform('count')
    # Running total of 'Amount' per user, in current row order.
    CreditCardData['Cumulative_Amount'] = CreditCardData.groupby('UserID')['Amount'].transform('cumsum')
else:
    print("Skipping per-user features: no 'UserID' column in the dataset.")

# Scaling the Amount feature
# NOTE(review): this rebinds the module-level `scaler` and fits it on every
# row of the frame. For modelling, fit on the training split only and reuse
# that fitted scaler on held-out data.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
CreditCardData['Amount_Scaled'] = scaler.fit_transform(CreditCardData[['Amount']])
