In [1]:
import pandas as pd
import numpy as np 

In [10]:
# Load the raw dataset from CSV. The path is relative, so it is resolved
# against the notebook's current working directory.
data = pd.read_csv("Dev_data_to_be_shared 3/Dev_data_to_be_shared.csv")

In [11]:
# Columns whose share of missing values exceeds this percentage are dropped.
MISSING_THRESHOLD_PCT = 30

# Calculate the percentage of missing values for each column
percent = data.isna().sum() / len(data) * 100

# Identify columns with more than MISSING_THRESHOLD_PCT percent missing values
columns_to_drop = percent[percent > MISSING_THRESHOLD_PCT].index

# Drop those columns from the DataFrame
data = data.drop(columns=columns_to_drop)

In [12]:
def _columns_with_prefix(prefix, excluding=None):
    """Return the columns of `data` whose name starts with `prefix`.

    Names that also start with `excluding` (when given) are left out.
    """
    return [
        name
        for name in data.columns
        if name.startswith(prefix) and not (excluding and name.startswith(excluding))
    ]


# Split the columns into groups based on their name prefix.
transaction_columns = _columns_with_prefix('transaction_attribute_')
bureau_columns = _columns_with_prefix('bureau_', excluding='bureau_enquiry_')
bureau_enquiry_columns = _columns_with_prefix('bureau_enquiry_')
onus_attribute_columns = _columns_with_prefix('onus_attribute_')

# Report the size of every group.
for label, group in (
    ("transaction columns", transaction_columns),
    ("bureau columns", bureau_columns),
    ("bureau_enquiry_columns columns", bureau_enquiry_columns),
    ("onus_attribute_columns columns", onus_attribute_columns),
):
    print(f"Number of {label}: {len(group)}")

Number of transaction columns: 664
Number of bureau columns: 436
Number of bureau_enquiry_columns columns: 50
Number of onus_attribute_columns columns: 39


In [13]:
from sklearn.impute import SimpleImputer
# Median imputation for the bureau columns: learn the per-column medians,
# then write the filled values back into the frame.
bureau_imputer = SimpleImputer(strategy='median')
bureau_imputer.fit(data[bureau_columns])
data[bureau_columns] = bureau_imputer.transform(data[bureau_columns])

In [14]:
# Replace missing values in the bureau-enquiry columns with 0.
data[bureau_enquiry_columns] = data[bureau_enquiry_columns].fillna(0)

In [15]:
# Median imputation for the onus-attribute columns: learn the per-column
# medians, then write the filled values back into the frame.
onus_imputer = SimpleImputer(strategy='median')
onus_imputer.fit(data[onus_attribute_columns])
data[onus_attribute_columns] = onus_imputer.transform(data[onus_attribute_columns])


In [16]:
# Median imputation for the transaction columns: learn the per-column
# medians, then write the filled values back into the frame.
transaction_imputer = SimpleImputer(strategy='median')
transaction_imputer.fit(data[transaction_columns])
data[transaction_columns] = transaction_imputer.transform(data[transaction_columns])

In [17]:
# Write the imputed frame to CSV without the row index.
data.to_csv('pre_procced_train.csv', index = False)

### The same preprocessing steps were applied to the validation set (not shown here)


In [None]:
# `df` is a second name for the same DataFrame object as `data`; no copy is made.
df = data

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Variance cut-off for feature selection (e.g., 0.01): once the selector is
# fitted, features whose variance is below this value are removed.
threshold = 0.01
selector = VarianceThreshold(threshold=threshold)

In [None]:
# Feature matrix: every column except the 'bad_flag' target.
X = df.drop(columns='bad_flag')

# Learn the per-feature variances, then keep only the columns that pass.
selector.fit(X)
X_selected = selector.transform(X)

# Names of the surviving features, read from the selector's boolean support mask.
selected_features = X.columns[selector.get_support()]


In [None]:
# Rebuild a labelled DataFrame from the selector's array output. Reusing df's
# index keeps each row paired with its own label: the target assignment below
# aligns on index, so a fresh 0..n-1 index would mis-assign or NaN-fill the
# target whenever df's index is not the default range.
df_selected = pd.DataFrame(X_selected, columns=selected_features, index=df.index)
df_selected['bad_flag'] = df['bad_flag']  # Add back the target column

In [None]:
# Print a summary of the selected-feature frame (columns, dtypes, non-null counts).
df_selected.info()

In [None]:
# Pairwise correlations between the features only; the target is left out.
feature_only = df_selected.drop(columns='bad_flag')
correlation_matrix = feature_only.corr()

In [None]:
# Set the correlation threshold
correlation_threshold = 0.9

# Find highly correlated features without a Python double loop.
# Only the strictly lower triangle is inspected (column index j < row index i),
# so for every correlated pair the later of the two features is flagged and
# the earlier one is kept. NaN correlations compare as False and are ignored.
abs_corr = correlation_matrix.abs().to_numpy()
lower_triangle = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)
exceeds_threshold = (abs_corr > correlation_threshold) & lower_triangle

# A feature is dropped if it is highly correlated with at least one earlier feature.
correlated_features = set(correlation_matrix.columns[exceeds_threshold.any(axis=1)])

In [None]:
# Remove every feature flagged as highly correlated; the remaining columns
# keep their original order.
redundant_columns = list(correlated_features)
df_reduced = df_selected.drop(columns=redundant_columns)

In [None]:
# Print a summary of the reduced frame (columns, dtypes, non-null counts).
df_reduced.info()

In [None]:
# Save the reduced frame without the row index, matching the other CSV exports
# in this notebook. Writing the index would add an unnamed extra column that
# shows up as a spurious feature when the file is read back.
df_reduced.to_csv('df_reduced.csv', index=False)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Target vector first, then the feature matrix (all remaining columns).
y = df_reduced['bad_flag']
X = df_reduced.drop(columns='bad_flag')

In [None]:
# Standardise the features: learn mean and standard deviation per column,
# then apply the transformation to the same data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Apply PCA to reduce to 10 components.
# random_state makes the projection reproducible across runs: it is used by
# scikit-learn's 'arpack' and 'randomized' SVD solvers, which the default
# svd_solver='auto' may select depending on the data shape.
pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(X_scaled)


In [None]:
# Convert the PCA-transformed data to a DataFrame.
# Column names follow the actual number of components in X_pca, so changing
# n_components does not require editing this cell. The frame reuses y's index
# because the target assignment below aligns on index; a fresh 0..n-1 index
# would mis-pair rows if y's index were anything else.
pca_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
df_pca = pd.DataFrame(X_pca, columns=pca_columns, index=y.index)

# Add the target column back
df_pca['bad_flag'] = y

# Save the reduced dataset
df_pca.to_csv('pca_reduced_features.csv', index=False)

In [None]:
# Share of total variance captured by each component, and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

for caption, values in (
    ("Explained Variance by each component:", explained_variance),
    ("Cumulative Variance:", cumulative_variance),
):
    print(caption, values)

In [None]:
# Rebind `data` to the reduced frame. This is the same object as df_reduced,
# not a copy, so any in-place change made through `data` also changes df_reduced.
data = df_reduced

In [None]:
# Load the validation CSV (relative path, resolved against the working directory).
# NOTE(review): this file is not written anywhere in the visible notebook — confirm where it is produced.
val = pd.read_csv('pre_procced_val.csv')

In [None]:
# Preview the first rows of the validation frame.
val.head()

In [None]:
# Feature-only version of the reduced frame. drop() returns a new DataFrame,
# so df_reduced keeps its 'bad_flag' column and this cell can be re-run
# without raising KeyError.
data = df_reduced.drop(columns=['bad_flag'])

In [None]:
# Select the training feature columns from the validation frame in a single
# operation, in the same order as in `data`. This avoids growing a DataFrame
# one column at a time. A KeyError is raised if `val` lacks any of the columns.
# .copy() makes feature_val independent of `val`.
feature_val = val[list(data.columns)].copy()


In [None]:
# Display the validation feature frame as the cell output.
feature_val

In [None]:
# Compare the two column sets, ignoring order, and report the outcome.
same_columns = set(data.columns) == set(feature_val.columns)
print(
    "The column names are the same in both datasets (order doesn't matter)."
    if same_columns
    else "The column names are different."
)


In [None]:
# Write the validation feature frame to CSV without the row index.
feature_val.to_csv('feature_val.csv', index = False)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Scale the validation features with the scaler that was already fitted on the
# training features earlier in the notebook. Fitting a new scaler here would
# standardise the validation data with its own mean and variance, putting it
# on a different scale from the data used for training.
X_scaled = scaler.transform(feature_val)

In [None]:
# Project the validation data onto the principal components learned from the
# training data earlier in the notebook. Fitting a new PCA here would produce
# different axes, so PC1..PC10 of the validation set would not mean the same
# thing as the training components.
X_pca = pca.transform(X_scaled)

In [None]:
# Name the columns PC1..PCk, where k is the number of components actually
# present in X_pca, so the cell keeps working if the component count changes.
pca_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
df_pca = pd.DataFrame(X_pca, columns=pca_columns)

In [None]:
# Print a summary of the PCA frame (columns, dtypes, non-null counts).
df_pca.info()

In [None]:
# Write the validation principal components to CSV without the row index.
df_pca.to_csv('val_pca.csv', index = False )

In [None]:
# Required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load the dataset.
# file_path names the file that is actually read, so editing it is enough to
# point this cell at a different dataset.
file_path = 'pca_reduced_features.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Separate features and target
X = data.drop(columns=['bad_flag'])  # Replace 'bad_flag' with your target column name
y = data['bad_flag']

# Debugging: Check original shapes
print("Original X shape:", X.shape)
print("Original y shape:", y.shape)

# Stratified 70/30 split so both partitions keep the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Debugging: report the shapes of both partitions.
print(f"Train shapes (X, y): {X_train.shape} {y_train.shape}")
print(f"Test shapes (X, y): {X_test.shape} {y_test.shape}")

# Standardise using statistics learned from the training partition only,
# then apply the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hold out a stratified validation subset BEFORE oversampling.
# Keras' validation_split takes the last fraction of the arrays it is given,
# and SMOTE places its synthetic minority rows after the original rows, so
# validation_split on the resampled data would validate mostly on synthetic
# samples. An explicit hold-out keeps validation on real, untouched data.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Handle imbalance using SMOTE (applied to the fitting subset only)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_fit, y_fit)

# Debugging: Check resampled data shapes
print("After SMOTE (X, y):", X_train_resampled.shape, y_train_resampled.shape)
# Build the neural network model
model = Sequential([
    Dense(64, input_dim=X_train_resampled.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; the 'val_*' entries in history are computed on the hold-out set
history = model.fit(
    X_train_resampled, y_train_resampled,
    epochs=20,  # Adjust as needed
    batch_size=32,  # Adjust based on data size
    validation_data=(X_val, y_val.to_numpy()),
    verbose=1
)

# Score the test set: flatten the model output to a 1-D vector of probabilities,
# then threshold at 0.5 to get hard class labels.
raw_output = model.predict(X_test_scaled)
y_pred_prob = raw_output.ravel()
y_pred = (y_pred_prob > 0.5).astype(int)

# Metrics: threshold-free AUC first, then the per-class report.
auc_roc = roc_auc_score(y_test, y_pred_prob)
print("\nAUC-ROC:", auc_roc)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Optional: plot the training history as two side-by-side panels.
import matplotlib.pyplot as plt

fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: accuracy per epoch
accuracy_ax.plot(history.history['accuracy'], label='Train Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

# Right panel: loss per epoch
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

fig.tight_layout()
plt.show()



In [None]:
from sklearn.metrics import fbeta_score

# Calculate the F2 score: F-beta with beta=2, which weights recall more
# heavily than precision (beta=1 would be the plain F1 score).
f2_score = fbeta_score(y_test, y_pred, beta=2)
print("Exact F2 Score:", f2_score)