In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np  # Needed for variance ratio calculations

# Load the dataset
df = pd.read_csv("Fraud_Detection_Dataset1.csv")

# Filling missing values with the mean value of their respective columns
df.fillna(df.mean(), inplace=True)

# Checking for any remaining missing values (optional)
if df.isnull().values.any():
    print("Dataset has missing values. Consider alternative handling methods.")

# Normalizing the dataset features
features = df.columns.tolist()
features.remove('Response')  # Assuming 'Response' is the target variable
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Splitting the dataset
X = df[features]
y = df['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Address multicollinearity with PCA
pca = PCA()  # Initialize PCA without specifying components initially
pca.fit(X_train)  # Fit PCA on the training data

# Analyze variance explained by components (optional)
variance_ratio = pca.explained_variance_ratio_

# Choose a threshold for explained variance (e.g., 0.9)
threshold = 0.9  # Retain components explaining 90% of variance
n_components = np.where(variance_ratio.cumsum() >= threshold)[0][0] + 1

# Reduce dimensionality based on the threshold
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Filter out rows with target variable value 0.11603074 (optional, adjust as needed)
filtered_indices_train = y_train != 0.11603074
X_train_filtered = X_train_pca[filtered_indices_train]
y_train_filtered = y_train[filtered_indices_train]

filtered_indices_test = y_test != 0.11603074
X_test_filtered = X_test_pca[filtered_indices_test]
y_test_filtered = y_test[filtered_indices_test]

# Round the target variable to the nearest integer
y_train_rounded = y_train_filtered.round().astype(int)
y_test_rounded = y_test_filtered.round().astype(int)

In [None]:
# Instantiate and train the SVM model
best_model = SVC(kernel='rbf')  # Use radial basis function (RBF) kernel for non-linear classification
best_model.fit(X_train_filtered, y_train_rounded)

In [3]:
# Predictions on filtered test set
predictions = best_model.predict(X_test_filtered)

In [4]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test_rounded, predictions))
print("\nClassification Report:")
print(classification_report(y_test_rounded, predictions))

Confusion Matrix:
[[15281   136]
 [ 1447   483]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     15417
           1       0.78      0.25      0.38      1930

    accuracy                           0.91     17347
   macro avg       0.85      0.62      0.66     17347
weighted avg       0.90      0.91      0.89     17347



In [5]:
testdata= pd.read_csv("Fraud_Detection_Test_nik.csv")

In [6]:
# Load test data
testdata = pd.read_csv("Fraud_Detection_Test_nik.csv")

# Apply preprocessing and PCA to test data (assuming same preprocessing as training)
testdata.fillna(testdata.mean(), inplace=True)
testdata[features] = scaler.transform(testdata[features])  # Use fitted scaler
X_test_pca = pca.transform(testdata[features])

# Make predictions on test data
y_pred = best_model.predict(X_test_pca)

# Create a DataFrame with predictions
df_y = pd.DataFrame(y_pred, columns=['y_hat'])

# Display the first few rows of predictions
print(df_y.head())

   y_hat
0      0
1      0
2      0
3      0
4      0


In [7]:
df_y.to_csv('Team-8(2-svm).csv',index=False)