In [8]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Loading dataset
credit_card_data = pd.read_csv('/content/creditcard.csv')

# A bare expression in the middle of a cell (or anywhere in a script) is
# evaluated and then thrown away, so every inspection result below is printed
# explicitly. info() is the exception: it writes its report to stdout itself.
print(credit_card_data.head())
print(credit_card_data.tail())
credit_card_data.info()

# No. of missing values
print(credit_card_data.isnull().sum())

# Distribution of legit transactions and fraudulent transactions
print(credit_card_data['Class'].value_counts())

# Separating the data for analysis
# Rows with Class == 0 are kept as `legit`, rows with Class == 1 as `fraud`.
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]
print(legit.shape)
print(fraud.shape)

# Statistical measures of the data
print(legit.Amount.describe())
print(fraud.Amount.describe())

# Compare the values for both transactions
print(credit_card_data.groupby('Class').mean())

# Build a sample dataset containing a similar distribution of normal transactions and fraudulent
# Under-sample the legit rows down to the number of fraud rows so that the two
# classes are balanced regardless of how many fraud rows the file contains
# (a hard-coded sample size only balances one particular fraud count).
# min() guards against sample() raising if legit is ever the smaller group.
# random_state pins the draw so reruns use the same rows; 2 matches the seed
# already used for the train/test split in this file.
legit_sample = legit.sample(n=min(len(legit), len(fraud)), random_state=2)

# Concatenating the two dataframes
# axis=0 stacks the sampled legit rows on top of the fraud rows.
new_dataset = pd.concat([legit_sample, fraud], axis=0)

# Printed explicitly: bare expressions here would be evaluated and discarded.
print(new_dataset.head())
print(new_dataset.tail())
print(new_dataset['Class'].value_counts())
print(new_dataset.groupby("Class").mean())

# Splitting the data into Features and Target
# `columns=` already names the axis, so no separate axis argument is needed.
x = new_dataset.drop(columns='Class')
y = new_dataset['Class']
print(y)

# Split the data into train and test data
# 80/20 split. stratify=y keeps the class proportions the same in the train
# and test parts, which matters on a sample this small; random_state makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2
)
print(x.shape, x_train.shape, x_test.shape)

# Model Training with Random Forest Classifier
# random_state fixes the forest's internal randomness so the reported
# accuracies are repeatable; 2 matches the seed used for the data split.
rf_model = RandomForestClassifier(random_state=2)
rf_model.fit(x_train, y_train)

# Predictions on training data
# accuracy_score is documented as (y_true, y_pred): true labels go first.
x_train_prediction_rf = rf_model.predict(x_train)
training_data_accuracy_rf = accuracy_score(y_train, x_train_prediction_rf)
print('Accuracy on training data with Random Forest:', training_data_accuracy_rf)

# Predictions on test data
# Accuracy on the held-out rows is the figure to compare against the training
# accuracy above when judging overfitting.
x_test_prediction_rf = rf_model.predict(x_test)
test_data_accuracy_rf = accuracy_score(y_test, x_test_prediction_rf)
print("Accuracy on Test data with Random Forest:", test_data_accuracy_rf)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28