# Importing Modules

In [None]:
# Import the modules
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample


In [None]:
# Read the fraud dataset from its CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='fraud_data.csv')

In [None]:
# Inspect the dimensions of the freshly loaded dataset as a (rows, columns) tuple

df.shape

In [None]:
# Print a concise summary of the DataFrame, including each column's data type

df.info()

In [None]:
# Preview the first 5 rows of the dataset (head() returns 5 rows by default)

df.head()

In [None]:
# The second entry of the DataFrame's shape is its column count
num_columns = df.shape[1]

# Report the column count
print(f"Number of columns: {num_columns}")

In [None]:
# The first entry of the DataFrame's shape is its row count
num_rows = df.shape[0]

# Report the row count
print(f"Number of rows: {num_rows}")

In [None]:
# Count missing values per column and keep only the columns that have any
missing_per_column = df.isna().sum()
nan_counts = missing_per_column[missing_per_column > 0]
nan_columns = nan_counts.index

# Report every affected column together with its NaN count
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} NaN values.")

# Data Preparation

In [None]:
# Drop sparse columns: every column whose number of missing values reaches
# NAN_DROP_THRESHOLD is removed from the DataFrame.
NAN_DROP_THRESHOLD = 30000

# Get the columns with at least NAN_DROP_THRESHOLD NaN values
nan_columns = df.columns[df.isna().sum() >= NAN_DROP_THRESHOLD]

# Drop exactly those columns, modifying df in place
df.drop(columns=nan_columns, inplace=True)

# Display the dimensions of the updated DataFrame
df.shape

In [None]:
# Preview the first 5 rows again, now that the sparse columns have been dropped

df.head()

In [None]:
# # Get the columns with more than 30,000 zeroes
# zero_columns = df.columns[(df == 0).sum() > 30000]

# # Drop the columns with more than 30,000 zeroes
# df.drop(zero_columns, axis=1, inplace=True)

# # Print the updated DataFrame
# df.shape

In [None]:
# Recount missing values per column after the sparse columns were dropped,
# keeping only the columns that still contain NaN
missing_per_column = df.isna().sum()
nan_counts = missing_per_column[missing_per_column > 0]
nan_columns = nan_counts.index

# Report every affected column together with its NaN count
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} NaN values.")

In [None]:
# Select the names of all numeric columns
numerical_columns = df.select_dtypes(include='number').columns

# Fill each numeric column's missing entries with that column's mean
numeric_part = df[numerical_columns]
column_means = numeric_part.mean()
df[numerical_columns] = numeric_part.fillna(column_means)


In [None]:
# After the mean imputation, find the columns that still contain NaN values
missing_per_column = df.isna().sum()
nan_counts = missing_per_column[missing_per_column > 0]
nan_columns = nan_counts.index

# Report every remaining column together with its NaN count
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} NaN values.")

In [None]:
# Collect the NaN count of every column that still has missing values
missing_per_column = df.isna().sum()
nan_counts = missing_per_column[missing_per_column > 0]
nan_columns = nan_counts.index

# Look up the data type of each of those columns
column_types = df[nan_columns].dtypes

# Report name, NaN count, and data type for each affected column
report_rows = zip(nan_columns, nan_counts, column_types)
for column, count, dtype in report_rows:
    print(f"Column '{column}' has {count} NaN values. Data type: {dtype}")

In [None]:
# Impute whatever NaN values remain with each column's most frequent value
# (presumably the non-numeric columns, since the numeric ones were mean-filled
# earlier in the notebook).
nan_columns = df.columns[df.isna().any()]

# Replace NaN values with the mode
for column in nan_columns:
    modes = df[column].mode()
    if modes.empty:
        # mode() ignores NaN, so an all-NaN column has no mode to fill with.
        print(f"Column '{column}' has no non-NaN values; skipping mode imputation.")
        continue
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df[column]: the in-place form operates on an intermediate object and
    # leaves df unchanged when pandas Copy-on-Write is active.
    df[column] = df[column].fillna(modes.iloc[0])



In [None]:
# After the mode imputation, find the columns that still contain NaN values
missing_per_column = df.isna().sum()
nan_counts = missing_per_column[missing_per_column > 0]
nan_columns = nan_counts.index

# Report every remaining column together with its NaN count
for column, count in nan_counts.items():
    print(f"Column '{column}' has {count} NaN values.")

In [None]:
# True when at least one cell anywhere in the DataFrame is still missing
has_nan = df.isna().values.any()

# Pick the matching message and print it
nan_message = (
    "There are still NaN values in the dataset."
    if has_nan
    else "There are no NaN values in the dataset."
)
print(nan_message)

In [None]:
# # Dropping Columns with all zeroes

# # Drop the specified columns
# columns_to_drop = ['column1', 'column2', 'column3']
# df = df.drop(columns_to_drop, axis=1)

# # Use the modified DataFrame for further analysis or modeling

# # Check if all entries in a specific column are zeroes
# column_name = 'V321'
# are_all_zeroes = (df[column_name] == 0).all()

# # Print the result
# if are_all_zeroes:
#     print("All entries in '{}' column are zeroes.".format(column_name))
# else:
#     print("Not all entries in '{}' column are zeroes.".format(column_name))

# # Check how many entries in a specific column are zeroes
# import pandas as pd

# # Read the dataset into a DataFrame
# df = pd.read_csv('your_dataset.csv')

# # Check how many entries in multiple columns are zeroes
# columns_to_check = ['V321', 'V320', 'V319']
# num_zero_entries = (df[columns_to_check] == 0).all(axis=1).sum()

# # Print the result
# print("Number of zero entries across the specified columns: ", num_zero_entries)

In [None]:
# Flag every row that repeats an earlier row
duplicates = df.duplicated()

# Pick the matching message and print it
duplicate_message = (
    "There are duplicates in the dataset."
    if duplicates.any()
    else "There are no duplicates in the dataset."
)
print(duplicate_message)

In [None]:
# Inspect the dataset's dimensions, as (rows, columns), after the cleaning steps
df.shape

# Data Visualization

In [None]:
# Preview the first 5 rows of the cleaned dataset
df.head()

In [None]:
# Preview the last 5 rows of the cleaned dataset
df.tail()

In [None]:
# if for in df == 1:
# print(element)

In [None]:
# Histogram distribution of isFraud in the full dataset
feature = df['isFraud']

# Create a histogram
plt.hist(feature, bins=10)  # Adjust the number of bins as needed

# Set labels and title; the x-axis is labeled with the plotted column so it
# matches the later histogram of the undersampled data
plt.xlabel('isFraud')
plt.ylabel('Frequency')
plt.title('Histogram of isFraud')

# Display the plot
plt.show()

In [None]:
# Pie chart of the (up to) ten most frequent ProductCD values

product_counts = df['ProductCD'].value_counts()
top10 = product_counts[:10]
plt.pie(top10.values, labels=top10.index, autopct="%1.1f%%")
plt.show()

In [None]:
# Pie chart of the (up to) ten most frequent card4 values

card_counts = df['card4'].value_counts()
top10 = card_counts[:10]
plt.pie(top10.values, labels=top10.index, autopct="%1.1f%%")
plt.show()

In [None]:
# Visualize how TransactionID is distributed across the card4 categories
feature1 = df['card4']
feature2 = df['TransactionID']

# Create a scatter plot using Seaborn
sns.scatterplot(x=feature1, y=feature2)

# Label each axis with the column it shows instead of a generic placeholder
plt.xlabel('card4')
plt.ylabel('TransactionID')
plt.title('Scatter Plot of card4 vs TransactionID')

# Display the plot
plt.show()

# Data Preprocessing

In [None]:
# Handle the class imbalance in isFraud by undersampling the majority class

# Split the rows by label: 1 is treated as the minority class, 0 as the majority
fraud_flag = df['isFraud']
minority_class = df[fraud_flag == 1]
majority_class = df[fraud_flag == 0]

# Draw, without replacement, as many majority rows as there are minority rows
minority_size = len(minority_class)
undersampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=minority_size,
    random_state=42,  # fixed seed for reproducibility
)

# Stack both classes, then shuffle the rows with a fixed seed
balanced_rows = pd.concat([undersampled_majority, minority_class])
undersampled_df = balanced_rows.sample(frac=1, random_state=42)

# undersampled_df now holds the balanced dataset

In [None]:
# Histogram distribution of isFraud after undersampling
feature = undersampled_df['isFraud']

# Draw the histogram (adjust the number of bins as needed)
plt.hist(feature, bins=10)

# Title and axis labels
plt.title('Histogram of isFraud')
plt.xlabel('isFraud')
plt.ylabel('Frequency')

# Render the figure
plt.show()

In [None]:
# Assign the features and the label to separate variables.
# Build them from the balanced, undersampled data: taking them from df would
# silently discard the class rebalancing done in the preceding cells.

X = undersampled_df.drop(columns="isFraud")  # Everything except the label column
y = undersampled_df.loc[:, "isFraud"]  # The label column
feature_names = X.columns  # Names of the raw feature columns (before any encoding)


In [None]:
# Encode the categorical variables with one-hot encoding
X = pd.get_dummies(X)

# One-hot encoding replaces each categorical column with one column per
# category, so refresh feature_names to match the columns the model will
# actually be trained on (the later plots index into it by feature position).
feature_names = X.columns

In [None]:
# Scale every feature column to the [0, 1] range with min-max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# fit_transform returns a bare NumPy array; wrap it back into a DataFrame with
# the original column names and row index so later cells can keep using
# DataFrame operations (X.corr(), X.iloc) and X stays aligned with y.
# NOTE(review): the scaler is fit on all rows before the train/test split that
# follows, so test-set value ranges influence the scaling; consider fitting on
# the training split only.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible

split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Model Building 

In [None]:
# Create and train a decision tree model.
# The split criterion is left at its default, which is squared error in every
# scikit-learn release: the explicit "mse" spelling was renamed to
# "squared_error" in scikit-learn 1.0 and removed in 1.2, so passing it fails
# on current versions (and "squared_error" fails on versions before 1.0).

dtree = DecisionTreeRegressor(max_depth=3, random_state=42)
dtree.fit(X_train, y_train)

In [None]:
# Score the model's predictions on the held-out test set

y_pred = dtree.predict(X_test)

# Each entry pairs the printed label with the metric function that produces it
regression_metrics = (
    ("Mean squared error:", mean_squared_error),
    ("Mean absolute error:", mean_absolute_error),
    ("R2 score:", r2_score),
)
for metric_label, metric_fn in regression_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Visualize the tree structure
plt.figure(figsize=(12,8))

# Only pass feature names that line up with the features the tree was trained
# on; plot_tree looks names up by feature position, so a list of a different
# length would mislabel nodes or fail. With None it uses generic labels.
n_model_features = len(dtree.feature_importances_)
if len(feature_names) == n_model_features:
    # A plain list of strings is accepted by all scikit-learn versions
    tree_feature_names = [str(name) for name in feature_names]
else:
    tree_feature_names = None

plot_tree(dtree, feature_names=tree_feature_names, filled=True)
plt.show()

In [None]:
# Visualize the feature importance
importances = dtree.feature_importances_

# barh needs exactly one label per importance value; fall back to positional
# labels when feature_names does not match the model's feature count.
if len(feature_names) == len(importances):
    importance_labels = [str(name) for name in feature_names]
else:
    importance_labels = [f"x[{i}]" for i in range(len(importances))]

# Show only the features the tree actually splits on (non-zero importance),
# sorted so the most important bar ends up at the top of the chart.
order = np.argsort(importances)
order = order[importances[order] > 0]

plt.figure(figsize=(8,6))
plt.barh([importance_labels[i] for i in order], importances[order])
plt.xlabel("Feature importance")
plt.ylabel("Feature name")
plt.show()

In [None]:
# Visualize the distribution of the target values
plt.figure(figsize=(10,8))

# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalization draws the same kind of figure (histogram plus density curve).
sns.histplot(y, bins=20, kde=True, stat="density")
plt.xlabel("Fraud probability")
plt.ylabel("Density")
plt.show()

In [None]:
# Visualize the correlation matrix
# X may be a bare NumPy array at this point (the output of the scaler), which
# has no .corr(); wrap it in a DataFrame first when needed.
if isinstance(X, pd.DataFrame):
    corr_source = X
else:
    # Reuse feature_names as column labels only when they match X's width
    corr_columns = feature_names if len(feature_names) == X.shape[1] else None
    corr_source = pd.DataFrame(X, columns=corr_columns)

plt.figure(figsize=(10,10))
sns.heatmap(corr_source.corr(), annot=True, cmap="coolwarm")
plt.show()

In [None]:
# Visualize the scatter plots of the features vs the target
# Handle X being either a DataFrame or a bare NumPy array (the output of the
# scaler), and take the axis labels from a source that matches X's columns.
x_is_frame = isinstance(X, pd.DataFrame)
n_feature_columns = X.shape[1]
if x_is_frame:
    scatter_labels = [str(name) for name in X.columns]
elif len(feature_names) == n_feature_columns:
    scatter_labels = [str(name) for name in feature_names]
else:
    # feature_names does not describe X's columns; use positional labels
    scatter_labels = [f"x[{i}]" for i in range(n_feature_columns)]

# A 3x4 grid shows (at most) the first 12 feature columns
fig, axes = plt.subplots(3, 4, figsize=(15,12))
for i, ax in enumerate(axes.flat):
    if i < n_feature_columns:
        column_values = X.iloc[:, i] if x_is_frame else X[:, i]
        ax.scatter(column_values, y, alpha=0.5)
        ax.set_xlabel(scatter_labels[i])
        ax.set_ylabel("Fraud probability")
plt.tight_layout()
plt.show()