In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw CSV; the file is expected to sit in the current working directory.
transaction_data_raw = pd.read_csv("card_transdata (1).csv")

# Keep only fully populated rows: a null in any column removes the whole row.
transaction_data_cleaned = transaction_data_raw.dropna(axis=0, how="any")

# Persist the cleaned frame to a new CSV, leaving the index out of the file.
transaction_data_cleaned.to_csv("transaction_data_cleaned.csv", index=False)

# Preview the first rows of the cleaned frame under a heading line.
print("HEAD OF CLEANED DATASET:", transaction_data_cleaned.head(), sep="\n")

# Status message about the visualization libraries.
print("\nSeaborn and Matplotlib have been successfully imported and are ready for use.")

# Function to create a countplot with percentages
def plot_count_with_percentages(df, column_name, title):
    """Draw a count plot of one column and label every bar with its share.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column whose values are counted.
        title: Title placed above the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Each label is the bar's count as a percentage of the non-null values in
    ``column_name``, formatted with two decimals.
    """
    plt.figure(figsize=(8, 6))
    ax = sns.countplot(data=df, x=column_name)
    ax.set_title(title)
    ax.set_xlabel(column_name.replace('_', ' ').title())
    ax.set_ylabel('Count')

    # The bars only represent non-null values, so the denominator must leave
    # null rows out as well; otherwise the labels would understate each share.
    total = df[column_name].count()
    if total:  # nothing to annotate (and nothing to divide by) when empty
        for p in ax.patches:
            count = p.get_height()
            percentage = 100 * count / total
            # Anchor the label at the horizontal centre of the bar, on its top.
            ax.annotate(f'{percentage:.2f}%', (p.get_x() + p.get_width() / 2., count),
                        ha='center', va='bottom', fontsize=12)
    plt.show()  # ensures the plot is displayed

# --------------------------
# Count-plot distributions: fraud, used PIN, repeat retailer, online order
# --------------------------
# Each entry is (column to count, plot title); plots are drawn in this order.
distribution_plots = (
    ('fraud', 'Fraud Class Distribution'),
    ('used_pin_number', 'Used Pin Distribution'),
    ('repeat_retailer', 'Repeat Retailer Distribution'),
    ('online_order', 'Online Orders'),
)
for distribution_column, distribution_title in distribution_plots:
    plot_count_with_percentages(
        transaction_data_cleaned, distribution_column, distribution_title
    )

# --------------------------
# Ratio to Median Purchase Price Distribution
# --------------------------
# 30-bin histogram with a KDE curve overlaid, drawn on an explicit Axes.
ratio_fig, ratio_ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=transaction_data_cleaned,
    x='ratio_to_median_purchase_price',
    bins=30,
    kde=True,
    ax=ratio_ax,
)
ratio_ax.set_title('Ratio to Median Purchase Price Distribution')
ratio_ax.set_xlabel('Ratio to Median Purchase Price')
ratio_ax.set_ylabel('Frequency')
plt.show()

# --------------------------
# Distance From Home Distribution
# --------------------------
# Same histogram settings, applied to the distance_from_home column.
distance_fig, distance_ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=transaction_data_cleaned,
    x='distance_from_home',
    bins=30,
    kde=True,
    ax=distance_ax,
)
distance_ax.set_title('Distance From Home Distribution')
distance_ax.set_xlabel('Distance From Home')
distance_ax.set_ylabel('Frequency')
plt.show()

# --------------------------
# Used Pin Number vs Fraud
# --------------------------
# Count of transactions per used_pin_number value, with one bar per fraud value.
plt.figure(figsize=(10, 6))
ax = sns.countplot(
    data=transaction_data_cleaned,
    x='used_pin_number',
    hue='fraud',
    palette=['green', 'red']
)
# Title now includes "vs", matching the parallel chip and online-order plots.
plt.title('Transactions with PIN vs Fraudulent Transactions')
plt.xlabel('Used Pin')
plt.ylabel('Count')
# Replace the raw hue values in the legend with readable class names.
# NOTE(review): assumes the first hue level is the non-fraud class - confirm.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

# --------------------------
# Percent of Fraudulent Transactions by Used PIN
# --------------------------
# Rows: used_pin_number values; columns: fraud values; cells: transaction counts
# (combinations that never occur are filled with 0).
df_pin_fraud = transaction_data_cleaned.groupby(['used_pin_number', 'fraud']).size().unstack(fill_value=0)
# Divide every row by its own total so each row sums to 100.
df_pin_fraud_percent = df_pin_fraud.div(df_pin_fraud.sum(axis=1), axis=0) * 100
ax = df_pin_fraud_percent.plot(kind='bar', stacked=True, color=['green', 'red'], figsize=(5, 3))
plt.title('Percent of Fraudulent Transactions with PIN Usage')
plt.xlabel('Used PIN')
plt.ylabel('Percentage')
# Same legend relabelling as in the count plot above.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

# --------------------------
# Used Chip vs Fraud
# --------------------------
# Transaction counts per used_chip value, one bar per fraud value.
chip_fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=transaction_data_cleaned,
    x='used_chip',
    hue='fraud',
    palette=['green', 'red'],
    ax=ax,
)
ax.set_title('Transactions with Chip vs Fraudulent Transactions')
ax.set_xlabel('Used Chip')
ax.set_ylabel('Count')
# Swap the raw hue values in the legend for readable class names.
# NOTE(review): assumes the first hue level is the non-fraud class - confirm.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

# --------------------------
# Percent of Fraudulent Transactions by Chip Usage
# --------------------------
# Count table: one row per used_chip value, one column per fraud value,
# with 0 for combinations that never occur.
df_chip_fraud = (
    transaction_data_cleaned
    .groupby(['used_chip', 'fraud'])
    .size()
    .unstack(fill_value=0)
)
# Scale each row by its own total so the row sums to 100.
chip_row_totals = df_chip_fraud.sum(axis=1)
df_chip_fraud_percent = df_chip_fraud.div(chip_row_totals, axis=0) * 100
ax = df_chip_fraud_percent.plot.bar(stacked=True, color=['green', 'red'], figsize=(5, 3))
ax.set_title('Percent of Fraudulent Transactions with Chip Usage')
ax.set_xlabel('Used Chip')
ax.set_ylabel('Percentage')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

# --------------------------
# Online Order vs Fraud
# --------------------------
# Transaction counts per online_order value, one bar per fraud value.
online_fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=transaction_data_cleaned,
    x='online_order',
    hue='fraud',
    palette=['green', 'red'],
    ax=ax,
)
ax.set_title('Transactions with Online Order vs Fraudulent Transactions')
ax.set_xlabel('Online Order')
ax.set_ylabel('Count')
# Swap the raw hue values in the legend for readable class names.
# NOTE(review): assumes the first hue level is the non-fraud class - confirm.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

# --------------------------
# Percent of Fraudulent Transactions by Online Order
# --------------------------
# Count table: one row per online_order value, one column per fraud value,
# with 0 for combinations that never occur.
df_online_fraud = (
    transaction_data_cleaned
    .groupby(['online_order', 'fraud'])
    .size()
    .unstack(fill_value=0)
)
# Scale each row by its own total so the row sums to 100.
online_row_totals = df_online_fraud.sum(axis=1)
df_online_fraud_percent = df_online_fraud.div(online_row_totals, axis=0) * 100
ax = df_online_fraud_percent.plot.bar(stacked=True, color=['green', 'red'], figsize=(5, 3))
ax.set_title('Percent of Fraudulent Transactions with Online Orders')
ax.set_xlabel('Online Order')
ax.set_ylabel('Percentage')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

# --------------------------
# Scatterplot: Distance From Last Transaction vs Ratio to Median Purchase Price
# --------------------------
# Points are coloured by the fraud column and drawn semi-transparent (alpha 0.6).
last_vs_ratio_fig, last_vs_ratio_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=transaction_data_cleaned,
    x='distance_from_last_transaction',
    y='ratio_to_median_purchase_price',
    hue='fraud',
    palette=['green', 'red'],
    alpha=0.6,
    ax=last_vs_ratio_ax,
)
last_vs_ratio_ax.set_title('Distance from Last Transaction vs Ratio to Median Purchase Price')
last_vs_ratio_ax.set_xlabel('Distance from Last Transaction')
last_vs_ratio_ax.set_ylabel('Ratio to Median Purchase Price')
# Swap the raw hue values in the legend for readable class names.
# NOTE(review): assumes the first hue level is the non-fraud class - confirm.
handles, labels = last_vs_ratio_ax.get_legend_handles_labels()
last_vs_ratio_ax.legend(handles, ['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()

# --------------------------
# Scatterplot: Distance From Home vs Distance From Last Transaction by Fraud
# --------------------------
# Same colouring and transparency, comparing the two distance columns.
home_vs_last_fig, home_vs_last_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=transaction_data_cleaned,
    x='distance_from_home',
    y='distance_from_last_transaction',
    hue='fraud',
    palette=['green', 'red'],
    alpha=0.6,
    ax=home_vs_last_ax,
)
home_vs_last_ax.set_title('Distance From Home vs Distance From Last Transaction by Fraud Status')
home_vs_last_ax.set_xlabel('Distance From Home')
home_vs_last_ax.set_ylabel('Distance From Last Transaction')
handles, labels = home_vs_last_ax.get_legend_handles_labels()
home_vs_last_ax.legend(handles, ['Non-Fraudulent', 'Fraudulent'], title='Fraud')
plt.show()


ModuleNotFoundError: No module named 'pandas'