In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Build a small synthetic transactions table to drive the exploration below.
N_ROWS = 100
row_index = range(N_ROWS)
repeats = N_ROWS // 4  # each 4-item pattern is tiled to fill all rows

# Amounts start at 1000 and climb in steps of 50; 'Value' is their magnitude.
amounts = [1000 + 50 * i for i in row_index]

data = pd.DataFrame({
    'TransactionId': range(1, N_ROWS + 1),
    'Amount': amounts,
    'Value': [abs(amount) for amount in amounts],
    # Alternating labels: even positions -> 0, odd positions -> 1
    'FraudResult': [i % 2 for i in row_index],
    'CountryCode': [1, 2, 3, 4] * repeats,
    'ProductCategory': ['A', 'B', 'C', 'D'] * repeats,
    'ChannelId': ['Web', 'Android', 'iOS', 'PayLater'] * repeats,
})

# Preview the top of the table (the bare last expression is the cell output)
print("First few rows of the dataset:")
data.head()


In [None]:
# Structural overview: column dtypes and non-null counts
print("Dataset Info:")
data.info()

# Count rows flagged by DataFrame.duplicated() as repeats
duplicates_count = data.duplicated().sum()
print(f"\nNumber of Duplicates: {duplicates_count}")

# Descriptive statistics: default (numeric) summary first, then the
# object-typed columns. Each heading is printed just before its table.
for heading, describe_kwargs in (
    ("\nSummary Statistics (Numerical):", {}),
    ("\nSummary Statistics (Categorical):", {"include": ['object']}),
):
    print(heading)
    print(data.describe(**describe_kwargs))


In [None]:
# Plot a histogram (with a KDE overlay) for every numeric column.
#
# 'number' selects numeric columns of any width (e.g. int32 or float32 too),
# rather than only the float64/int64 ones, so no numeric feature is silently
# skipped if the data is loaded with different dtypes.
# `numerical_features` is reused by the correlation and boxplot cells below.
numerical_features = data.select_dtypes(include='number').columns
for col in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.histplot(data[col], kde=True, bins=30)
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.show()


In [None]:
# Horizontal bar chart of value frequencies for each object-typed column,
# with bars ordered as returned by value_counts().
categorical_features = data.select_dtypes(include=['object']).columns
for col in categorical_features:
    levels_by_count = data[col].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(y=data[col], order=levels_by_count, ax=ax)
    ax.set_title(f"Frequency of {col}")
    ax.set_xlabel("Count")
    ax.set_ylabel(col)
    plt.show()


In [None]:
# Pairwise correlations between the numeric columns selected earlier
corr = data.loc[:, numerical_features].corr()

# Render the matrix as an annotated heatmap (two-decimal cell labels)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:
# Tally null entries per column and print the counts under a heading
missing_values = data.isna().sum()
print("\nMissing Values in Each Column:", missing_values, sep="\n")


In [None]:
# One horizontal boxplot per numeric column, to eyeball potential outliers
for col in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
    plt.show()
