# Import libraries and load raw files

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display

# Single source of truth for the raw-data folder, so relocating the project
# means editing one line instead of three.
RAW_DATA_DIR = Path(r'D:/Projects - AI & ML/fraud-detection-project/data/raw')

# E-commerce transactions; both timestamp columns are parsed to datetime on load.
fraud_raw = pd.read_csv(
    RAW_DATA_DIR / "Fraud_Data.csv",
    parse_dates=["signup_time", "purchase_time"],
)
# IP-range -> country lookup table.
ip_raw = pd.read_csv(RAW_DATA_DIR / "IpAddress_to_Country.csv")
# Credit-card transactions.
credit_raw = pd.read_csv(RAW_DATA_DIR / "creditcard.csv")

# Initial Summary
Checks each raw dataset for duplicate rows and missing (null) values, and gives an overview of its shape and data types.

In [None]:
def initial_summary(df, name):
    """Print a quick diagnostic report for one DataFrame.

    The report contains, in order: a banner with ``name``, the shape, the
    column dtypes, whether any null values exist (plus the affected columns
    when they do), the null count of every column, the number of fully
    duplicated rows, and a rendered preview of the first five rows.

    Args:
        df: The pandas DataFrame to summarise.
        name: Label shown in the report banner.
    """
    print(f"\n=== {name} ===")

    # Dimensions and column types
    print("Shape:", df.shape)
    print("\nData types:")
    print(df.dtypes)

    # Build the null mask once and derive every null statistic from it
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    has_nulls = null_mask.values.any()
    print(f"\nAny null values present? {has_nulls}")

    # Only columns that actually contain nulls are listed here
    if has_nulls:
        print("Nulls per column:")
        print(null_counts.loc[null_counts.gt(0)])

    # Full per-column null counts, zeros included
    print("\nMissing values per column:")
    print(null_counts)

    # Rows that exactly repeat an earlier row
    print("\nTotal duplicate rows:", df.duplicated().sum())

    # Rendered preview of the leading rows
    print("\nFirst 5 rows:")
    display(df.head(5))
    
# Produce the summary report for every raw dataset, in a fixed order
for frame, label in (
    (fraud_raw, "Fraud Data"),
    (credit_raw, "Credit-Card Data"),
    (ip_raw, "IP→Country Mapping"),
):
    initial_summary(frame, label)

# Check Class Imbalance in Raw Fraud and Credit-Card Data

In [None]:
# Each entry: (DataFrame, display name, target column)
datasets = [
    (fraud_raw, "Fraud Data", "class"),
    (credit_raw, "Credit-Card Data", "Class"),
]

for frame, label, target_col in datasets:
    # Share of each class, expressed as a percentage of all rows
    class_share = frame[target_col].value_counts(normalize=True).mul(100)
    print(f"\n{label} — % of each class:")
    print(class_share.round(2))

    # Bar chart of the raw row count per class
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=frame, x=target_col, ax=ax)
    ax.set_title(f"{label} Class Distribution")
    ax.set_xlabel("Class Label")
    ax.set_ylabel("Count")
    plt.show()


# IP Map Distribution & Coverage

In [None]:
# `ipmap_df` was referenced below without ever being defined, which raises a
# NameError when the notebook is run top to bottom. Derive it from the raw
# IP mapping loaded earlier.
# NOTE(review): assumes the raw CSV names its range columns
# `lower_bound_ip_address` / `upper_bound_ip_address` — confirm against the
# file. `rename` skips labels that are absent, so this is a no-op if the
# columns are already called `lower` / `upper`.
ipmap_df = ip_raw.rename(
    columns={
        "lower_bound_ip_address": "lower",
        "upper_bound_ip_address": "upper",
    }
)

# 1. How many unique countries?
n_countries = ipmap_df['country'].nunique()
print(f"Unique countries in map: {n_countries}")

# 2. Top 10 countries by number of IP ranges (one row per range)
top10 = ipmap_df['country'].value_counts().head(10)
print("\nTop 10 countries by IP-range count:")
print(top10)

# 3. Horizontal bar plot of those top 10 (counts on x, country names on y)
plt.figure(figsize=(8, 5))
sns.barplot(x=top10.values, y=top10.index, palette="magma")
plt.title("Top 10 Countries by IP-Range Count")
plt.xlabel("Number of Ranges")
plt.ylabel("Country")
plt.show()

# 4. IP coverage: smallest lower bound and largest upper bound in the table
print("\nGlobal IP-range covered:")
print("Min lower bound:", ipmap_df['lower'].min())
print("Max upper bound:", ipmap_df['upper'].max())
