### Data Exploration
**Business Context:** First look at CRM Sales Opportunities Dataset<br>
**Objective:** Understand dataset structure, content, and business domain<br>
**Expected Outcome:** Foundation knowledge for preprocessing phase

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
from tabulate import tabulate
from rich.console import Console
from rich.text import Text

# Shared Rich console instance (none of the cells visible in this section use it yet).
console = Console()
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib/seaborn —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Global plotting defaults for every figure drawn later in the notebook.
# Keep this order: the style sheet is applied first, the palette second.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Notebook banner: title, dataset description and data source framed by two rules.
print(
    "COMP647 CRM Sales Opportunities - Data Exploration",
    "=" * 60,
    "Multi-table Dataset Overview",
    "Kaggle Source: https://www.kaggle.com/datasets/innocentmfa/crm-sales-opportunities/data",
    "=" * 60,
    sep="\n",
)

COMP647 CRM Sales Opportunities - Data Exploration
Multi-table Dataset Overview
Kaggle Source: https://www.kaggle.com/datasets/innocentmfa/crm-sales-opportunities/data


#### Load all five tables of the CRM database

In [15]:
print("CRM Database Tables")
print("-" * 40)

# Directory holding the raw CSV files, relative to the notebook's working directory.
data_path = Path('../data/raw/')

# Load all five tables. Every later cell relies on these names being defined, so a
# missing file is reported and then re-raised: stopping here gives a clear error
# instead of a confusing NameError further down the notebook.
try:
    accounts = pd.read_csv(data_path / 'accounts.csv')
    products = pd.read_csv(data_path / 'products.csv')
    sales_pipeline = pd.read_csv(data_path / 'sales_pipeline.csv')
    sales_teams = pd.read_csv(data_path / 'sales_teams.csv')
    data_dictionary = pd.read_csv(data_path / 'data_dictionary.csv')
except FileNotFoundError as e:
    print(f"File not found: {e}")
    raise
else:
    # Only reached when every read above succeeded.
    print("All 5 CRM tables loaded successfully!")

CRM Database Tables
----------------------------------------
All 5 CRM tables loaded successfully!


#### Dataset overview

In [16]:
# Section banner
print("=" * 60, "CRM DATABASE OVERVIEW", "=" * 60, sep="\n")

# Registry of the loaded tables keyed by table name; the cells below iterate over it.
tables_info = dict(
    accounts=accounts,
    products=products,
    sales_pipeline=sales_pipeline,
    sales_teams=sales_teams,
    data_dictionary=data_dictionary,
)

# Fixed description of the dataset's business domain.
print("Business Domain: Customer Relationship Management (CRM)")
print("Database Type: Relational (5 interconnected tables)")
print("Primary Use Case: Sales opportunity tracking and analysis")

# One line per table: row and column counts, right-aligned.
print("\nTable Summary:")
for name, df in tables_info.items():
    print(f"{name:<15}:    {len(df):>5,} rows × {len(df.columns):>2} columns")

# Totals across all tables (rows and columns are simply added up).
total_rows = sum(len(frame) for frame in tables_info.values())
total_columns = sum(len(frame.columns) for frame in tables_info.values())

print(f"\nDatabase Totals: {total_rows:,} total rows, {total_columns} total columns")


CRM DATABASE OVERVIEW
Business Domain: Customer Relationship Management (CRM)
Database Type: Relational (5 interconnected tables)
Primary Use Case: Sales opportunity tracking and analysis

Table Summary:
accounts       :       85 rows ×  7 columns
products       :        7 rows ×  3 columns
sales_pipeline :    8,800 rows ×  8 columns
sales_teams    :       35 rows ×  3 columns
data_dictionary:       21 rows ×  3 columns

Database Totals: 8,948 total rows, 24 total columns


#### Database business context - Table representations

In [17]:
# Section banner (leading blank line separates it from earlier output)
print("\n" + "=" * 60, "BUSINESS CONTEXT - TABLE PURPOSES", "=" * 60, sep="\n")

# Print the data dictionary first so the field definitions give context.
print("\nDATA DICTIONARY - Field Definitions:")
print("-" * 45)
if data_dictionary.empty:
    print("Data dictionary is empty or not properly loaded")
else:
    print(data_dictionary.to_string(index=False))

print("\nTABLE RELATIONSHIPS & BUSINESS PURPOSE:")
print("-" * 50)

# Hand-written description of each table: its role, why it matters, and its main fields.
business_context = {
    'sales_pipeline': dict(
        purpose='PRIMARY TABLE - Individual sales opportunities/deals',
        business_value='Track each potential sale from engagement to close',
        key_fields='opportunity_id, deal_stage (Won/Lost), close_value',
    ),
    'accounts': dict(
        purpose='Customer master data - Company information',
        business_value='Customer profiles, industry sectors, company size',
        key_fields='account, sector, revenue, employees',
    ),
    'products': dict(
        purpose='Product catalog - What the company sells',
        business_value='Product portfolio with pricing information',
        key_fields='product, series, sales_price',
    ),
    'sales_teams': dict(
        purpose='Sales organization - Who handles the sales',
        business_value='Sales team structure, territories, management hierarchy',
        key_fields='sales_agent, manager, regional_office',
    ),
    'data_dictionary': dict(
        purpose='Documentation - Field definitions',
        business_value='Professional documentation of all database fields',
        key_fields='Table, Field, Description',
    ),
}

# Report each described table together with its record count.
for table_name, info in business_context.items():
    # Skip descriptions for tables that were not loaded.
    if table_name not in tables_info:
        continue
    df = tables_info[table_name]
    print(f"\n{info['purpose']}")
    print(f"Business Value: {info['business_value']}")
    print(f"Key Fields: {info['key_fields']}")
    print(f"Data Size: {len(df):,} records")


BUSINESS CONTEXT - TABLE PURPOSES

DATA DICTIONARY - Field Definitions:
---------------------------------------------
         Table            Field                                                Description
      accounts          account                                               Company name
      accounts           sector                                                   Industry
      accounts year_established                                           Year Established
      accounts          revenue                        Annual revenue (in millions of USD)
      accounts        employees                                        Number of employees
      accounts  office_location                                               Headquarters
      accounts    subsidiary_of                                             Parent company
      products          product                                               Product name
      products           series                               

#### Understanding the data - Data preview

In [18]:

print("=" * 60)
print("SAMPLE DATA PREVIEW")
print("=" * 60)

# Preview limits: rows shown per table, and the maximum number of columns rendered.
PREVIEW_ROWS = 5
PREVIEW_MAX_COLS = 10

for table_name, df in tables_info.items():
    print(f"\n{table_name.upper()} - Sample Records:")
    print("-" * 60)

    # Always list every column name, even when the table itself is truncated below.
    print(f"Columns ({len(df.columns)}): {list(df.columns)}\n")

    if len(df) > 0:
        # First rows, limited to the leading columns so wide tables stay readable.
        # Without this column limit the "(N more columns)" note below would be false,
        # because the full-width table would already have been printed.
        sample_df = df.iloc[:PREVIEW_ROWS, :PREVIEW_MAX_COLS]

        # Use tabulate for pretty table printing
        print(tabulate(sample_df, headers="keys", tablefmt="grid", showindex=False))

        # Tell the reader how many columns were left out of the preview.
        hidden_cols = len(df.columns) - PREVIEW_MAX_COLS
        if hidden_cols > 0:
            print(f"\n({hidden_cols} more columns)")
    else:
        print("Table is empty")


SAMPLE DATA PREVIEW

ACCOUNTS - Sample Records:
------------------------------------------------------------
Columns (7): ['account', 'sector', 'year_established', 'revenue', 'employees', 'office_location', 'subsidiary_of']

+------------------+-----------+--------------------+-----------+-------------+-------------------+-----------------+
| account          | sector    |   year_established |   revenue |   employees | office_location   |   subsidiary_of |
| Acme Corporation | technolgy |               1996 |   1100.04 |        2822 | United States     |             nan |
+------------------+-----------+--------------------+-----------+-------------+-------------------+-----------------+
| Betasoloin       | medical   |               1999 |    251.41 |         495 | United States     |             nan |
+------------------+-----------+--------------------+-----------+-------------+-------------------+-----------------+
| Betatech         | medical   |               1986 |    647.18 |  

#### Initial data quality check

In [19]:
# Section banner (leading blank line separates it from earlier output)
print("\n" + "=" * 60, "INITIAL DATA QUALITY CHECK", "=" * 60, sep="\n")

for table_name, df in tables_info.items():
    print(f"\n{table_name.upper()} - Quality Overview:")
    print("-" * 30)

    # A table without rows has nothing to measure.
    if len(df) == 0:
        print("Table is empty - cannot assess quality")
        continue

    # Cell-level completeness: share of non-null cells over all cells.
    total_cells = df.size
    missing_cells = df.isnull().sum().sum()
    missing_pct = (missing_cells / total_cells) * 100

    print(f"Completeness: {100-missing_pct:.1f}% ({total_cells-missing_cells:,}/{total_cells:,} cells)")
    print(f"Missing Values: {missing_cells:,} ({missing_pct:.1f}%)")

    # Count cells whose string form is exactly '*' (a placeholder for missing data).
    asterisk_count = sum((df[col].astype(str) == '*').sum() for col in df.columns)
    if asterisk_count > 0:
        print(f"Asterisk Placeholders: {asterisk_count} (need conversion to NaN)")

    # Column counts by dtype family.
    numeric_cols = df.select_dtypes(include=np.number).columns
    object_cols = df.select_dtypes(include='object').columns

    print(f"Numeric Columns: {len(numeric_cols)}")
    print(f"Text/Object Columns: {len(object_cols)}")

    # Fully identical rows; only reported when at least one exists.
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        print(f"Duplicate Rows: {duplicates}")


INITIAL DATA QUALITY CHECK

ACCOUNTS - Quality Overview:
------------------------------
Completeness: 88.2% (525/595 cells)
Missing Values: 70 (11.8%)
Numeric Columns: 3
Text/Object Columns: 4

PRODUCTS - Quality Overview:
------------------------------
Completeness: 100.0% (21/21 cells)
Missing Values: 0 (0.0%)
Numeric Columns: 1
Text/Object Columns: 2

SALES_PIPELINE - Quality Overview:
------------------------------
Completeness: 91.3% (64,297/70,400 cells)
Missing Values: 6,103 (8.7%)
Numeric Columns: 1
Text/Object Columns: 7

SALES_TEAMS - Quality Overview:
------------------------------
Completeness: 100.0% (105/105 cells)
Missing Values: 0 (0.0%)
Numeric Columns: 0
Text/Object Columns: 3

DATA_DICTIONARY - Quality Overview:
------------------------------
Completeness: 100.0% (63/63 cells)
Missing Values: 0 (0.0%)
Numeric Columns: 0
Text/Object Columns: 3


#### Database relationship preview

In [20]:
print("\n" + "=" * 60)
print("TABLE RELATIONSHIPS PREVIEW")
print("=" * 60)

# Expected foreign key relationships based on business logic.
# Each entry names the referencing table, the referenced table and the shared column.
relationships = [
    {
        'from': 'sales_pipeline',
        'to': 'accounts',
        'key': 'account',
        'description': 'Each opportunity belongs to a customer account'
    },
    {
        'from': 'sales_pipeline',
        'to': 'products',
        'key': 'product',
        'description': 'Each opportunity is for a specific product'
    },
    {
        'from': 'sales_pipeline',
        'to': 'sales_teams',
        'key': 'sales_agent',
        'description': 'Each opportunity is handled by a sales agent'
    },
    {
        'from': 'accounts',
        'to': 'accounts',
        'key': 'subsidiary_of',
        'description': 'Some accounts are subsidiaries of other accounts'
    }
]

print("Expected Relationships (to validate in preprocessing):")
for rel in relationships:
    print(f"{rel['from']} → {rel['to']} via '{rel['key']}'")
    print(f"{rel['description']}")

# Quick validation check — only the sales_pipeline → accounts link is verified here.
print(f"\nRelationship Validation Preview:")
if len(sales_pipeline) > 0 and len(accounts) > 0:
    # Distinct, non-null account names on each side of the relationship.
    pipeline_accounts = set(sales_pipeline['account'].dropna())
    master_accounts = set(accounts['account'].dropna())
    unmatched_accounts = pipeline_accounts - master_accounts

    print(f"Account Relationship:")
    print(f"• Opportunities reference {len(pipeline_accounts)} unique accounts")
    print(f"• Master account table has {len(master_accounts)} accounts")
    if unmatched_accounts:
        # len(unmatched_accounts) counts distinct account names, not opportunities,
        # so count the affected pipeline rows separately and report both numbers.
        unmatched_opportunities = int(sales_pipeline['account'].isin(unmatched_accounts).sum())
        print(
            f"{unmatched_opportunities:,} opportunities reference "
            f"{len(unmatched_accounts)} accounts not in master table"
        )
    else:
        print(f"All opportunity accounts found in master table")



TABLE RELATIONSHIPS PREVIEW
Expected Relationships (to validate in preprocessing):
sales_pipeline → accounts via 'account'
Each opportunity belongs to a customer account
sales_pipeline → products via 'product'
Each opportunity is for a specific product
sales_pipeline → sales_teams via 'sales_agent'
Each opportunity is handled by a sales agent
accounts → accounts via 'subsidiary_of'
Some accounts are subsidiaries of other accounts

Relationship Validation Preview:
Account Relationship:
• Opportunities reference 85 unique accounts
• Master account table has 85 accounts
All opportunity accounts found in master table


#### Initial business insights

In [21]:
# Section banner (leading blank line separates it from earlier output)
print("\n" + "=" * 60, "INITIAL BUSINESS INSIGHTS", "=" * 60, sep="\n")

# --- Sales opportunities: stage mix and deal values ---------------------------
if len(sales_pipeline) > 0:
    print("SALES OPPORTUNITIES - Key Metrics:")
    print("-" * 35)

    # Share of opportunities per deal stage; the denominator is every pipeline row.
    if 'deal_stage' in sales_pipeline.columns:
        deal_stages = sales_pipeline['deal_stage'].value_counts()
        print("Deal Stages:")
        for stage, count in deal_stages.items():
            pct = count / len(sales_pipeline) * 100
            print(f"• {stage}: {count:,} ({pct:.1f}%)")

    # Summary statistics over every non-null close_value.
    # NOTE(review): if Lost deals carry a close_value of 0, the mean and median below
    # understate the typical won-deal size — confirm against the data.
    if 'close_value' in sales_pipeline.columns:
        values = sales_pipeline['close_value'].dropna()
        if not values.empty:
            print("\nDeal Values:")
            print(f"• Total Pipeline Value: ${values.sum():,.0f}")
            print(f"• Average Deal Size: ${values.mean():,.0f}")
            print(f"• Median Deal Size: ${values.median():,.0f}")

# --- Customer base: five most common sectors ----------------------------------
if 'sector' in accounts.columns and len(accounts) > 0:
    print("\nCUSTOMER BASE - Industry Distribution:")
    print("-" * 40)
    sectors = accounts['sector'].value_counts().head(5)
    for sector, count in sectors.items():
        pct = count / len(accounts) * 100
        print(f"• {sector}: {count} companies ({pct:.1f}%)")

# --- Product portfolio: catalogue size and number of series --------------------
if len(products) > 0:
    print("\nPRODUCT PORTFOLIO:")
    print("-" * 20)
    print(f"• Total Products: {len(products)}")
    if 'series' in products.columns:
        series_count = products['series'].nunique()
        print(f"• Product Series: {series_count}")


INITIAL BUSINESS INSIGHTS
SALES OPPORTUNITIES - Key Metrics:
-----------------------------------
Deal Stages:
• Won: 4,238 (48.2%)
• Lost: 2,473 (28.1%)
• Engaging: 1,589 (18.1%)
• Prospecting: 500 (5.7%)

Deal Values:
• Total Pipeline Value: $10,005,534
• Average Deal Size: $1,491
• Median Deal Size: $472

CUSTOMER BASE - Industry Distribution:
----------------------------------------
• retail: 17 companies (20.0%)
• technolgy: 12 companies (14.1%)
• medical: 12 companies (14.1%)
• marketing: 8 companies (9.4%)
• finance: 8 companies (9.4%)

PRODUCT PORTFOLIO:
--------------------
• Total Products: 7
• Product Series: 3


In [None]:
# Working copies of the raw tables, keyed by table name, so that later changes to
# tables_clean do not modify the originally loaded DataFrames.
tables_clean = {
    table_name: frame.copy()
    for table_name, frame in (
        ('accounts', accounts),
        ('products', products),
        ('sales_pipeline', sales_pipeline),
        ('sales_teams', sales_teams),
        ('data_dictionary', data_dictionary),
    )
}