**Large datasets**

In [None]:
import pandas as pd
# Read the CSV piece by piece (chunk_size rows at a time) for cases where
# memory is limited.
chunk_size = 1000
chunk_reader = pd.read_csv('large_fintech_file.csv', chunksize=chunk_size)
for chunk in chunk_reader:
    # Placeholder: per-chunk processing goes here.
    pass

In [None]:
# For large datasets, work on a random sample instead of every row.
# The requested size is capped at the number of rows available because
# DataFrame.sample (without replacement) raises ValueError when n exceeds
# the row count.
sample_size = min(1000, len(large_fintech_df))
fintech_df = large_fintech_df.sample(n=sample_size)  # Sample up to 1000 rows

**Descriptive Analytics**

In [None]:
import pandas as pd

# Process data in chunks and sample from each chunk
chunk_size = 1000
chunk_samples = []

for chunk in pd.read_csv('large_fintech_file.csv', chunksize=chunk_size):
    # The last chunk holds whatever rows are left over and may be shorter
    # than 100 rows; asking sample() for more rows than exist raises
    # ValueError, so cap the request at the chunk's length.
    chunk_sample = chunk.sample(n=min(100, len(chunk)))  # Sample up to 100 rows from the chunk
    chunk_samples.append(chunk_sample)

# Combine all sampled chunks into a single DataFrame with a fresh 0..n-1 index
sampled_data = pd.concat(chunk_samples, ignore_index=True)


In [None]:
# Mean of every numeric column. Columns whose name contains 'balance',
# 'income' or 'loan' are printed as dollar amounts with thousands separators;
# all other columns are printed as plain two-decimal numbers.
fintech_means = fintech_df.select_dtypes(include=[np.number]).mean()
for column, mean_value in fintech_means.items():
    is_currency = any(keyword in column for keyword in ('balance', 'income', 'loan'))
    if is_currency:
        print(f"{column}: ${mean_value:,.2f}")
    else:
        print(f"{column}: {mean_value:.2f}")

In [None]:
# Median of every numeric column, using the same display rule as for money
# columns: names containing 'balance', 'income' or 'loan' get a dollar sign
# and thousands separators.
fintech_medians = fintech_df.select_dtypes(include=[np.number]).median()
for column, median_value in fintech_medians.items():
    money_column = any(marker in column for marker in ('balance', 'income', 'loan'))
    if money_column:
        line = f"{column}: ${median_value:,.2f}"
    else:
        line = f"{column}: {median_value:.2f}"
    print(line)

In [None]:
# Mode of every numeric column. mode() returns a Series that may hold several
# values (ties) or none at all; only the first entry is reported.
numeric_columns = fintech_df.select_dtypes(include=[np.number]).columns
for column in numeric_columns:
    mode_value = fintech_df[column].mode()
    if mode_value.empty:
        print(f"{column}: No mode found")
    else:
        print(f"{column}: {mode_value.iloc[0]:.2f}")

In [None]:
# Smallest and largest value of every numeric column, two decimals each.
numeric_frame = fintech_df.select_dtypes(include=[np.number])
for column in numeric_frame.columns:
    column_values = fintech_df[column]
    min_val, max_val = column_values.min(), column_values.max()
    print(f"{column}: Min = {min_val:.2f}, Max = {max_val:.2f}")

In [None]:
# Standard deviation of every numeric column (pandas' default estimator),
# printed with two decimals.
numeric_only = fintech_df.select_dtypes(include=[np.number])
fintech_std = numeric_only.std()
for column, std_value in fintech_std.items():
    print(f"{column}: {std_value:.2f}")

In [None]:
# 25th / 50th / 75th percentiles for three selected fintech columns.
quantiles = [0.25, 0.5, 0.75]
quartile_columns = ('account_balance', 'credit_score', 'annual_income')
for column in quartile_columns:
    print(f"\n{column}:")
    column_values = fintech_df[column]
    for q in quantiles:
        value = column_values.quantile(q)
        percentile_label = int(q*100)  # fraction -> whole-number percentile for display
        print(f"  {percentile_label}th percentile: {value:.2f}")

In [None]:
def calculate_iqr(df, column):
    """Return the quartiles and interquartile range of ``df[column]``.

    Args:
        df: Table indexable by ``column`` whose result offers ``quantile``
            (e.g. a pandas DataFrame).
        column: Label of the column to summarise.

    Returns:
        A ``(q1, q3, iqr)`` tuple: the 0.25 quantile, the 0.75 quantile and
        their difference ``q3 - q1``.
    """
    series = df[column]
    lower_quartile, upper_quartile = (series.quantile(p) for p in (0.25, 0.75))
    return lower_quartile, upper_quartile, upper_quartile - lower_quartile

# Report Q1, Q3 and IQR for three selected fintech columns.
print("\nFintech Dataset IQR:")
iqr_columns = ('account_balance', 'credit_score', 'loan_amount')
for column in iqr_columns:
    q1, q3, iqr = calculate_iqr(df=fintech_df, column=column)
    print(f"{column}: Q1={q1:.2f}, Q3={q3:.2f}, IQR={iqr:.2f}")

In [None]:
# 10th / 25th / 50th / 75th / 90th percentiles for three healthcare columns.
custom_quantiles = [0.1, 0.25, 0.5, 0.75, 0.9]

healthcare_columns = ('age', 'blood_pressure_systolic', 'cholesterol')
for column in healthcare_columns:
    print(f"\n{column}:")
    column_values = healthcare_df[column]
    for q in custom_quantiles:
        value = column_values.quantile(q)
        percentile_label = int(q*100)  # fraction -> whole-number percentile for display
        print(f"  {percentile_label}th percentile: {value:.2f}")

**Data distribution histogram**

In [None]:
# Reset to matplotlib's default style and set the default figure size.
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)

# 2x2 grid of panels under one shared title.
# NOTE(review): only the top-left panel is drawn below; the other three axes
# are created but left empty.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Fintech Dataset - Distribution Analysis', fontsize=16, fontweight='bold')

# Account Balance (top-left panel)
balance_ax = axes[0, 0]
balance_ax.hist(fintech_df['account_balance'], bins=20, color='skyblue', alpha=0.7, edgecolor='black')
balance_ax.set_title('Account Balance Distribution')
balance_ax.set_xlabel('Account Balance ($)')
balance_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

**Enhanced Histograms with Statistical Lines**

In [None]:
def create_enhanced_histogram(data, column, title, color='blue'):
    """Show a 20-bin histogram of ``data[column]`` with statistical markers.

    Vertical lines mark the mean (red, dashed), the median (green, dashed)
    and the first and third quartiles (orange, dotted). Each line's legend
    entry shows its value to two decimals. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Args:
        data: Table indexable by ``column`` whose result offers ``mean``,
            ``median`` and ``quantile`` (e.g. a pandas DataFrame).
        column: Label of the column to plot. With underscores replaced by
            spaces and title-cased, it is also used as the x-axis label.
        title: Figure title.
        color: Fill colour of the histogram bars.
    """
    plt.figure(figsize=(10, 6))

    values = data[column]
    plt.hist(values, bins=20, color=color, alpha=0.7, edgecolor='black')

    # (legend name, x position, line colour, line style) for each marker,
    # listed in drawing order.
    reference_lines = (
        ('Mean', values.mean(), 'red', '--'),
        ('Median', values.median(), 'green', '--'),
        ('Q1', values.quantile(0.25), 'orange', ':'),
        ('Q3', values.quantile(0.75), 'orange', ':'),
    )
    for name, position, line_color, line_style in reference_lines:
        plt.axvline(position, color=line_color, linestyle=line_style, linewidth=2,
                    label=f'{name}: {position:.2f}')

    axis_label = column.replace('_', ' ').title()
    plt.title(title, fontsize=14, fontweight='bold')
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


# Create enhanced histograms
create_enhanced_histogram(
    fintech_df,
    'account_balance',
    title='Account Balance Distribution with Statistics',
    color='skyblue',
)

**Statistical Summary**

In [None]:
def detailed_statistics(df, column):
    """Collect descriptive statistics for one column into a dict.

    Args:
        df: Table indexable by ``column`` (e.g. a pandas DataFrame).
        column: Label of the column to summarise.

    Returns:
        A dict, in this key order: 'Count', 'Mean', 'Median', 'Mode',
        'Standard Deviation', 'Variance', 'Minimum', 'Maximum', 'Range',
        'Q1 (25th percentile)', 'Q3 (75th percentile)', 'IQR', 'Skewness',
        'Kurtosis'. 'Mode' is the first value returned by ``mode()``, or the
        string 'No mode' when ``mode()`` returns nothing.
    """
    series = df[column]

    # Values that feed more than one entry are computed once and reused.
    modes = series.mode()
    lowest, highest = series.min(), series.max()
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)

    if len(modes) > 0:
        mode_entry = modes.iloc[0]
    else:
        mode_entry = 'No mode'

    return {
        'Count': series.count(),
        'Mean': series.mean(),
        'Median': series.median(),
        'Mode': mode_entry,
        'Standard Deviation': series.std(),
        'Variance': series.var(),
        'Minimum': lowest,
        'Maximum': highest,
        'Range': highest - lowest,
        'Q1 (25th percentile)': first_quartile,
        'Q3 (75th percentile)': third_quartile,
        'IQR': third_quartile - first_quartile,
        'Skewness': series.skew(),
        'Kurtosis': series.kurtosis(),
    }

# Print the detailed statistics for the account balance column.
print("\n=== DETAILED STATISTICS ===")
print("\nAccount Balance (Fintech):")
balance_stats = detailed_statistics(fintech_df, 'account_balance')
for stat, value in balance_stats.items():
    # int/float values get two decimals; anything else is printed as-is.
    rendered = f"{value:.2f}" if isinstance(value, (int, float)) else f"{value}"
    print(f"{stat}: {rendered}")

**Data Quality Assessment**

In [None]:
def assess_data_quality(df, dataset_name):
    print(f"\n{dataset_name} Data Quality Report:")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")

    # Missing values
    missing_values = df.isnull().sum()
    print(f"Missing values per column:")
    for col, missing in missing_values.items():
        print(f"  {col}: {missing} ({missing/len(df)*100:.1f}%)")

    # Outliers using IQR method
    print(f"\nPotential outliers (using IQR method):")
    for col in df.select_dtypes(include=[np.number]).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        print(f"  {col}: {len(outliers)} outliers ({len(outliers)/len(df)*100:.1f}%)")

assess_data_quality(df=fintech_df, dataset_name="FINTECH")