In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder


In [2]:
# Sample banking customer data
data = {
    'customer_id': [1, 2, 3, 4, 5],
    'firstname': ['Alice', 'Bob', 'Charlie', np.nan, 'Eve'],
    'lastname': ['Smith', 'Johnson', 'Williams', 'Brown', 'Davis'],
    'age': [25, 30, 35, 40, -5],  # Notice invalid age
    'email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'dave@example', 'eve@example.com'],  # Invalid email
    'balance': [1000, 2000, np.nan, 4000, 5000]
}
df = pd.DataFrame(data)

In [3]:
# Define weights for columns based on their importance
weights = {
    'customer_id': 0.1,
    'firstname': 0.3,
    'lastname': 0.2,
    'age': 0.2,
    'email': 0.1,
    'balance': 0.1
}

# Define scoring functions for each dimension
def completeness_score(series):
    return series.notnull().mean()

def consistency_score(series, valid_set=None):
    if valid_set is not None:
        return series.isin(valid_set).mean()
    return 1  # Assume perfect consistency if no valid_set provided

def uniqueness_score(series):
    return series.nunique() / len(series)

def validity_score(series, valid_range=None):
    if valid_range is not None:
        return series.apply(lambda x: valid_range[0] <= x <= valid_range[1] if pd.notnull(x) else False).mean()
    return 1  # Assume perfect validity if no valid_range provided

def accuracy_score(series, reference_series):
    return series.isin(reference_series).mean()


In [4]:




# Compute quality scores for each column
scores = {
    'customer_id': {
        'completeness': completeness_score(df['customer_id']),
        'consistency': consistency_score(df['customer_id']),
        'uniqueness': uniqueness_score(df['customer_id']),
        'validity': validity_score(df['customer_id'])
    },
    'firstname': {
        'completeness': completeness_score(df['firstname']),
        'consistency': consistency_score(df['firstname']),
        'uniqueness': uniqueness_score(df['firstname']),
        'validity': validity_score(df['firstname'])
    },
    'lastname': {
        'completeness': completeness_score(df['lastname']),
        'consistency': consistency_score(df['lastname']),
        'uniqueness': uniqueness_score(df['lastname']),
        'validity': validity_score(df['lastname'])
    },
    'age': {
        'completeness': completeness_score(df['age']),
        'consistency': consistency_score(df['age']),
        'uniqueness': uniqueness_score(df['age']),
        'validity': validity_score(df['age'], valid_range=(0, 120))
    },
    'email': {
        'completeness': completeness_score(df['email']),
        'consistency': consistency_score(df['email'], valid_set=df['email'].str.contains('@')),
        'uniqueness': uniqueness_score(df['email']),
        'validity': validity_score(df['email'])
    },
    'balance': {
        'completeness': completeness_score(df['balance']),
        'consistency': consistency_score(df['balance']),
        'uniqueness': uniqueness_score(df['balance']),
        'validity': validity_score(df['balance'], valid_range=(0, np.inf))
    }
}

# Compute weighted scores for each column
column_scores = {col: np.average(list(scores[col].values()), weights=[0.25, 0.25, 0.25, 0.25]) for col in scores}
print("Column Scores:", column_scores)

# Compute row-level scores
def row_score(row):
    penalties = {
        'firstname': 0.5 if pd.isnull(row['firstname']) else 0,
        'age': 0.3 if row['age'] < 0 or row['age'] > 120 else 0,
        'email': 0.2 if '@' not in row['email'] else 0
    }
    score = 1 - sum(penalties.values())
    return score * np.mean([scores[col]['completeness'] if pd.notnull(row[col]) else 0 for col in row.index])

df['row_score'] = df.apply(row_score, axis=1)
print("Data with Row Scores:\n", df)

# Anomaly Detection using Isolation Forest
encoder = LabelEncoder()
df_encoded = df.apply(lambda x: encoder.fit_transform(x.astype(str)), axis=0)
clf = IsolationForest(contamination=0.1)
df['anomaly_score'] = clf.fit_predict(df_encoded)
print("Data with Anomaly Scores:\n", df)

# Analyze and recommend
def analyze_and_recommend(column_scores, row_scores):
    recommendations = {}
    for col, score in column_scores.items():
        if score < 0.8:
            recommendations[col] = f"Improve {col} data quality. Current score: {score:.2f}"
    
    row_recommendations = df[df['row_score'] < 0.8].index.tolist()
    return recommendations, row_recommendations

recommendations, row_recommendations = analyze_and_recommend(column_scores, df['row_score'])
print("Column Recommendations:", recommendations)
print("Rows Needing Attention:", row_recommendations)


Column Scores: {'customer_id': 1.0, 'firstname': 0.9, 'lastname': 1.0, 'age': 0.95, 'email': 0.75, 'balance': 0.8500000000000001}
Data with Row Scores:
    customer_id firstname  lastname  age                email  balance  \
0            1     Alice     Smith   25    alice@example.com   1000.0   
1            2       Bob   Johnson   30      bob@example.com   2000.0   
2            3   Charlie  Williams   35  charlie@example.com      NaN   
3            4       NaN     Brown   40         dave@example   4000.0   
4            5       Eve     Davis   -5      eve@example.com   5000.0   

   row_score  
0   0.933333  
1   0.933333  
2   0.800000  
3   0.400000  
4   0.653333  
Data with Anomaly Scores:
    customer_id firstname  lastname  age                email  balance  \
0            1     Alice     Smith   25    alice@example.com   1000.0   
1            2       Bob   Johnson   30      bob@example.com   2000.0   
2            3   Charlie  Williams   35  charlie@example.com      NaN   

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder

# Sample expanded banking customer data
data = {
    'customer_id': [1, 2, 3, 4, 5],
    'firstname': ['Alice', 'Bob', 'Charlie', np.nan, 'Eve'],
    'lastname': ['Smith', 'Johnson', 'Williams', 'Brown', 'Davis'],
    'age': [25, 30, 35, 40, -5],  # Notice invalid age
    'email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'dave@example', 'eve@example.com'],  # Invalid email
    'balance': [1000, 2000, np.nan, 4000, 5000],
    'address': ['123 Main St', '456 Elm St', '789 Oak St', '101 Pine St', '202 Maple St'],
    'phone_number': ['555-1234', '555-5678', '555-8765', '555-4321', '555-1111'],
    'account_type': ['savings', 'checking', 'savings', 'checking', 'savings'],
    'account_status': ['active', 'active', 'inactive', 'active', 'inactive'],
    'account_open_date': ['2015-06-01', '2016-07-15', '2017-08-23', '2018-09-12', '2019-10-31'],
    'credit_score': [700, 650, 800, 550, 720],
    'last_login': ['2023-06-01', '2023-06-15', '2023-07-23', '2023-05-12', '2023-05-31'],
    'occupation': ['engineer', 'doctor', 'lawyer', 'teacher', 'artist'],
    'marital_status': ['single', 'married', 'single', 'divorced', 'widowed'],
    'num_dependents': [0, 2, 1, 3, 0],
    'annual_income': [60000, 120000, 90000, 75000, 50000],
    'loan_amount': [np.nan, 15000, 20000, np.nan, 25000],
    'loan_status': ['approved', 'approved', 'rejected', 'approved', 'rejected'],
    'savings_balance': [5000, 2000, 3000, 10000, 500],
    'investment_portfolio': ['stocks', 'bonds', 'mutual funds', 'real estate', 'crypto'],
    'risk_tolerance': ['medium', 'low', 'high', 'medium', 'high']
}
df = pd.DataFrame(data)

# Define weights for columns based on their importance
weights = {
    'customer_id': 0.05,
    'firstname': 0.1,
    'lastname': 0.05,
    'age': 0.05,
    'email': 0.05,
    'balance': 0.05,
    'address': 0.05,
    'phone_number': 0.05,
    'account_type': 0.05,
    'account_status': 0.05,
    'account_open_date': 0.05,
    'credit_score': 0.05,
    'last_login': 0.05,
    'occupation': 0.05,
    'marital_status': 0.05,
    'num_dependents': 0.05,
    'annual_income': 0.05,
    'loan_amount': 0.05,
    'loan_status': 0.05,
    'savings_balance': 0.05,
    'investment_portfolio': 0.05,
    'risk_tolerance': 0.05
}

# Define scoring functions for each dimension
def completeness_score(series):
    return series.notnull().mean()

def consistency_score(series, valid_set=None):
    if valid_set is not None:
        return series.isin(valid_set).mean()
    return 1  # Assume perfect consistency if no valid_set provided

def uniqueness_score(series):
    return series.nunique() / len(series)

def validity_score(series, valid_range=None):
    if valid_range is not None:
        return series.apply(lambda x: valid_range[0] <= x <= valid_range[1] if pd.notnull(x) else False).mean()
    return 1  # Assume perfect validity if no valid_range provided

def accuracy_score(series, reference_series):
    return series.isin(reference_series).mean()

# Compute quality scores for each column
scores = {col: {
    'completeness': completeness_score(df[col]),
    'consistency': consistency_score(df[col]),
    'uniqueness': uniqueness_score(df[col]),
    'validity': validity_score(df[col])  # Add specific validity checks as needed
} for col in df.columns}

# Compute weighted scores for each column
column_scores = {col: np.average(list(scores[col].values()), weights=[0.25, 0.25, 0.25, 0.25]) for col in scores}
print("Column Scores:", column_scores)

# Compute row-level scores with penalties
def row_score(row):
    penalties = {
        'firstname': 0.5 if pd.isnull(row['firstname']) else 0,
        'age': 0.3 if row['age'] < 0 or row['age'] > 120 else 0,
        'email': 0.2 if '@' not in row['email'] else 0
    }
    score = 1 - sum(penalties.values())
    return score * np.mean([scores[col]['completeness'] if pd.notnull(row[col]) else 0 for col in row.index])

df['row_score'] = df.apply(row_score, axis=1)
print("Data with Row Scores:\n", df)

# Anomaly Detection using Isolation Forest
encoder = LabelEncoder()
df_encoded = df.apply(lambda x: encoder.fit_transform(x.astype(str)), axis=0)
clf = IsolationForest(contamination=0.1)
df['anomaly_score'] = clf.fit_predict(df_encoded)
print("Data with Anomaly Scores:\n", df)

# Analyze and recommend
def analyze_and_recommend(column_scores, row_scores):
    recommendations = {}
    for col, score in column_scores.items():
        if score < 0.8:
            recommendations[col] = f"Improve {col} data quality. Current score: {score:.2f}"
    
    row_recommendations = df[df['row_score'] < 0.8].index.tolist()
    return recommendations, row_recommendations

recommendations, row_recommendations = analyze_and_recommend(column_scores, df['row_score'])
print("Column Recommendations:", recommendations)
print("Rows Needing Attention:", row_recommendations)


Column Scores: {'customer_id': 1.0, 'firstname': 0.9, 'lastname': 1.0, 'age': 1.0, 'email': 1.0, 'balance': 0.9, 'address': 1.0, 'phone_number': 1.0, 'account_type': 0.85, 'account_status': 0.85, 'account_open_date': 1.0, 'credit_score': 1.0, 'last_login': 1.0, 'occupation': 1.0, 'marital_status': 0.95, 'num_dependents': 0.95, 'annual_income': 1.0, 'loan_amount': 0.8, 'loan_status': 0.85, 'savings_balance': 1.0, 'investment_portfolio': 1.0, 'risk_tolerance': 0.9}
Data with Row Scores:
    customer_id firstname  lastname  age                email  balance  \
0            1     Alice     Smith   25    alice@example.com   1000.0   
1            2       Bob   Johnson   30      bob@example.com   2000.0   
2            3   Charlie  Williams   35  charlie@example.com      NaN   
3            4       NaN     Brown   40         dave@example   4000.0   
4            5       Eve     Davis   -5      eve@example.com   5000.0   

        address phone_number account_type account_status  ... occupati

In [39]:
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.pdfgen import canvas

def draw_header(c, page_width, page_height, header_color,header_sec_color):
    header_height = 50
    subheader_height = header_height / 10

    # Set the header background color
    c.setFillColor(header_color)
    c.rect(0, page_height - header_height, page_width, header_height, fill=True, stroke=False)

    # Draw the Wells Fargo logo text
    c.setFillColor(colors.white)
    c.setFont("Helvetica-Bold", 24)
    c.drawString(50, page_height - header_height + 10, "Wells Fargo")

    # Set the subheader background color
    c.setFillColor(header_sec_color)
    c.rect(0, page_height - header_height - subheader_height, page_width, subheader_height, fill=True, stroke=False)

def create_pdf(filename):
    # Create a canvas object
    c = canvas.Canvas(filename, pagesize=letter)
    page_width, page_height = letter

    # Add some content to the document
    header_color = colors.red  # Use the Wells Fargo red color for the header
    header_sec_color=colors.yellow
    

    for i in range(1, 11):
        draw_header(c, page_width, page_height, header_color,header_sec_color)
        
        
        
        # Show the page
        c.showPage()

    # Save the PDF
    c.save()

if __name__ == "__main__":
    create_pdf("wells_fargo_navbar.pdf")
