In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import re

# Read data from Excel file
data = pd.read_excel("IP Dummy data - CompliancePCA.xlsx")

# Clean up column names
data.columns = [re.sub(r'\W+', '', col) for col in data.columns]

# Separate 'Company' column and feature columns
companies = data['Company']
features = data.drop(columns=['Company'])

# Step 1: Standardize the feature data
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

# Step 2: Fit the PCA model
pca = PCA(n_components=features.shape[1])  # Set the number of components equal to the number of features
pca.fit(features_standardized)

# Step 3: Extract the principal components and their corresponding weights
principal_components = pca.components_
explained_variance_ratio = pca.explained_variance_ratio_

# Calculate average weights for each feature
average_weights = np.abs(principal_components).mean(axis=0) * np.sqrt(explained_variance_ratio)

# Normalize weights so that the sum equals 1
normalized_weights = average_weights / np.sum(average_weights)

# Define weights for each variable based on feature importance scores
weights = {}
for i, column in enumerate(features.columns):
    weights[column] = normalized_weights[i]

# Define conditional mappings for compliance scoring based on variable values
conditional_mappings = {
    'SECEnforcementAction': lambda value: 1 if value == 0 else 0,
    'FTCEnforcementAction': lambda value: 1 if value == 0 else 0,
    'AnyDataBreachCasesinlast3years': lambda value: 1 if value == 0 else 0,
    'CompliancewithDataProtectionLaws': lambda value: 0 if value == 0 else 1,
    'ComplianceTrainingProgramsforemployees': lambda value: 0 if value == 0 else 1,
    'IRP': lambda value: 0 if value == 0 else 1,
    'InternalComplianceRiskAssessment': lambda value: 0 if value == 0 else 1,
    'RiskAssessmentbyexternalAuditors': lambda value: 0 if value == 0 else 1
}

# Calculate compliance score for each company
compliance_scores = {}
for i, row in data.iterrows():
    company = row['Company']
    score = 0
    for column in features.columns:
        # Apply conditional mapping to adjust compliance score based on variable value
        score += weights[column] * conditional_mappings[column](row[column])
    # Ensure the score is a single value
    compliance_scores[company] = score

# Reverse and scale the scores
scaled_compliance_scores = {company: (1 - score) * 100 for company, score in compliance_scores.items()}

# Create a DataFrame from the scaled_compliance_scores dictionary
scaled_compliance_scores_df = pd.DataFrame(list(scaled_compliance_scores.items()), columns=['Company', 'Scaled Compliance Score'])

# Save the DataFrame to a CSV file
scaled_compliance_scores_df.to_csv("scaled_compliance_scores_24mar.csv", index=False)

In [2]:
weights

{'SECEnforcementAction': 0.15065539343605033,
 'FTCEnforcementAction': 0.16816785699154507,
 'AnyDataBreachCasesinlast3years': 0.12390027565761219,
 'CompliancewithDataProtectionLaws': 0.14384198330938763,
 'ComplianceTrainingProgramsforemployees': 0.1273298517532375,
 'IRP': 0.11535056680820294,
 'InternalComplianceRiskAssessment': 0.09446446531689394,
 'RiskAssessmentbyexternalAuditors': 0.07628960672707041}