In [6]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import re

# Read data from Excel file
data = pd.read_excel("IT_Sec_Dataset_Dummy vCompany_Level.xlsx")

# Clean up column names
data.columns = [re.sub(r'\W+', '', col) for col in data.columns]

# Separate 'Company' column and feature columns
companies = data['Company']
features = data.drop(columns=['Company'])

# Step 1: Standardize the feature data
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

# Step 2: Fit the PCA model
pca = PCA(n_components=features.shape[1])  # Set the number of components equal to the number of features
pca.fit(features_standardized)

# Step 3: Extract the principal components and their corresponding weights
principal_components = pca.components_
explained_variance_ratio = pca.explained_variance_ratio_

# Calculate average weights for each feature
average_weights = np.abs(principal_components).mean(axis=0) * np.sqrt(explained_variance_ratio)

# Normalize weights so that the sum equals 1
normalized_weights = average_weights / np.sum(average_weights)

# Define weights for each variable based on feature importance scores
weights = {}
for i, column in enumerate(features.columns):
    weights[column] = normalized_weights[i]

# Define conditional mappings for compliance scoring based on variable values
conditional_mappings = {
    'AssetManagementProcess': lambda value: 0 if value == 0 else 1,
    'Removablemediapolicy': lambda value: 0 if value == 0 else 1,
    'AccessControlPolicy': lambda value: 0 if value == 0 else 1,
    'WirelessNetworkSecurityProtocol': lambda value: 0 if value == 0 else 1,
    'AutomatedMalwareMonitoring': lambda value: 0 if value == 0 else 1,
    'DataExfiltrationpreventionprocess': lambda value: 0 if value == 0 else 1,
    'Remotenetworkaccesscontrol': lambda value: 0 if value == 0 else 1,
}

# Calculate ITsec score for each company
ITsec_scores = {}
for i, row in data.iterrows():
    company = row['Company']
    score = 0
    for column in features.columns:
        # Apply conditional mapping to adjust compliance score based on variable value
        score += weights[column] * conditional_mappings[column](row[column])
    # Ensure the score is a single value
    ITsec_scores[company] = score

# Reverse and scale the scores
scaled_ITsec_scores = {company: (1 - score) * 100 for company, score in ITsec_scores.items()}

# Create a DataFrame from the scaled_ITsec_scores dictionary
scaled_ITsec_scores_df = pd.DataFrame(list(scaled_ITsec_scores.items()), columns=['Company', 'Scaled ITsec Score'])

# Save the DataFrame to a CSV file
scaled_ITsec_scores_df.to_csv("scaled_ITsec_scores_24mar.csv", index=False)

In [7]:
weights

{'AssetManagementProcess': 0.18504185691230945,
 'Removablemediapolicy': 0.1744800197736679,
 'AccessControlPolicy': 0.15310257493671128,
 'WirelessNetworkSecurityProtocol': 0.14409323946144964,
 'AutomatedMalwareMonitoring': 0.12620792135754835,
 'DataExfiltrationpreventionprocess': 0.11974925990189103,
 'Remotenetworkaccesscontrol': 0.09732512765642234}