In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Goal: interpret the regression results using every available feature.
MFF_features = pd.read_csv('MFFmatrix-onlyS.csv')
RDkit_features = pd.read_csv('Rdkitmatrix+sym descriptor-only S.csv')

# Standardize the MFF table column by column with a freshly fitted StandardScaler.
scaler = StandardScaler()
scaled_MFF_features = pd.DataFrame(
    scaler.fit_transform(MFF_features.values),
    columns=MFF_features.columns,
)
# A column whose standard deviation is exactly zero after scaling is constant
# and cannot be correlated with anything, so it is discarded.
mff_spread = scaled_MFF_features.std()
scaled_MFF_features = scaled_MFF_features.drop(columns=mff_spread[mff_spread == 0].index)

# Same treatment for the RDKit + symmetry descriptor table, with its own scaler.
scaler = StandardScaler()
scaled_RDkit_features = pd.DataFrame(
    scaler.fit_transform(RDkit_features.values),
    columns=RDkit_features.columns,
)
rdkit_spread = scaled_RDkit_features.std()
scaled_RDkit_features = scaled_RDkit_features.drop(columns=rdkit_spread[rdkit_spread == 0].index)


# Two descriptors are treated as duplicates of each other when |r| is within
# this distance of 1. An exact `== 1` / `== -1` comparison misses pairs whose
# correlation is perfect only up to floating-point rounding.
PERFECT_CORRELATION_TOL = 1e-10


def _perfectly_collinear_columns(correlation, tol=PERFECT_CORRELATION_TOL):
    """Return the columns that duplicate a column appearing earlier in ``correlation``.

    Columns are visited in order. Every column that has not already been marked
    keeps its place, and all *other* columns whose absolute correlation with it
    is at least ``1 - tol`` are marked for removal, so exactly one member of each
    perfectly collinear group survives.

    Parameters
    ----------
    correlation : pandas.DataFrame
        Square correlation matrix (same labels on rows and columns).
    tol : float
        Allowed distance of ``|r|`` from 1.

    Returns
    -------
    set
        Labels of the redundant columns.
    """
    redundant = set()
    for column in correlation.columns:
        if column in redundant:
            continue
        strength = correlation[column].abs()
        # The column itself is excluded by name, so a self-correlation that is
        # NaN or not exactly 1 can no longer raise (list.remove did).
        redundant.update(
            other for other, value in strength.items()
            if other != column and value >= 1 - tol
        )
    return redundant


def _cluster_by_correlation(correlation, method='average'):
    """Hierarchically cluster features using ``1 - |r|`` as the distance.

    Parameters
    ----------
    correlation : pandas.DataFrame
        Square correlation matrix.
    method : str
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``
        ('average' by default; 'single', 'complete', ... also work).

    Returns
    -------
    tuple
        ``(distance_matrix, linkage_matrix, dendro, ordered_labels)`` where
        ``ordered_labels`` lists the column labels in dendrogram leaf order.
    """
    # Clip so rounding can never produce a (tiny) negative distance.
    distance = (1 - correlation.abs()).clip(lower=0)
    # checks=False: squareform only reads the off-diagonal entries; the default
    # validation rejects a diagonal that is not *exactly* zero.
    condensed = squareform(distance.values, checks=False)
    links = linkage(condensed, method=method)
    tree = dendrogram(links, no_plot=True)
    ordered_labels = [correlation.columns[i] for i in tree['leaves']]
    return distance, links, tree, ordered_labels


def _plot_clustered_correlation(correlation, title):
    """Draw ``correlation`` (already reordered) as a large coolwarm heatmap."""
    plt.figure(figsize=(60, 60))
    sns.heatmap(correlation, cmap='coolwarm', linewidths=0.5)
    plt.title(title)
    plt.show()


# ---------------------------------------------------------------------------
# RDKit + symmetry descriptors
# ---------------------------------------------------------------------------
RDkit_correlation = scaled_RDkit_features.corr()
# NOTE(review): this CSV is written *before* the duplicate columns are removed,
# whereas scaled_MFF_features.csv below is written *after* - confirm which is intended.
scaled_RDkit_features.to_csv('scaled_RDkit_features.csv')

# Keep one representative of every group of perfectly collinear descriptors.
to_remove = _perfectly_collinear_columns(RDkit_correlation)
print(to_remove)
column_to_keep = [column for column in scaled_RDkit_features.columns if column not in to_remove]
scaled_RDkit_features = scaled_RDkit_features.loc[:, column_to_keep]
print(scaled_RDkit_features.shape)

# Recompute the correlations on the reduced table and order the descriptors
# so that strongly correlated ones sit next to each other.
RDkit_correlation = scaled_RDkit_features.corr()
distance_matrix, linkage_matrix, dendro, reordered_RDkit_features = _cluster_by_correlation(RDkit_correlation)
reordered_RDkit = scaled_RDkit_features[reordered_RDkit_features]
reordered_RDkit_corr = RDkit_correlation.loc[reordered_RDkit_features, reordered_RDkit_features]

_plot_clustered_correlation(
    reordered_RDkit_corr,
    'Correlation Matrix with Rdkitmatrix+sym descriptors Features Grouped by Hierarchical Clustering',
)

# ---------------------------------------------------------------------------
# MFF descriptors
# ---------------------------------------------------------------------------
MFF_correlation = scaled_MFF_features.corr()

to_remove = _perfectly_collinear_columns(MFF_correlation)
print(to_remove)
column_to_keep = [column for column in scaled_MFF_features.columns if column not in to_remove]
scaled_MFF_features = scaled_MFF_features.loc[:, column_to_keep]
print(scaled_MFF_features.shape)
scaled_MFF_features.to_csv('scaled_MFF_features.csv')

MFF_correlation = scaled_MFF_features.corr()
distance_matrix, linkage_matrix, dendro, reordered_MFF_features = _cluster_by_correlation(MFF_correlation)
reordered_MFF = scaled_MFF_features[reordered_MFF_features]
reordered_MFF_corr = MFF_correlation.loc[reordered_MFF_features, reordered_MFF_features]

_plot_clustered_correlation(
    reordered_MFF_corr,
    'Correlation Matrix with MFF Features Grouped by Hierarchical Clustering',
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the MFF matrix (A) and the RDKit + symmetry descriptor matrix (B).
A = pd.read_csv('MFFmatrix-onlyS.csv')
B = pd.read_csv('Rdkitmatrix+sym descriptor-only S.csv')

# Pairwise correlation between the columns of A.
A_correlation_matrix = A.corr()
print("Correlation Matrix of A:\n", A_correlation_matrix)

# Find columns that are perfectly collinear with another column and keep ONE
# representative per group. The columns are walked in order: a column that has
# not been marked yet is kept, and every other column with |r| ~ 1 to it is
# marked for removal. (Marking a column whenever it has *any* perfect partner
# would discard both members of each pair.)
# |r| is compared against 1 with a small tolerance because correlations that
# are perfect in exact arithmetic are often only 0.999999... in floating point;
# r ~ -1 is treated as redundant too, matching the pruning done for the
# clustering heatmaps earlier in this notebook.
perfect_corr_tol = 1e-10
to_remove = set()
for col in A_correlation_matrix.columns:
    if col in to_remove:
        continue
    strength = A_correlation_matrix[col].abs()
    to_remove.update(
        idx for idx, value in strength.items()
        if idx != col and value >= 1 - perfect_corr_tol
    )

print("Variables to remove:\n", to_remove)

# Drop the redundant columns (listed in A's column order for reproducibility).
A1 = A.drop(columns=[col for col in A.columns if col in to_remove])

print("Updated A Data:\n", A1)

# Persist the reduced table.
A1.to_csv('A1.csv', index=False)
print("Updated A1 CSV saved to 'A1.csv'")

# Reload the reduced MFF table that was just written to disk.
A1 = pd.read_csv('A1.csv')

# One StandardScaler object, re-fitted for each table in turn.
scaler = StandardScaler()

# Standardized copy of A1, minus any column that is constant (std == 0).
A1_scaled = pd.DataFrame(scaler.fit_transform(A1), columns=A1.columns)
a1_spread = A1_scaled.std()
A1_scaled = A1_scaled.drop(columns=a1_spread[a1_spread == 0].index)

# Standardized copy of B, minus any constant column.
B_scaled = pd.DataFrame(scaler.fit_transform(B), columns=B.columns)
b_spread = B_scaled.std()
B_scaled = B_scaled.drop(columns=b_spread[b_spread == 0].index)

# Discard every row that contains a missing value.
# NOTE(review): the unscaled A1 (not A1_scaled) is the frame cleaned here - confirm this is intended.
A1 = A1.dropna()
B_scaled = B_scaled.dropna()

# Pearson correlation of every A1 column (one row each) with every B_scaled
# column (one column each); Series.corr aligns the two series on their index.
pairwise_rows = [
    [A1[a_col].corr(B_scaled[b_col]) for b_col in B_scaled.columns]
    for a_col in A1.columns
]
correlation_matrix = pd.DataFrame(
    pairwise_rows,
    index=A1.columns,
    columns=B_scaled.columns,
    dtype=float,
)

# Row labels and column labels of the table.
first_column_row_names = correlation_matrix.index
first_row_column_names = correlation_matrix.columns

# A name that occurs both as a row label and as a column label is removed from the columns.
columns_to_drop = [name for name in first_row_column_names if name in first_column_row_names]
correlation_matrix = correlation_matrix.drop(columns=columns_to_drop)

# Remove rows, then columns, that consist only of NaN.
correlation_matrix = (
    correlation_matrix
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

# Show the result and write it to disk.
print("Updated Correlation Matrix:\n", correlation_matrix)
print(correlation_matrix.shape)
correlation_matrix.to_csv('updated_correlation_matrix_between_Original_and_Encoded_Features.csv')
print("Updated correlation matrix saved to 'updated_correlation_matrix_between_Original_and_Encoded_Features.csv'")



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the saved cross-correlation table; its first CSV column holds the row labels.
file_path = 'updated_correlation_matrix_between_Original_and_Encoded_Features.csv'
data = pd.read_csv(file_path, index_col=0)

# Very large canvas (300 x 200 inches); the font sizes below are scaled to match.
fig, ax = plt.subplots(figsize=(300, 200))
sns.heatmap(
    data,
    cmap='coolwarm',
    annot=False,
    center=0,
    annot_kws={"size": 100},
    cbar=True,
    ax=ax,
)
ax.set_title('Updated Correlation Matrix Heatmap between MFF and Physicochemical properties features', fontsize=40)
ax.set_xlabel('Physicochemical properties', fontsize=100)
ax.set_ylabel('MFF', fontsize=100)

# Re-apply the existing tick labels with larger fonts.
ax.set_xticklabels(ax.get_xticklabels(), fontsize=60)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=80)

# Enlarge the tick labels on the colour bar attached to the heatmap mesh.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=110)
plt.show()

In [None]:
import pandas as pd
import os
from openpyxl import load_workbook
from openpyxl.styles import Alignment

# Cross-correlation table saved by an earlier cell; the first CSV column
# holds the row labels, so it is used as the index.
correlation_matrix_path = 'updated_correlation_matrix_between_Original_and_Encoded_Features.csv'
updated_correlation_matrix = pd.read_csv(
    correlation_matrix_path,
    index_col=0,
)

def correlate_features(feature_to_search, threshold_similar=0.7, threshold_explain=0.4):
    """Look up the descriptors most strongly correlated with ``feature_to_search``.

    Reads three notebook-level frames: ``reordered_RDkit`` (only its column
    labels), ``reordered_RDkit_corr`` (square correlation matrix over those
    labels) and ``updated_correlation_matrix`` (cross-correlation table loaded
    from CSV).

    Parameters
    ----------
    feature_to_search : hashable
        Label of the descriptor to look up.
    threshold_similar : float, default 0.7
        A column of ``reordered_RDkit_corr`` is reported as *similar* when its
        absolute correlation with the feature exceeds this value.
    threshold_explain : float, default 0.4
        An entry of ``updated_correlation_matrix`` is reported as *explaining*
        when its absolute value exceeds this value.

    Returns
    -------
    tuple of (list, list)
        ``(similar_features, explain_features)``.

        * Feature is a column of ``reordered_RDkit``: both lists hold
          ``(label, correlation)`` tuples sorted by decreasing absolute
          correlation (ties keep discovery order); a list with no hit is
          ``[('None', 'None')]``.
        * Otherwise, feature is a row label of ``updated_correlation_matrix``:
          ``([], ['MFF descriptor itself'])``.
        * Feature found nowhere: ``([], [])``.
    """
    def by_strength(pair):
        return abs(pair[1])

    def strong_entries(values):
        # NaN entries are skipped explicitly; they carry no information.
        return [
            (label, value)
            for label, value in values.items()
            if pd.notna(value) and abs(value) > threshold_explain
        ]

    similar_features = []
    explain_features = []

    if feature_to_search in reordered_RDkit.columns:
        similarity = reordered_RDkit_corr.loc[feature_to_search, :]
        similar_features = [
            (rdkit, similarity[rdkit])
            for rdkit in reordered_RDkit.columns
            if rdkit != feature_to_search and abs(similarity[rdkit]) > threshold_similar
        ]
        similar_features.sort(key=by_strength, reverse=True)
        if not similar_features:
            similar_features.append(('None', 'None'))

        # The feature may label a row and/or a column of the cross-correlation
        # table; collect the strong entries from whichever axis contains it.
        if feature_to_search in updated_correlation_matrix.index:
            explain_features.extend(
                strong_entries(updated_correlation_matrix.loc[feature_to_search, :])
            )
        if feature_to_search in updated_correlation_matrix.columns:
            explain_features.extend(
                strong_entries(updated_correlation_matrix.loc[:, feature_to_search])
            )

        # Order-preserving de-duplication: going through set() would make the
        # order of equally strong entries depend on string-hash randomization.
        explain_features = list(dict.fromkeys(explain_features))
        explain_features.sort(key=by_strength, reverse=True)
        if not explain_features:
            explain_features.append(('None', 'None'))

    elif feature_to_search in updated_correlation_matrix.index:
        explain_features = ['MFF descriptor itself']

    return similar_features, explain_features


In [None]:
# Response analysed in this cell. It selects the SHAP input folder and labels
# the outputs, so switching to another target is a one-line change.
target_name = 'n-PrOH'

# CSV whose first column lists the descriptors to look up.
features_to_search_path = f'S-all metrics/shap_results-{target_name}/MLP-top-features.csv'
features_to_search_df = pd.read_csv(features_to_search_path, encoding='utf-8')
# CSV with one 'Feature' / 'Effect' pair per descriptor.
product_info_path = f'S-all metrics/shap_results-{target_name}/MLP-features-effects.csv'
product_info_df = pd.read_csv(product_info_path, encoding='utf-8')

# Debug information: column names as read from disk.
print("Features to search columns:", features_to_search_df.columns)
print("Product info columns:", product_info_df.columns)

# Replace '/' and ' ' in the column names with '_' so they are safe to reference below.
features_to_search_df.columns = features_to_search_df.columns.str.replace('/', '_').str.replace(' ', '_')
product_info_df.columns = product_info_df.columns.str.replace('/', '_').str.replace(' ', '_')

# Debug information: column names after the replacement.
print("Processed features to search columns:", features_to_search_df.columns)
print("Processed product info columns:", product_info_df.columns)

result_columns = ['Descriptor of interest', f'Effect on {target_name}', 'Similar Features', 'MFF']

# One small frame per descriptor; they are concatenated once at the end.
results_list = []

# The descriptor names are taken from the first column by position (.iloc):
# `row[0]` on a label-indexed Series is a deprecated positional fallback.
for feature_to_search in features_to_search_df.iloc[:, 0]:
    similar_features, explain_features = correlate_features(feature_to_search)

    product_info = product_info_df.loc[product_info_df['Feature'] == feature_to_search, 'Effect'].values
    product_value = product_info[0] if len(product_info) > 0 else ''

    # Every entry of similar_features must be a (name, value) pair.
    similar_features = [item if isinstance(item, tuple) else ('None', 'None') for item in similar_features]

    # The descriptor gets as many rows as its longest list; shorter lists are padded with ''.
    max_len = max(len(similar_features), len(explain_features), 1)
    similar_cells = [f"{feature}, {correlation}" for feature, correlation in similar_features]
    explain_cells = [f"{item[0]}, {item[1]}" if isinstance(item, tuple) else item for item in explain_features]

    temp_df = pd.DataFrame({
        result_columns[0]: [feature_to_search] * max_len,
        result_columns[1]: [product_value] * max_len,
        result_columns[2]: similar_cells + [''] * (max_len - len(similar_cells)),
        result_columns[3]: explain_cells + [''] * (max_len - len(explain_cells)),
    })

    results_list.append(temp_df)

# pd.concat raises on an empty list, so an empty feature file yields an empty table instead.
if results_list:
    results_df = pd.concat(results_list, ignore_index=True)
else:
    results_df = pd.DataFrame(columns=result_columns)

csv_save_dir = '20240611-S-data explanation'
os.makedirs(csv_save_dir, exist_ok=True)

csv_file_path = os.path.join(csv_save_dir, f'{target_name}-features_analysis_results.csv')
results_df.to_csv(csv_file_path, index=False)

excel_file_path = os.path.join(csv_save_dir, f'{target_name}-features_analysis_results.xlsx')
results_df.to_excel(excel_file_path, index=False, sheet_name='Analysis Results')

def _vertical_runs(sheet, key_columns, first_row=2):
    """Yield ``(start_row, end_row)`` for each maximal run of consecutive rows
    whose values in all ``key_columns`` (1-based column numbers) are equal.

    Rows are scanned from ``first_row`` (row 1 is the header) to ``sheet.max_row``.
    """
    run_start = first_row
    previous_key = None
    for row in range(first_row, sheet.max_row + 1):
        key = tuple(sheet.cell(row=row, column=column).value for column in key_columns)
        if row > first_row and key != previous_key:
            yield run_start, row - 1
            run_start = row
        previous_key = key
    if sheet.max_row >= first_row:
        yield run_start, sheet.max_row


wb = load_workbook(excel_file_path)
ws = wb['Analysis Results']

# All runs are computed BEFORE anything is merged, because merging turns the
# lower cells of a range into value-less merged cells.
# Column A (descriptor): runs of identical descriptor names.
# Column B (effect): runs of identical (descriptor, effect) pairs, so that a
# merged effect cell never spans two different descriptors that merely happen
# to share the same effect value.
runs_by_column = {
    1: list(_vertical_runs(ws, key_columns=(1,))),
    2: list(_vertical_runs(ws, key_columns=(1, 2))),
}

for column, runs in runs_by_column.items():
    for start_row, end_row in runs:
        top_cell = ws.cell(row=start_row, column=column)
        if top_cell.value is None:
            # Runs of empty cells are left untouched.
            continue
        if end_row > start_row:
            # A one-row "merge" would be a no-op, so only real ranges are merged.
            ws.merge_cells(start_row=start_row, start_column=column, end_row=end_row, end_column=column)
        top_cell.alignment = Alignment(vertical='center')

wb.save(excel_file_path)
print(f"Results saved to '{csv_file_path}' and '{excel_file_path}'")
print(results_df)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from factor_analyzer import FactorAnalyzer, calculate_kmo, calculate_bartlett_sphericity

# Load the saved cross-correlation table; its first CSV column holds the row labels.
file_path = 'updated_correlation_matrix_between_Original_and_Encoded_Features.csv'
df = pd.read_csv(file_path, index_col=0)

# Keep only the rows that contain no NaN and no +/- infinity.
non_finite_values = [np.nan, np.inf, -np.inf]
rows_with_non_finite = df.isin(non_finite_values).any(axis=1)
df = df.loc[~rows_with_non_finite]

# Transpose: the original row labels become the columns of the frame that is
# passed to the adequacy tests below, and the original columns become its rows.
df_transposed = df.T

# Sampling-adequacy (KMO) and sphericity (Bartlett) checks on the transposed table.
kmo_all, kmo_model = calculate_kmo(df_transposed)
bartlett_chi_square, bartlett_p_value = calculate_bartlett_sphericity(df_transposed)

print(f"KMO Test: {kmo_model}")
print(f"Bartlett's Test: Chi-square={bartlett_chi_square}, p-value={bartlett_p_value}")

# Number of factors kept for the rotated solution. It is capped below by the
# number of variables so the loading / score tables always get matching labels.
N_ROTATED_FACTORS = 18

if bartlett_p_value < 0.05:
    print("The data is suitable for factor analysis.")

    n_variables = df_transposed.shape[1]
    n_rotated_factors = min(N_ROTATED_FACTORS, n_variables)
    factor_labels = [f'Factor{i+1}' for i in range(n_rotated_factors)]

    # Unrotated principal-factor solution with as many factors as variables;
    # it is only used to obtain the eigenvalues for the scree plot.
    Load_Matrix = FactorAnalyzer(n_factors=n_variables, rotation=None, method='principal')
    Load_Matrix.fit(df_transposed)

    # Scree plot: eigenvalue against factor number, to choose how many factors to keep.
    ev, v = Load_Matrix.get_eigenvalues()
    print('\n Correlation matrix eigenvalues：', ev)
    factor_numbers = range(1, n_variables + 1)
    plt.figure(figsize=(8, 8))
    plt.scatter(factor_numbers, ev)
    plt.plot(factor_numbers, ev)
    # The title previously contained invisible zero-width-space characters
    # (U+200B) between "eigenvalues" and "and"; they are removed here.
    plt.title('Changes in eigenvalues and number of factors', fontdict={'weight': 'normal', 'size': 25})
    plt.xlabel('Factors', fontdict={'weight': 'normal', 'size': 15})
    plt.ylabel('Eigenvalues', fontdict={'weight': 'normal', 'size': 15})
    plt.grid()
    plt.show()

    # Varimax-rotated principal-factor solution with the retained factors.
    Load_Matrix_rotated = FactorAnalyzer(rotation='varimax', n_factors=n_rotated_factors, method='principal')
    Load_Matrix_rotated.fit(df_transposed)
    f_contribution_var_rotated = Load_Matrix_rotated.get_factor_variance()
    matrices_var_rotated = pd.DataFrame()
    matrices_var_rotated["Eigenvalue"] = f_contribution_var_rotated[0]
    matrices_var_rotated["Variance Contribution"] = f_contribution_var_rotated[1]
    matrices_var_rotated["Cumulative contribution of variance"] = f_contribution_var_rotated[2]
    print("Contribution rate of the rotated load matrix")
    print(matrices_var_rotated)
    print("Rotated component matrix")
    print(Load_Matrix_rotated.loadings_)

    # Heatmap of the ABSOLUTE rotated loadings (rows: variables, columns: factors).
    # NOTE: Load_Matrix is rebound here from the fitted analyzer to the loadings
    # array; the name is kept as-is for any later cell that relies on it.
    Load_Matrix = Load_Matrix_rotated.loadings_
    df_loadings = pd.DataFrame(np.abs(Load_Matrix), index=df_transposed.columns, columns=factor_labels)

    # From here on every figure uses Times New Roman (global rcParams setting).
    plt.rcParams['font.family'] = 'Times New Roman'
    plt.figure(figsize=(100, 100))
    ax = sns.heatmap(df_loadings, center=0, annot=True, cmap="coolwarm", cbar=False, fmt=".2f", annot_kws={"size": 30})
    ax.xaxis.set_tick_params(labelsize=40)
    ax.yaxis.set_tick_params(labelsize=40)
    plt.title('Factor Loadings Matrix', fontsize=40)
    plt.ylabel('Variables', fontsize=40)
    plt.xlabel('Factors', fontsize=40)
    plt.show()

    # Factor scores: one row per row of df_transposed, one column per retained factor.
    factor_scores = Load_Matrix_rotated.transform(df_transposed)
    df_scores = pd.DataFrame(factor_scores, index=df_transposed.index, columns=factor_labels)

    print("Factor Scores:")
    print(df_scores)

    scores_file_path = 'factor_scores.csv'
    df_scores.to_csv(scores_file_path)
    print(f"Factor scores saved to {scores_file_path}")

    # Heatmap of the factor scores.
    plt.figure(figsize=(100, 200))
    ax = sns.heatmap(df_scores, annot=True, center=0, cmap="coolwarm", cbar=False, fmt=".2f", annot_kws={"size": 30})
    plt.title('Factor Scores of molecular physicochemical properties', fontsize=40)
    plt.xlabel('Factors', fontsize=40)
    plt.ylabel('Physicochemical properties', fontsize=40)
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=40)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=40)
    plt.show()
else:
    print("The data is not suitable for factor analysis.")




