In [None]:
# PCA
# Replace 'NAME.csv' below with the path to your own dataset.

import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the dataset (replace 'NAME.csv' with the path to your own file)
df = pd.read_csv('NAME.csv')

# Assumes the first column holds the class label and every remaining column
# is one spectral variable -- adjust the slices if your file is laid out differently.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# Perform PCA
# Two components are enough for a 2-D scatter plot of the samples
n_components = 2
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Create a DataFrame with the PCA scores, one row per sample
pca_df = pd.DataFrame(data=X_pca, columns=[f'Principal Component {i+1}' for i in range(n_components)])
# Attach the labels by position: X_pca rows follow the row order of df, so this
# stays correct even if df carries a non-default index.
pca_df['class'] = y.to_numpy()

# Visualize the PCA results
plt.figure(figsize=(8, 6))
classes = y.unique()
# Draw one color per class from a qualitative colormap so that any number of
# classes can be plotted (a fixed 3-color list fails on the 4th class).
cmap = plt.get_cmap('tab10' if len(classes) <= 10 else 'tab20')
colors = [cmap(i % cmap.N) for i in range(len(classes))]

for i, class_name in enumerate(classes):
    class_mask = pca_df['class'] == class_name
    # `color=` (not `c=`) because each entry is a single RGBA tuple
    plt.scatter(pca_df.loc[class_mask, 'Principal Component 1'],
                pca_df.loc[class_mask, 'Principal Component 2'],
                color=colors[i],
                label=class_name)

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of FTIR Spectra')
plt.legend()
plt.grid(True)
plt.show()

# Optional: Display the explained variance ratio
print(f'Explained variance ratio by component: {pca.explained_variance_ratio_}')
print(f'Total explained variance: {sum(pca.explained_variance_ratio_)}')


In [None]:
# Confusion Matrix

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a Random Forest as a simple demonstration classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the held-out samples
y_pred = model.predict(X_test)

# Build the confusion matrix with rows/columns ordered like model.classes_
cm = confusion_matrix(
    y_test,
    y_pred,
    labels=model.classes_,
)

# Render the matrix as a heat map labelled with the class names
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model.classes_,
)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Classification Report

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import numpy as np

# Generate the classification report as a nested dict:
# {class_label: {metric: value}, 'accuracy': float, 'macro avg': {...}, 'weighted avg': {...}}
report = classification_report(y_test, y_pred, output_dict=True)

# Prepare data for the table
metrics = ['precision', 'recall', 'f1-score', 'support']
# Select the per-class entries by name instead of relying on the position of
# the summary keys at the end of the dict.
summary_keys = ('accuracy', 'macro avg', 'weighted avg')
classes = [key for key in report if key not in summary_keys]

# Number of evaluated samples; the averaged rows report this as their support
total_support = report['macro avg']['support']

data = []
for cls in classes:
    row = [cls]
    for metric in metrics[:-1]:
        row.append(f"{report[cls][metric]:.4f}")
    # Support is a sample count, so show it without decimals
    row.append(f"{report[cls]['support']:.0f}")
    data.append(row)

# Add macro avg, weighted avg and accuracy
macro_avg_row = ['macro avg']
for metric in metrics[:-1]:
    macro_avg_row.append(f"{report['macro avg'][metric]:.4f}")
macro_avg_row.append(f"{report['macro avg']['support']:.0f}")

weighted_avg_row = ['weighted avg']
for metric in metrics[:-1]:
    weighted_avg_row.append(f"{report['weighted avg'][metric]:.4f}")
weighted_avg_row.append(f"{report['weighted avg']['support']:.0f}")

# Accuracy is a single scalar: show it in the f1-score column and use the total
# number of samples as its support (not the accuracy value itself).
accuracy_row = ['accuracy', '', '', f"{report['accuracy']:.4f}", f"{total_support:.0f}"]

data.append(macro_avg_row)
data.append(weighted_avg_row)
data.append(accuracy_row)


# Create a DataFrame for the table
df_report = pd.DataFrame(data, columns=['class'] + metrics)

# Create the figure and axis
fig, ax = plt.subplots(figsize=(8, 4))  # Adjust size as needed

# Hide the axes so only the table is drawn
ax.axis('off')
ax.axis('tight')

# Create the table
table = ax.table(cellText=df_report.values,
                 colLabels=df_report.columns,
                 cellLoc='center',
                 loc='center')

# Style the table (optional)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.2)  # Adjust scale as needed

# Set title
ax.set_title('Classification Report', fontsize=14)

# Display the table
plt.show()



In [None]:
# contribution of each variable (Cos2).

import matplotlib.pyplot as plt
import numpy as np # Move the import statement to the top

# Squared cosines (Cos2) of each variable on each principal component.
#
# Loadings are the eigenvectors (rows of pca.components_) scaled by the square
# root of the matching eigenvalue (pca.explained_variance_); the transpose gives
# shape (n_variables, n_components).
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Cos2 of variable i on PC j is the squared correlation between them:
#     cos2_ij = loading_ij**2 / var(x_i)
# loading**2 alone equals cos2 only when every variable has unit variance, and
# X is not standardized before PCA, so divide by the variable's variance.
# ddof=1 matches the (n_samples - 1) normalisation of explained_variance_.
variable_variance = X.var(axis=0, ddof=1).to_numpy()[:, np.newaxis]
# Constant variables (zero variance) carry no information; report cos2 = 0 for them
cos2 = np.divide(loadings**2, variable_variance,
                 out=np.zeros_like(loadings), where=variable_variance > 0)

# Sum over the retained components: the share of each variable's variance that
# the selected principal components reproduce (between 0 and 1).
total_cos2_per_variable = np.sum(cos2, axis=1)

# Spectra often have hundreds of variables; label at most ~40 ticks so the
# x-axis stays readable.
n_variables = X.shape[1]
variable_positions = np.arange(n_variables)
tick_positions = variable_positions[::max(1, n_variables // 40)]

plt.figure(figsize=(10, 6))
plt.bar(variable_positions, total_cos2_per_variable)
plt.xlabel('Variable Index')
plt.ylabel('Total Cos2 (Contribution to PCs)')
plt.title('Contribution of Each Variable to the Principal Components')
plt.xticks(tick_positions, rotation=90)
plt.grid(axis='y')
plt.show()

# Same information split by component: PC 2 is stacked on top of PC 1
plt.figure(figsize=(12, 6))
plt.bar(variable_positions, cos2[:, 0], label='PC 1')
plt.bar(variable_positions, cos2[:, 1], bottom=cos2[:, 0], label='PC 2')
plt.xlabel('Variable Index')
plt.ylabel('Cos2 (Contribution)')
plt.title('Contribution of Each Variable to PC1 and PC2')
plt.xticks(tick_positions, rotation=90)
plt.legend()
plt.grid(axis='y')
plt.show()

In [None]:
# SHapley Additive exPlanations

!pip install shap

import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Assuming you have a trained model. Let's train a simple classifier as an example.
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Create an explainer object.
# For tree models, shap.TreeExplainer is recommended.
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for the test set.
shap_values = explainer.shap_values(X_test)

# Assuming a multi-class classification, shap_values will be a list of arrays,
# one for each class. Let's pick the SHAP values for the first class as an example.
# You might need to choose the class that is of most interest for your analysis.
# shap_values_class_0 = shap_values[0]

# Summarize the effects of all the features.
# This plot shows the importance of each feature.
shap.summary_plot(shap_values, X_test)

# Another way to summarize is using a bar plot.
# This plot shows the average absolute SHAP value for each feature.
# If you have multi-class output, you might need to specify which class you are interested in,
# or use a different visualization type suitable for multi-class.
# shap.summary_plot(shap_values, X_test, plot_type="bar")

# Visualize the SHAP values for a single prediction.
# Choose an instance from the test set (e.g., the first instance).
# instance_index = 0
# shap.initjs() # Initialize JavaScript for interactive plots
# shap.force_plot(explainer.expected_value[0], shap_values_class_0[instance_index,:], X_test.iloc[instance_index,:])

# To visualize contributions for a specific class prediction (e.g., class 0):
# shap.force_plot(explainer.expected_value[0], shap_values[0][instance_index,:], X_test.iloc[instance_index,:])

# For multi-class, you can visualize the force plot for a specific class or a combination.
# Here's an example for the first instance, visualizing contributions to the prediction of class 0
# shap.initjs()
# shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:])

# You can also visualize the dependence of a feature's effect on its value.
# Choose a feature to plot (e.g., the first feature).
# feature_index_to_plot = 0
# shap.dependence_plot(feature_index_to_plot, shap_values[0], X_test)

# If you have multi-class, you might need to specify the class
# shap.dependence_plot(feature_index_to_plot, shap_values[0], X_test, interaction_index=None) # for class 0

# You might want to explore different SHAP plots based on your specific model and analysis goals.
# The interpretation of SHAP values can vary slightly depending on the model type and the explainer used.
# Refer to the SHAP documentation for more details on different plot types and interpretations:
# https://shap.readthedocs.io/en/latest/api.html#plots


In [None]:
# Local Interpretable Model-agnostic Explanations

!pip install lime

import lime
import lime.lime_tabular

# Assuming you have a trained model.
# We already have a RandomForestClassifier trained as `model`.

# Create a LIME explainer
# We need to provide the training data, the feature names, and the class names.
# The mode is 'classification' for classification problems.
# The discretize_continuous flag is important for LIME.
explainer_lime = lime.lime_tabular.LimeTabularExplainer(training_data=X_train.values,
                                                        feature_names=X_train.columns.tolist(),
                                                        class_names=model.classes_.tolist(),
                                                        mode='classification',
                                                        discretize_continuous=True)

# Explain a single instance
# Choose an instance from the test set to explain (e.g., the first instance).
instance_to_explain_index = 0
instance_to_explain = X_test.iloc[instance_to_explain_index]

# Get the explanation for the instance.
# num_features controls how many features are included in the explanation.
explanation = explainer_lime.explain_instance(data_row=instance_to_explain.values,
                                              predict_fn=model.predict_proba,
                                              num_features=5)

# Print the explanation.
print(f"Explanation for instance {instance_to_explain_index}:")
print(explanation.as_list())

# Visualize the explanation
# You can also save the explanation as an HTML file.
explanation.show_in_notebook(show_table=True, show_all=False)

# You can get the list of (feature, weight) tuples
# lime_features = explanation.as_list()

# For a specific prediction (e.g., the predicted class):
predicted_class_index = model.predict(instance_to_explain.values.reshape(1, -1))[0]
# If your target is not integer encoded, you might need to map the index back to the class name
# predicted_class_name = model.classes_[predicted_class_index]

# To get explanation for a specific class prediction:
# explanation_for_class = explainer_lime.explain_instance(data_row=instance_to_explain.values,
#                                                          predict_fn=model.predict_proba,
#                                                          num_features=5,
#                                                          top_labels=None, # Set to 1 to explain the top predicted class only
#                                                          labels=(predicted_class_index,)) # Specify the label you want to explain

# explanation_for_class.show_in_notebook(show_table=True, show_all=False)

# LIME provides local explanations for individual predictions.
# The interpretation relies on understanding how perturbing the feature values around the instance
# affects the model's prediction. The explanation highlights which features are most influential
# in predicting the specific outcome for that specific instance.

In [None]:
# Other PCs combination.

import pandas as pd
import matplotlib.pyplot as plt
# Imported here as well so this cell does not depend on the import in the first PCA cell
from sklearn.decomposition import PCA

# Requires X (features) and y (labels) from the data-loading cell.
# Keep up to four components (PC1..PC4); PCA cannot return more components than
# there are samples or features, so clamp for small datasets.
# NOTE: this rebinds n_components / pca / X_pca / pca_df from the 2-component cell.
n_components = min(4, *X.shape)
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Create a DataFrame with the PCA scores, one row per sample
pca_df = pd.DataFrame(data=X_pca, columns=[f'PC{i+1}' for i in range(n_components)])
# Attach the labels by position so a non-default index on y cannot misalign them
pca_df['class'] = y.to_numpy()

# One color per class, drawn from a qualitative colormap so any number of
# classes is supported (a fixed color list fails once classes outnumber it).
classes = y.unique()
cmap = plt.get_cmap('tab10' if len(classes) <= 10 else 'tab20')
colors = [cmap(k % cmap.N) for k in range(len(classes))]

# The row selection per class is the same in every subplot; compute it once
class_masks = [pca_df['class'] == class_name for class_name in classes]

# Grid of scatter plots: row i / column j shows PC(j+1) on x against PC(i+1) on y.
# squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
fig, axes = plt.subplots(n_components, n_components, figsize=(20, 20), squeeze=False)

for i in range(n_components):
    for j in range(n_components):
        ax = axes[i, j]

        # Plot the samples of each class in its own color
        for k, class_name in enumerate(classes):
            ax.scatter(pca_df.loc[class_masks[k], f'PC{j+1}'],
                       pca_df.loc[class_masks[k], f'PC{i+1}'],
                       color=colors[k],  # `color=` because each entry is one RGBA tuple
                       label=class_name,
                       alpha=0.6, s=50)  # Transparency and size for better visualization

        # Set labels and title for each subplot
        ax.set_xlabel(f'PC{j+1}')
        ax.set_ylabel(f'PC{i+1}')
        ax.set_title(f'PC{j+1} vs PC{i+1}')

        # Add the legend once, outside the top-right subplot, to avoid repetition
        if i == 0 and j == n_components - 1:
            ax.legend(title='Class', bbox_to_anchor=(1.05, 1), loc='upper left')

# Leave the right 10% of the figure free for the legend
plt.tight_layout(rect=[0, 0, 0.9, 1])

# Show the plot
plt.show()

# Optional: Display the explained variance ratio
print(f'Explained variance ratio by component: {pca.explained_variance_ratio_}')
print(f'Total explained variance: {sum(pca.explained_variance_ratio_)}')

