In [None]:
# t-distributed Stochastic Neighbor Embedding (t-SNE)
# Replace 'NAME.csv' below with the path to your own dataset file.

import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset.
# DATA_PATH is the CSV file to analyse; point it at your own file.
DATA_PATH = 'NAME.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() ends the kernel
    # session (discarding all state), whereas an exception simply stops this
    # cell -- and "Run All" -- with a readable message.
    raise FileNotFoundError(
        f"Erro: Arquivo '{DATA_PATH}' não encontrado. Por favor, verifique o nome e o caminho do arquivo."
    ) from err


# Column layout assumed by this notebook: column 0 holds the class label and
# every remaining column is a spectral feature (one per wavenumber).
classes, spectra = df.iloc[:, 0], df.iloc[:, 1:]

# Array forms handed to the estimators below: X = feature matrix, y = labels.
X, y = spectra.values, classes.values

# Perform t-SNE
# Adjust n_components, perplexity and the iteration budget as needed.

# t-SNE requires perplexity < n_samples; small spectral datasets can easily
# have fewer than 31 rows, so cap the requested value of 30 accordingly.
tsne_perplexity = min(30, X.shape[0] - 1)
tsne_params = dict(n_components=2, random_state=42, perplexity=tsne_perplexity)

try:
    # scikit-learn >= 1.5 names the iteration budget `max_iter`; the former
    # `n_iter` spelling was deprecated there and removed in later releases.
    tsne = TSNE(max_iter=300, **tsne_params)
except TypeError:
    # Older scikit-learn releases only accept the original `n_iter` keyword.
    tsne = TSNE(n_iter=300, **tsne_params)

# 2-D coordinates, one row per spectrum, in the same order as X.
X_embedded = tsne.fit_transform(X)

# Scatter plot of the 2-D embedding, one colour per class.
fig, ax = plt.subplots(figsize=(10, 8))

# Distinct class labels present in y (np.unique returns them sorted).
unique_classes = np.unique(y)

# Fixed palette of matplotlib single-letter colours; it is cycled through when
# there are more classes than entries (a colormap would scale better).
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
n_colors = len(colors)
n_classes = len(unique_classes)
if n_classes > n_colors:
    print(f"Aviso: Existem mais classes ({n_classes}) do que cores disponíveis ({n_colors}). Algumas classes terão a mesma cor.")

for position, class_label in enumerate(unique_classes):
    # Boolean mask selecting the embedded points that belong to this class.
    mask = (y == class_label)
    ax.scatter(
        X_embedded[mask, 0],
        X_embedded[mask, 1],
        label=f'Classe {class_label}',
        color=colors[position % n_colors],
        alpha=0.7,
    )

ax.set_title('t-SNE of FTIR Spectra')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Classification Report

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Assumes X (feature matrix) and y (class labels) were created by the t-SNE cell.

# Split the data into training and testing sets.
# stratify=y keeps the class proportions equal in both partitions, so that
# small classes are not accidentally left out of the test set (the LIME cell
# further down splits the same way).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Train a classifier (RandomForestClassifier is used as an example).
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predicted labels for the held-out test set.
y_pred = model.predict(X_test)

# Generate the classification report as a DataFrame (one row per class plus
# the accuracy / macro avg / weighted avg summary rows).
# zero_division=0 keeps a metric at 0 for classes that were never predicted
# without emitting an undefined-metric warning.
report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
df_report = pd.DataFrame(report).transpose()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Build the strings shown in the table. The raw DataFrame is not suitable as-is:
#   * floats would be printed with full precision (e.g. 0.9166666666666666);
#   * the scalar 'accuracy' entry is broadcast across all four columns, so its
#     precision/recall/support cells would misleadingly repeat the accuracy.
# The layout below mirrors the plain-text report: the accuracy row only fills
# the f1-score column and shows the total number of test samples as support.
metric_columns = ['precision', 'recall', 'f1-score']
table_columns = metric_columns + ['support']
table_text = []
for row_name, row in df_report.iterrows():
    if row_name == 'accuracy':
        table_text.append(['', '', f"{accuracy:.2f}", str(len(y_test))])
    else:
        table_text.append(
            [f"{row[col]:.2f}" for col in metric_columns] + [str(int(row['support']))]
        )

# Plot the classification report as a table; figure height grows with the
# number of report rows.
fig, ax = plt.subplots(figsize=(10, len(df_report) * 0.5))
ax.axis('off')
ax.axis('tight')

# Create the table
table = ax.table(cellText=table_text,
                 colLabels=table_columns,
                 rowLabels=list(df_report.index),
                 cellLoc='center',
                 loc='center')

# Fixed font size plus a mild scale-up for readability.
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.2)

plt.title('Classification Report', fontsize=14, y=1.05)
plt.show()

print(f"Accuracy: {accuracy:.4f}")


In [None]:
# Local Interpretable Model-agnostic Explanations

!pip install lime

import lime
import lime.lime_tabular
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# A RandomForestClassifier is fitted here purely as a demonstration model.
# If you already have a trained model, load it here instead of fitting one.

# Stratified 80/20 split of the spectra into training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the classifier (replace with your actual model training).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Optional sanity check: accuracy on the held-out partition.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {test_accuracy}")

# LIME Explanation
# Create a LIME explainer.
# feature_names are the column headers of the spectral data.
feature_names = spectra.columns.tolist()

# The columns returned by model.predict_proba follow model.classes_, so the
# display names are taken from the fitted model itself rather than from a
# variable left behind by the plotting cell.
class_names = [str(cls) for cls in model.classes_]

explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=feature_names,
    class_names=class_names,
    mode='classification',  # Or 'regression' if it's a regression problem
    random_state=42  # LIME samples random perturbations; seed them for reproducible explanations
)

# Choose an instance from the test set to explain (e.g., the first instance)
instance_idx = 0
instance_to_explain = X_test[instance_idx]

# Explain the instance.
# top_labels=1 explains the class the model actually predicts for this row.
# Without it LIME falls back to its default labels=(1,), i.e. it always
# explains class index 1, whatever the prediction is.
explanation = explainer.explain_instance(
    data_row=instance_to_explain,
    predict_fn=model.predict_proba,  # Use predict_proba for classification
    num_features=5,  # Number of features to show in the explanation
    top_labels=1
)

# Visualize the explanation
# In a Jupyter/Colab notebook, this will display an interactive visualization
explanation.show_in_notebook(show_table=True, show_all=False)

# The explanation is also available as a list of (feature, weight) tuples.
# Pass the explained label explicitly, because as_list() defaults to label 1:
# explained_label = explanation.available_labels()[0]
# print(explanation.as_list(label=explained_label))

# Or as a static matplotlib figure:
# explanation.as_pyplot_figure(label=explained_label)



In [None]:
# SHapley Additive exPlanations

import numpy as np
!pip install shap

import shap

# SHAP Explanation
# Build the SHAP explainer for `model`, the RandomForestClassifier trained in
# the previous cell. TreeExplainer is the explainer for tree-based models;
# other model families (linear models, SVMs, ...) may need KernelExplainer or
# DeepExplainer instead.
try:
    explainer_shap = shap.TreeExplainer(model)
except Exception as tree_error:
    print("Could not create SHAP TreeExplainer. Make sure your model is compatible.")
    print(tree_error)
    # KernelExplainer is the more general (but slower) fallback.
    print("Attempting to use KernelExplainer as a fallback.")
    # It needs a background dataset; a sample of the training data is a common
    # choice. Adjust the sample size as needed.
    background_data = shap.sample(X_train, 100)
    # Prefer class probabilities when the model exposes them.
    if hasattr(model, 'predict_proba'):
        kernel_predict_fn = model.predict_proba
    else:
        kernel_predict_fn = model.predict
    explainer_shap = shap.KernelExplainer(kernel_predict_fn, background_data)


# Calculate SHAP values for the test set
# This can take some time depending on the size of your test set and model complexity
print("Calculating SHAP values...")
shap_values = explainer_shap.shap_values(X_test)

# Newer SHAP releases return multi-class results as a single 3-D array shaped
# (n_samples, n_features, n_classes) instead of a list with one
# (n_samples, n_features) array per class. Convert that layout to the
# per-class list so the plotting code below, which checks
# isinstance(shap_values, list), behaves the same on every SHAP version.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, class_pos] for class_pos in range(shap_values.shape[2])]
print("SHAP values calculated.")

# Visualize the SHAP results

# Example 1: Summary Plot
# Shows the distribution of SHAP values for each feature across the test set,
# which gives an overview of the overall impact of every feature.
# When shap_values is a list it holds one array per class output
# (shap_values[i] belongs to the i-th class); the first class is plotted here
# as an example -- pick another index or aggregate if that suits you better.
if isinstance(shap_values, list):
    print("Generating SHAP summary plot for the first class...")
    summary_values = shap_values[0]
else:
    print("Generating SHAP summary plot...")
    summary_values = shap_values

shap.summary_plot(summary_values, X_test, feature_names=feature_names)


# Example 2: Force Plot (explains a single prediction)
# Shows how each feature pushes the prediction away from the base value
# (the average model output) towards the output for the explained instance.
instance_idx_to_explain_shap = 0  # Same instance as the LIME example
instance_row = X_test[instance_idx_to_explain_shap]

# For multi-class models the plot is drawn for the class the model predicts.
# model.predict returns an array of labels; take the single element.
predicted_class_idx = model.predict([instance_row])[0]

# Per-class SHAP outputs are ordered like model.classes_, so the position of
# the predicted label is looked up there (falling back to unique_classes for
# models that do not expose classes_).
class_order = list(getattr(model, 'classes_', unique_classes))
try:
    predicted_class_shap_index = class_order.index(predicted_class_idx)
except ValueError:
    print(f"Warning: Predicted class '{predicted_class_idx}' not found in unique classes. Using index 0 for force plot.")
    predicted_class_shap_index = 0

print(f"Generating SHAP force plot for instance {instance_idx_to_explain_shap} (predicted class: {predicted_class_idx})...")

# Select the base value and the 1-D SHAP vector for this instance. Three
# layouts are possible depending on the SHAP version and the model:
if isinstance(shap_values, list):
    # list with one (n_samples, n_features) array per class
    base_value = explainer_shap.expected_value[predicted_class_shap_index]
    instance_shap_values = shap_values[predicted_class_shap_index][instance_idx_to_explain_shap]
elif np.ndim(shap_values) == 3:
    # single array shaped (n_samples, n_features, n_classes)
    base_value = explainer_shap.expected_value[predicted_class_shap_index]
    instance_shap_values = shap_values[instance_idx_to_explain_shap, :, predicted_class_shap_index]
else:
    # single-output model: (n_samples, n_features)
    base_value = explainer_shap.expected_value
    instance_shap_values = shap_values[instance_idx_to_explain_shap]

# matplotlib=True draws the plot immediately as a static figure. The default
# JavaScript version only renders when the returned object is the cell's
# displayed value (and shap.initjs() has been called), which never happened
# here because the call was buried inside an if/else block.
shap.force_plot(base_value,
                instance_shap_values,
                instance_row,
                feature_names=feature_names,
                matplotlib=True)


# Example 3: Dependence Plot
# Shows how the value of one feature relates to its SHAP value (its impact on
# the model output) across the test set. The feature plotted is the one with
# the largest mean absolute SHAP value. For per-class (list) SHAP values the
# first class is used, consistent with the summary plot.
if isinstance(shap_values, list):
    dependence_values = shap_values[0]
else:
    dependence_values = shap_values

# Mean |SHAP| per feature over all instances, then the strongest feature.
mean_abs_shap_values = np.abs(dependence_values).mean(axis=0)
feature_to_plot_idx = np.argmax(mean_abs_shap_values)

feature_to_plot_name = feature_names[feature_to_plot_idx]
print(f"Generating SHAP dependence plot for feature: {feature_to_plot_name}")

# interaction_index=None disables colouring by an interaction feature.
shap.dependence_plot(
    feature_to_plot_idx,
    dependence_values,
    X_test,
    feature_names=feature_names,
    interaction_index=None,
)


# Other SHAP plots include decision plots, waterfall plots, etc. You can explore the SHAP documentation for more options.
# https://shap.readthedocs.io/en/latest/api.html