In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score, confusion_matrix

# Regressão / Regression

## Com categoria / with label

In [None]:
# Read the dataset. On a missing file, `df` is set to None so that the
# following cells (which test `df is not None`) are skipped instead of failing.
try:
    df = pd.read_csv('NAME.csv')
except FileNotFoundError:
    df = None
    print("Error: NAME.csv not found. Please make sure the file is in the correct directory.")
    print("You can use `!ls -F` in a new cell to see available files if needed.")
else:
    print("CSV loaded successfully.")
    display(df.head())

In [None]:
if df is not None:
    # Separate features (X) and target (y) by column position.
    #
    # Columns before position 2 are left out of the features (the notebook's
    # note says the first column is categorical). The target column must be
    # left out as well: a plain `df.iloc[:, 2:]` slice would keep column 5
    # inside X, so the model would be handed the answer it is asked to predict.
    target_pos = 5
    first_feature_pos = 2
    feature_positions = [
        pos for pos in range(first_feature_pos, df.shape[1]) if pos != target_pos
    ]

    y = df.iloc[:, target_pos]         # Target
    X = df.iloc[:, feature_positions]  # Features: column 2 onwards, minus the target

    print(f"Shape of features (X): {X.shape}")
    print(f"Shape of target (y): {y.shape}")

    # Display first few rows of X and y
    print("Features (X) head:")
    display(X.head())
    print("Target (y) head:")
    display(y.head())

In [None]:
if df is not None:
    # Hold out 10% of the rows for testing; the fixed seed keeps the split
    # reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.1,
        random_state=42,
    )

    # Learn the standardisation (mean / std) from the training rows only and
    # apply that same transform to both partitions.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"Shape of scaled training features: {X_train_scaled.shape}")
    print(f"Shape of scaled testing features: {X_test_scaled.shape}")

In [None]:
if df is not None:
    # Build a PLS regression with 2 latent variables and train it on the
    # standardised training data. `fit` returns the estimator itself, so
    # construction and training are chained.
    pls = PLSRegression(n_components=2).fit(X_train_scaled, y_train)

    print("PLS Regression model trained successfully.")

In [None]:
if df is not None:
    # Predict the held-out rows.
    y_pred = pls.predict(X_test_scaled)

    # Error metrics on the test set (RMSE is derived from MSE).
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

    # Actual vs. predicted scatter; the dashed diagonal marks perfect
    # agreement over the observed range of the target.
    diag_range = [y_test.min(), y_test.max()]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred, alpha=0.7)
    ax.plot(diag_range, diag_range, 'k--', lw=2)
    ax.set_xlabel('Actual Class Values')
    ax.set_ylabel('Predicted Class Values')
    ax.set_title('PLS Regression: Actual vs. Predicted Values')
    ax.grid(True)
    plt.show()

In [None]:
def compute_vip_scores(x_weights, x_scores, y_loadings):
    """Return the Variable Importance in Projection (VIP) score of every feature.

    VIP_j = sqrt( p * sum_a( SS_a * (w_ja / ||w_a||)^2 ) / sum_a( SS_a ) )

    where p is the number of features and
    SS_a = (t_a' t_a) * (q_a' q_a) is the sum of squares of Y explained by
    latent component a (t_a: X scores, q_a: Y loadings).

    Parameters
    ----------
    x_weights : array-like of shape (n_features, n_components)
        X weight vectors, one column per component.
    x_scores : array-like of shape (n_samples, n_components)
        X scores of the training samples.
    y_loadings : array-like of shape (n_targets, n_components)
        Y loadings.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        VIP score per feature; the squared scores sum to n_features.
    """
    w = np.asarray(x_weights, dtype=float)
    t = np.asarray(x_scores, dtype=float)
    q = np.asarray(y_loadings, dtype=float)

    n_features = w.shape[0]
    # Y sum of squares explained by each component.
    ss_explained = np.sum(t ** 2, axis=0) * np.sum(q ** 2, axis=0)
    # Squared weights with every component's weight vector scaled to unit length.
    unit_weights_sq = (w / np.linalg.norm(w, axis=0)) ** 2

    return np.sqrt(n_features * (unit_weights_sq @ ss_explained) / np.sum(ss_explained))


# Look the names up instead of referencing them directly, so that a missing
# `df` or `pls` yields the guidance message below rather than a NameError.
if globals().get('df') is not None and 'pls' in globals():
    x_weights = pls.x_weights_  # Shape: (n_features, n_components)

    vip_scores = compute_vip_scores(x_weights, pls.x_scores_, pls.y_loadings_)

    # Create a DataFrame for better visualization, highest VIP first
    vip_df = pd.DataFrame({
        'Variable': X.columns,
        'VIP Score': vip_scores
    })
    vip_df = vip_df.sort_values(by='VIP Score', ascending=False).reset_index(drop=True)

    print("\nVariable Importance in Projection (VIP Scores):")
    display(vip_df)
else:
    print("PLS model not found or data not loaded. Please ensure previous cells ran successfully.")

## Sem categoria / without label

In [None]:
# Read the semicolon-separated dataset. On a missing file, `df` is set to None
# so that the following cells (which test `df is not None`) are skipped.
try:
    df = pd.read_csv('NAME.csv', sep=';')
except FileNotFoundError:
    df = None
    print("Error: NAME.csv not found. Please make sure the file is in the correct directory.")
    print("You can use `!ls -F` in a new cell to see available files if needed.")
else:
    print("CSV loaded successfully.")
    display(df.head())

In [None]:
if df is not None:
    # Separate features (X) and target (y) by column position.
    #
    # The target is the column at position 4 (the fifth column). Features
    # start at position 1, so column 0 is left out. The target column has to
    # be removed from the features too: a plain `df.iloc[:, 1:]` slice would
    # keep column 4 inside X and leak the target into the model.
    target_pos = 4
    first_feature_pos = 1
    feature_positions = [
        pos for pos in range(first_feature_pos, df.shape[1]) if pos != target_pos
    ]

    y = df.iloc[:, target_pos]         # Target
    X = df.iloc[:, feature_positions]  # Features: column 1 onwards, minus the target

    print(f"Shape of features (X): {X.shape}")
    print(f"Shape of target (y): {y.shape}")

    # Display first few rows of X and y
    print("Features (X) head:")
    display(X.head())
    print("Target (y) head:")
    display(y.head())

In [None]:
if df is not None:
    # 90/10 train/test partition with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.1,
        random_state=42,
    )

    # The scaler is fitted on the training partition only; the test partition
    # is transformed with those same training statistics.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"Shape of scaled training features: {X_train_scaled.shape}")
    print(f"Shape of scaled testing features: {X_test_scaled.shape}")

In [None]:
if df is not None:
    # Build a PLS regression with 5 latent variables and train it on the
    # standardised training data (`fit` returns the fitted estimator).
    pls = PLSRegression(n_components=5).fit(X_train_scaled, y_train)

    print("PLS Regression model trained successfully.")

In [None]:
if df is not None:
    # Predict the held-out rows.
    y_pred = pls.predict(X_test_scaled)

    # Error metrics on the test set (RMSE is derived from MSE).
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

    # Actual vs. predicted scatter; the dashed diagonal marks perfect
    # agreement over the observed range of the target.
    diag_range = [y_test.min(), y_test.max()]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred, alpha=0.7)
    ax.plot(diag_range, diag_range, 'k--', lw=2)
    ax.set_xlabel('Actual Class Values')
    ax.set_ylabel('Predicted Class Values')
    ax.set_title('PLS Regression: Actual vs. Predicted Values')
    ax.grid(True)
    plt.show()

In [None]:
def compute_vip_scores(x_weights, x_scores, y_loadings):
    """Return the Variable Importance in Projection (VIP) score of every feature.

    VIP_j = sqrt( p * sum_a( SS_a * (w_ja / ||w_a||)^2 ) / sum_a( SS_a ) )

    where p is the number of features and
    SS_a = (t_a' t_a) * (q_a' q_a) is the sum of squares of Y explained by
    latent component a (t_a: X scores, q_a: Y loadings).

    Parameters
    ----------
    x_weights : array-like of shape (n_features, n_components)
        X weight vectors, one column per component.
    x_scores : array-like of shape (n_samples, n_components)
        X scores of the training samples.
    y_loadings : array-like of shape (n_targets, n_components)
        Y loadings.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        VIP score per feature; the squared scores sum to n_features.
    """
    w = np.asarray(x_weights, dtype=float)
    t = np.asarray(x_scores, dtype=float)
    q = np.asarray(y_loadings, dtype=float)

    n_features = w.shape[0]
    # Y sum of squares explained by each component.
    ss_explained = np.sum(t ** 2, axis=0) * np.sum(q ** 2, axis=0)
    # Squared weights with every component's weight vector scaled to unit length.
    unit_weights_sq = (w / np.linalg.norm(w, axis=0)) ** 2

    return np.sqrt(n_features * (unit_weights_sq @ ss_explained) / np.sum(ss_explained))


# Look the names up instead of referencing them directly, so that a missing
# `df` or `pls` yields the guidance message below rather than a NameError.
if globals().get('df') is not None and 'pls' in globals():
    x_weights = pls.x_weights_  # Shape: (n_features, n_components)

    vip_scores = compute_vip_scores(x_weights, pls.x_scores_, pls.y_loadings_)

    # Create a DataFrame for better visualization, highest VIP first
    vip_df = pd.DataFrame({
        'Variable': X.columns,
        'VIP Score': vip_scores
    })
    vip_df = vip_df.sort_values(by='VIP Score', ascending=False).reset_index(drop=True)

    print("\nVariable Importance in Projection (VIP Scores):")
    display(vip_df)
else:
    print("PLS model not found or data not loaded. Please ensure previous cells ran successfully.")

# Discriminant Analysis

In [None]:
# ==========================
# 1️⃣ Load data
# ==========================
df = pd.read_csv("NAME.csv")
display(df.head())

# Column 0 is taken as the class label (target); every remaining column is
# used as a feature.
y_raw, X = df.iloc[:, 0], df.iloc[:, 1:]

In [None]:
# ==========================
# 2️⃣ Encode target variable
# ==========================
# Class labels -> integer codes (index into label_enc.classes_) ...
label_enc = LabelEncoder()
y_encoded = label_enc.fit_transform(y_raw)

# ... -> one indicator column per class, used as the multi-column response
# the PLS model is fitted against.
onehot = OneHotEncoder(sparse_output=False)
y_onehot = onehot.fit_transform(y_encoded.reshape(-1, 1))

# ==========================
# 3️⃣ Split train and test
# ==========================
# 80/20 split, stratified on the integer codes so both partitions keep the
# class proportions. The integer codes are split alongside the one-hot matrix
# so later cells can index rows by class.
X_train, X_test, y_train, y_test, y_encoded_train, y_encoded_test = train_test_split(
    X, y_onehot, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

# The features are not scaled yet at this point (scaling happens in the next
# step), so the messages must not call them "scaled".
print(f"Shape of training features: {X_train.shape}")
print(f"Shape of testing features: {X_test.shape}")

In [None]:
# ==========================
# 4️⃣ Standardization + PLS
# ==========================
n_comp = 2  # number of PLS latent components to extract

# Fit the scaler on the training features only, then standardise them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Train the PLS model against the one-hot encoded target
# (`fit` returns the fitted estimator).
pls_da_model = PLSRegression(n_components=n_comp).fit(X_train_scaled, y_train)

print("PLS model trained successfully.")

In [None]:
# ==========================
# 5️⃣ Prediction
# ==========================
# Scale the test features with the scaler fitted on the training set
X_test_scaled = scaler.transform(X_test)

# One continuous output per class column
y_pred_continuous = pls_da_model.predict(X_test_scaled)

# Predicted class = column with the largest predicted value; the same argmax
# on the one-hot test matrix recovers the true integer class codes.
y_pred_class = np.argmax(y_pred_continuous, axis=1)
y_test_class = np.argmax(y_test, axis=1)

# Full list of class codes and their display names. Passing these explicitly
# keeps the report and confusion matrix aligned with label_enc.classes_ even
# if a class is missing from the test fold, and str() keeps non-string class
# labels (e.g. integers) usable as report/tick names.
class_ids = np.arange(len(label_enc.classes_))
class_names = [str(name) for name in label_enc.classes_]

# ==========================
# 6️⃣ Metrics
# ==========================
print("Accuracy:", accuracy_score(y_test_class, y_pred_class))
print("\nClassification report:\n")
print(classification_report(
    y_test_class,
    y_pred_class,
    labels=class_ids,
    target_names=class_names
))

# Regression-style metrics on the continuous outputs vs. the one-hot targets
mse = mean_squared_error(y_test, y_pred_continuous)
r2 = r2_score(y_test, y_pred_continuous)

print("\nMSE:", mse)
print("R2:", r2)

# ==========================
# 7️⃣ Q²
# ==========================
# Q² = 1 - PRESS / TSS, computed from 5-fold cross-validated predictions on
# the training set.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

y_cv_pred = cross_val_predict(pls_da_model, X_train_scaled, y_train, cv=cv)

# y_cv_pred covers the training rows, so PRESS and TSS both use y_train
press = np.sum((y_train - y_cv_pred) ** 2)
tss = np.sum((y_train - np.mean(y_train, axis=0)) ** 2)

q2 = 1 - (press / tss)

print("Q2:", q2)

# ==========================
# 8️⃣ Confusion matrix
# ==========================
cm = confusion_matrix(y_test_class, y_pred_class, labels=class_ids)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=class_names,
            yticklabels=class_names,
            cmap="Blues")

plt.xlabel("Predict")
plt.ylabel("Real")
plt.title("Confusion Matrix - PLS-DA")
plt.tight_layout()
plt.show()

In [None]:
# ==========================
# 9️⃣ Score Plot (Comp1 vs Comp2)
# ==========================

# Project the standardised training samples onto the PLS components.
X_train_scaled = scaler.transform(X_train)
scores = pls_da_model.transform(X_train_scaled)

fig, ax = plt.subplots(figsize=(7, 6))

# One scatter series per class: the position of a class in
# label_enc.classes_ is its integer code in y_encoded_train.
for class_code, class_name in enumerate(label_enc.classes_):
    in_class = y_encoded_train == class_code
    ax.scatter(scores[in_class, 0],
               scores[in_class, 1],
               label=class_name,
               alpha=0.7)

ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
ax.set_title("Score Plot PLS-DA")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# ==========================
# 10. VIP Scores
# ==========================
def compute_vip_scores(x_weights, x_scores, y_loadings):
    """Return the Variable Importance in Projection (VIP) score of every feature.

    VIP_j = sqrt( p * sum_a( SS_a * (w_ja / ||w_a||)^2 ) / sum_a( SS_a ) )

    where p is the number of features and
    SS_a = (t_a' t_a) * (q_a' q_a) is the sum of squares of Y explained by
    latent component a (t_a: X scores, q_a: Y loadings). With a multi-column
    Y (one-hot classes), q_a' q_a sums the contribution of every column.

    Parameters
    ----------
    x_weights : array-like of shape (n_features, n_components)
        X weight vectors, one column per component.
    x_scores : array-like of shape (n_samples, n_components)
        X scores of the training samples.
    y_loadings : array-like of shape (n_targets, n_components)
        Y loadings.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        VIP score per feature; the squared scores sum to n_features.
    """
    w = np.asarray(x_weights, dtype=float)
    t = np.asarray(x_scores, dtype=float)
    q = np.asarray(y_loadings, dtype=float)

    n_features = w.shape[0]
    # Y sum of squares explained by each component.
    ss_explained = np.sum(t ** 2, axis=0) * np.sum(q ** 2, axis=0)
    # Squared weights with every component's weight vector scaled to unit length.
    unit_weights_sq = (w / np.linalg.norm(w, axis=0)) ** 2

    return np.sqrt(n_features * (unit_weights_sq @ ss_explained) / np.sum(ss_explained))


if 'pls_da_model' in globals():
    x_weights = pls_da_model.x_weights_  # Shape: (n_features, n_components)

    vip_scores = compute_vip_scores(
        x_weights,
        pls_da_model.x_scores_,
        pls_da_model.y_loadings_,
    )

    # Create a DataFrame for better visualization, highest VIP first
    vip_df = pd.DataFrame({
        'Variable': X.columns,
        'VIP Score': vip_scores
    })
    vip_df = vip_df.sort_values(by='VIP Score', ascending=False).reset_index(drop=True)

    print("\nVariable Importance in Projection (VIP Scores) for PLS-DA:")
    display(vip_df)

    # Save the VIP scores to a CSV file
    vip_df.to_csv('VIP.csv', index=False)
    print("VIP scores saved to VIP.csv")
else:
    print("PLS-DA model (pls_da_model) not found. Please ensure previous cells ran successfully.")