## Classification
## 0 -> Unhealthy

## 1 -> Healthy

In [1]:
import sys
from pathlib import Path
# Locate the repository root: two directories above the folder that contains
# this notebook. `__file__` only exists when the code runs as a script; in a
# notebook kernel the current working directory is used in place of the
# notebook's folder.
if '__file__' in globals():
    repo_root = Path(__file__).parent.parent.parent
else:
    repo_root = Path.cwd().parent.parent
# Prepend the root to the module search path.
sys.path.insert(0, str(repo_root))

# import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, mean_absolute_error, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
import Template
# Module definitions
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling  import SMOTE, ADASYN
from sklearn.decomposition import PCA
from Mod_Productor2 import Mod_Productor as MP
# Instantiate the Template class.
# NOTE(review): this rebinds the name `Template` from the imported module to an
# instance of its `Template` class, so the module is no longer reachable under
# that name after this line.
Template = Template.Template()

# Build the path to the Excel file in the repository root
data_path = os.path.join(repo_root, 'Datos1_InteraccionesNIR.xlsx')

# Load the first 16 sheets (positions 0..15) of the workbook.
# Passing a list to `sheet_name` makes pandas open and parse the file once and
# return a dict keyed by the requested sheet positions, instead of re-opening
# the workbook for every sheet.
sheets = pd.read_excel(data_path, sheet_name=list(range(16)))
df0, df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15 = (
    sheets[i] for i in range(16)
)

In [2]:
# Day sheets df1..df15 (df0 is not touched). The frames are modified in place
# on purpose: later cells keep referring to them by the names df1..df15.
# NOTE: this cell is not idempotent. Once 'Tratamiento' has been dropped,
# re-running it without re-running the loading cell raises a KeyError.
day_frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15]

# Number of rows held out from every day sheet as test data
N_TEST_PER_DAY = 10

# Add the binary target 'Sana' (1 when 'Tratamiento' is 'Control', else 0) as
# the second column, then remove 'Tratamiento' and, when present, 'Planta'.
for df in day_frames:
    df.insert(1, 'Sana', df['Tratamiento'].eq('Control').astype('int64'))
    df.drop(columns=['Tratamiento', 'Planta'], errors='ignore', inplace=True)

# Hold out N_TEST_PER_DAY rows of each day sheet to use as test data later;
# the 'Day' column records which sheet (1..15) each held-out row came from.
test_samples = []
for day, df in enumerate(day_frames, start=1):
    test_sample = df.sample(n=N_TEST_PER_DAY, random_state=42)
    test_sample.insert(0, 'Day', day)
    test_samples.append(test_sample)

# Save all test samples to one CSV file
pd.concat(test_samples, ignore_index=True).to_csv('test_sample_df.csv', index=False)
print("All test samples saved to 'test_sample_df.csv'")

# Remove the held-out rows from the original frames. Each sample keeps the row
# labels of the frame it was drawn from, so its index can be used directly.
for df, test_sample in zip(day_frames, test_samples):
    df.drop(index=test_sample.index, inplace=True)
    


All test samples saved to 'test_sample_df.csv'


In [3]:
# Train one Mod_Productor model per day sheet and keep each one under the key
# 'model_df<day>' so later cells can look it up by day number.
day_frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15]
models = {}
for day, frame in enumerate(day_frames, start=1):
    print(f"Producing model for DataFrame df{day}...")
    modelo = MP(
        frame,
        target='Sana',
        test_size=0.2,
        random_state=42,
        PCA_components=2,
    )
    modelo.train_model()
    models[f'model_df{day}'] = modelo

Producing model for DataFrame df1...
Original feature count: 2151
Features after PCA: 2
Explained variance ratio: [0.61622955 0.22070541]
Total explained variance: 0.8369

Class distribution before SMOTE:
  Class 0 (Unhealthy): 202
  Class 1 (Healthy): 27

Class distribution after SMOTE:
  Class 0 (Unhealthy): 202
  Class 1 (Healthy): 202

Starting GridSearchCV...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

TRAINING COMPLETE
Best parameters found:  {'C': 10, 'class_weight': None, 'penalty': 'l1', 'solver': 'saga'}

Best scores across all metrics:
  Accuracy: 0.8934
  Precision: 0.8759
  Recall: 0.9261
  F1-Score: 0.8977
  ROC-AUC: 0.9262
Producing model for DataFrame df2...
Original feature count: 2151
Features after PCA: 2
Explained variance ratio: [0.8471026  0.10813347]
Total explained variance: 0.9552

Class distribution before SMOTE:
  Class 0 (Unhealthy): 197
  Class 1 (Healthy): 26

Class distribution after SMOTE:
  Class 0 (Unhealthy): 197
  Class 1 (Healthy)

## Big Model

In [4]:
# Score every held-out test sample with ALL 15 per-day models and store the
# predictions side by side (columns pred_1..pred_15) in one table / CSV file.
#
# NOTE(review): the recorded run of this cell stopped with
#   "ValueError: X has 2151 features, but LogisticRegression is expecting 2 features as input."
# The training log reports "Features after PCA: 2", so `modelo.model` was
# fitted on PCA components while the raw feature columns are passed below.
# The preprocessing used during training has to be applied to `X_test_all`
# before `predict`; Mod_Productor's API is not visible from this notebook, so
# the call is left as it was. TODO: route the data through the model's own
# preprocessing.

# Concatenate all test samples first
df_predictions = pd.concat(test_samples, ignore_index=True)

# The feature matrix is identical for every model, so it is built once, before
# any 'pred_*' column is added (removing 'Sana' and 'Day', keeping only the
# wavelength features).
feature_columns = [
    col for col in df_predictions.columns
    if col not in ['Sana', 'Day'] and not str(col).startswith('pred_')
]
X_test_all = df_predictions[feature_columns].values

# For each model, make predictions on ALL test samples
for model_num in range(1, 16):
    modelo = models[f'model_df{model_num}']

    # Make predictions with this model on all test samples
    y_pred = modelo.model.predict(X_test_all)

    # Add predictions as a new column
    df_predictions[f'pred_{model_num}'] = y_pred
    print(f"Model {model_num} predictions added to column 'pred_{model_num}'")

# Display the first few rows
print("\nFirst few rows of predictions:")
print(df_predictions.head())

# Save the predictions to a CSV file
df_predictions.to_csv('predictions.csv', index=False)
print("\nPredictions saved to 'predictions.csv'")

# Show column names to verify
print("\nColumns in df_predictions:")
print(df_predictions.columns.tolist())

ValueError: X has 2151 features, but LogisticRegression is expecting 2 features as input.

In [None]:
# Drop the wavelength feature columns from the predictions table.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise a KeyError.
df_predictions.drop(columns=feature_columns, errors='ignore', inplace=True)

In [None]:
# Preview the first 10 rows of the predictions table
df_predictions.head(10)


In [None]:
# Confusion matrix for each base model, scored on the held-out test samples
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score

# The true labels are the same for every model
y_true = df_predictions['Sana'].values

for model_num in range(1, 16):
    pred_column = f'pred_{model_num}'
    y_pred = df_predictions[pred_column].values

    # Accuracy is shown in the plot title next to the model number
    accuracy = accuracy_score(y_true, y_pred)
    disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=plt.cm.Blues)
    disp.ax_.set_title(f'Confusion Matrix for Model {model_num}, Accuracy: {accuracy:.2f}')
    plt.show()

### Logistic Regression with weighted features from base models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Meta-model inputs: one column per base model (pred_1..pred_15);
# the target is the true label 'Sana'.
pred_columns = [f'pred_{i}' for i in range(1, 16)]
X_meta = df_predictions[pred_columns].values
y_meta = df_predictions['Sana'].values

# Weight of each base model equals its day number (day 1 -> 1, ..., day 15 -> 15),
# so later days contribute larger feature values.
weights = np.arange(1, 16)

# Broadcasting multiplies every prediction column by its own weight
X_meta_weighted = X_meta * weights

# Stratified 80/20 split of the weighted predictions
X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(
    X_meta_weighted,
    y_meta,
    test_size=0.2,
    random_state=42,
    stratify=y_meta,
)

# Fit the logistic-regression meta-model on the training part
meta_model = LogisticRegression(max_iter=1000, random_state=42)
meta_model.fit(X_train_meta, y_train_meta)

# Score it on the held-back part
y_pred_meta = meta_model.predict(X_test_meta)
meta_accuracy = accuracy_score(y_test_meta, y_pred_meta)
print(f"Meta-Model Accuracy: {meta_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_meta, y_pred_meta))

# One coefficient per base model; they apply to the already weighted inputs
print("\nModel Coefficients (after weighting):")
for model_idx, coef in enumerate(meta_model.coef_[0], start=1):
    print(f"  Model {model_idx}: {coef:.4f}")

# Confusion matrix for meta-model
disp = ConfusionMatrixDisplay.from_predictions(y_test_meta, y_pred_meta, cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix for Meta-Model')
plt.show()

### Neural Network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Define a simple neural network
class MetaClassifier(nn.Module):
    """Small feed-forward binary classifier.

    Layout: input_size -> hidden_size -> hidden_size // 2 -> 1. Each hidden
    layer is followed by ReLU and dropout; the output passes through a
    sigmoid, so the network returns values in (0, 1) rather than logits.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the first hidden layer; the second hidden layer
            has ``hidden_size // 2`` units.
        dropout: Drop probability used by both dropout layers.
    """

    def __init__(self, input_size: int = 15, hidden_size: int = 32, dropout: float = 0.3):
        super().__init__()
        # Layers are created in forward order; attribute names are kept
        # stable because they are also the keys of the module's state_dict.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)
        self.fc3 = nn.Linear(hidden_size // 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the network on ``x`` and return one sigmoid output per sample.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with a last dimension of size 1 holding values in (0, 1).
        """
        x = self.dropout1(self.relu1(self.fc1(x)))
        x = self.dropout2(self.relu2(self.fc2(x)))
        return self.sigmoid(self.fc3(x))

# Prepare data for neural network - NO WEIGHTS APPLIED
X_meta = df_predictions[[f'pred_{i}' for i in range(1, 16)]].values
y_meta = df_predictions['Sana'].values

# Split data (stratified 80/20, same seed as the rest of the notebook)
X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(
    X_meta, y_meta, test_size=0.2, random_state=42, stratify=y_meta
)

# Seed torch's global RNG before anything random is created. Weight
# initialisation, DataLoader shuffling and dropout all draw from it, so
# without a seed every run of the notebook yields different results.
torch.manual_seed(42)

# Convert to PyTorch tensors; targets get a trailing dimension so their shape
# matches the (batch, 1) output of the network.
X_train_tensor = torch.FloatTensor(X_train_meta)
y_train_tensor = torch.FloatTensor(y_train_meta).unsqueeze(1)
X_test_tensor = torch.FloatTensor(X_test_meta)
y_test_tensor = torch.FloatTensor(y_test_meta).unsqueeze(1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Initialize model, loss, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MetaClassifier(input_size=15, hidden_size=32).to(device)
# BCELoss expects probabilities, which the model's sigmoid output provides
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
train_losses = []  # average batch loss of every epoch, in order
print("Training Neural Network...")

for epoch in range(num_epochs):
    model.train()  # training mode, so the dropout layers are active
    batch_losses = []

    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # Clear old gradients, then forward pass, backward pass and update
        optimizer.zero_grad()
        loss = criterion(model(batch_X), batch_y)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Epoch loss = mean of the per-batch losses
    avg_loss = sum(batch_losses) / len(train_loader)
    train_losses.append(avg_loss)

    # Report progress on every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Evaluation: switch to eval mode and predict without tracking gradients
model.eval()
with torch.no_grad():
    X_test_tensor = X_test_tensor.to(device)
    y_pred_proba = model(X_test_tensor).cpu().numpy()

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels
y_pred_nn = (y_pred_proba.flatten() > 0.5).astype(int)

# Metrics
accuracy = accuracy_score(y_test_meta, y_pred_nn)
print(f"\nNeural Network Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_meta, y_pred_nn))

# Confusion Matrix
disp = ConfusionMatrixDisplay.from_predictions(
    y_test_meta,
    y_pred_nn,
    cmap=plt.cm.Blues,
)
disp.ax_.set_title(f'Neural Network Meta-Model - Accuracy: {accuracy:.2f}')
plt.show()

# Plot the average training loss of each epoch
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses)
ax.set_title('Training Loss over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(True)
plt.show()

# Compare the neural network with the logistic-regression meta-model.
# `y_pred_meta` comes from the logistic-regression cell and is scored against
# this cell's `y_test_meta`.
# NOTE(review): assumes both splits select the same rows (both use
# random_state=42 and stratify on the same labels) - confirm if either changes.
separator = "=" * 60
print("\n" + separator)
print("COMPARISON: Neural Network vs Logistic Regression")
print(separator)
logreg_accuracy = accuracy_score(y_test_meta, y_pred_meta)
print(f"Neural Network Accuracy: {accuracy:.4f}")
print(f"Logistic Regression Accuracy: {logreg_accuracy:.4f}")