### Captura de datos

In [7]:
import cv2
import mediapipe as mp
import csv
import time

In [15]:


# Inicialización de MediaPipe
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Configuración del modelo
hands = mp_hands.Hands(static_image_mode=False,
                       max_num_hands=1,  # Solo detectar una mano
                       min_detection_confidence=0.5,
                       min_tracking_confidence=0.5)

# Captura de video (cámara web)
cap = cv2.VideoCapture(0)

# Archivo CSV para guardar las coordenadas
output_file = "coordenadas_mano_numero6.csv"

# Crear encabezados para el CSV (x1, y1, z1, ..., x21, y21, z21)
header = [f"{coord}{i+1}" for i in range(21) for coord in ("x", "y", "z")]

# Crear o sobrescribir el archivo CSV
with open(output_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

print("Presiona 'q' para empezar/pausar el guardado de coordenadas.")
print("Presiona 'ESC' para salir.")

# Bandera para iniciar/pausar el guardado
save_coordinates = False

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("No se pudo acceder a la cámara.")
        break

    # Convertir la imagen a RGB (MediaPipe utiliza imágenes RGB)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Procesar la imagen para detectar manos
    results = hands.process(frame_rgb)

    # Verificar si se detectó una mano
    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]  # Solo la primera mano detectada

        # Dibujar las landmarks de la mano en la imagen
        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Obtener coordenadas y guardarlas si está habilitado
        if save_coordinates:
            # Obtener todas las coordenadas x, y, z en una lista
            coords = []
            for landmark in hand_landmarks.landmark:
                coords.extend([landmark.x, landmark.y, landmark.z])

            # Guardar las coordenadas en el archivo CSV
            with open(output_file, mode='a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(coords)

    # Mostrar la imagen con las detecciones
    cv2.imshow('Seguimiento de Mano', frame)

    # Control de teclas
    key = cv2.waitKey(1) & 0xFF
    if key == ord('q'):
        save_coordinates = not save_coordinates
        if save_coordinates:
            print("Guardando coordenadas...")
        else:
            print("Guardado pausado.")
    elif key == 27:  # Tecla 'ESC' para salir
        break

# Liberar recursos
cap.release()
cv2.destroyAllWindows()
hands.close()


Presiona 'q' para empezar/pausar el guardado de coordenadas.
Presiona 'ESC' para salir.
Guardando coordenadas...
Guardado pausado.


## Entrenamiento de red neuronal

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


In [2]:

if torch.cuda.is_available():
    print("✅ CUDA está disponible")
    print(f"🔹 GPU detectada: {torch.cuda.get_device_name(0)}")
else:
    print("❌ CUDA no está disponible")


✅ CUDA está disponible
🔹 GPU detectada: NVIDIA GeForce RTX 4060 Laptop GPU


In [3]:

# 📌 1. Cargar los 5 CSVs y asignar etiquetas
csv_files = {
    "coordenadas_mano_numero1.csv": 0,
    "coordenadas_mano_numero2.csv": 1,
    "coordenadas_mano_numero3.csv": 2,
    "coordenadas_mano_numero4.csv": 3,
    "coordenadas_mano_numero5.csv": 4,
}

data_list = []
for file, label in csv_files.items():
    df = pd.read_csv(file, header=None)  # No hay encabezados
    df["label"] = label  # Agregar la etiqueta de clase
    data_list.append(df)

data = pd.concat(data_list, ignore_index=True)  # Combinar todos los archivos

# 📌 2. Convertir datos en tensores
X = data.iloc[:, :-1].values.astype(np.float32)  # 63 entradas
y = data.iloc[:, -1].values.astype(np.int64)  # Etiquetas (1-5)

# 📌 3. Dividir en entrenamiento y validación (80%-20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:

# 📌 4. Crear Dataset en PyTorch
class HandGestureDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = HandGestureDataset(X_train, y_train)
val_dataset = HandGestureDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# 📌 5. Definir la Red Neuronal
class HandGestureModel(nn.Module):
    def __init__(self):
        super(HandGestureModel, self).__init__()
        self.fc1 = nn.Linear(63, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 5)  # 5 clases
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [5]:

# 📌 6. Inicializar el Modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HandGestureModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [6]:

# 📌 7. Entrenar la Red Neuronal
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"🔹 Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")


🔹 Epoch [1/30], Loss: 0.6630
🔹 Epoch [2/30], Loss: 0.1520
🔹 Epoch [3/30], Loss: 0.0811
🔹 Epoch [4/30], Loss: 0.0569
🔹 Epoch [5/30], Loss: 0.0505
🔹 Epoch [6/30], Loss: 0.0474
🔹 Epoch [7/30], Loss: 0.0379
🔹 Epoch [8/30], Loss: 0.0441
🔹 Epoch [9/30], Loss: 0.0361
🔹 Epoch [10/30], Loss: 0.0392
🔹 Epoch [11/30], Loss: 0.0336
🔹 Epoch [12/30], Loss: 0.0297
🔹 Epoch [13/30], Loss: 0.0365
🔹 Epoch [14/30], Loss: 0.0288
🔹 Epoch [15/30], Loss: 0.0288
🔹 Epoch [16/30], Loss: 0.0336
🔹 Epoch [17/30], Loss: 0.0307
🔹 Epoch [18/30], Loss: 0.0262
🔹 Epoch [19/30], Loss: 0.0338
🔹 Epoch [20/30], Loss: 0.0203
🔹 Epoch [21/30], Loss: 0.0271
🔹 Epoch [22/30], Loss: 0.0242
🔹 Epoch [23/30], Loss: 0.0290
🔹 Epoch [24/30], Loss: 0.0285
🔹 Epoch [25/30], Loss: 0.0252
🔹 Epoch [26/30], Loss: 0.0258
🔹 Epoch [27/30], Loss: 0.0300
🔹 Epoch [28/30], Loss: 0.0272
🔹 Epoch [29/30], Loss: 0.0299
🔹 Epoch [30/30], Loss: 0.0216


In [7]:

# 📌 8. Evaluar el Modelo
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f"🔹 Precisión en validación: {100 * correct / total:.2f}%")


🔹 Precisión en validación: 99.50%


In [None]:

# 📌 9. Guardar el Modelo
torch.save(model.state_dict(), "modelo_gestos.pth")
print("✅ Modelo guardado como 'modelo_gestos.pth'")

✅ Modelo guardado como 'modelo_gestos.pth'


## Integración

In [11]:
import cv2
import mediapipe as mp
import torch
import torch.nn as nn
import numpy as np

## Con porcentajes sobre la imagen

In [None]:


# 📌 Definir la Red Neuronal
class HandGestureModel(nn.Module):
    def __init__(self):
        super(HandGestureModel, self).__init__()
        self.fc1 = nn.Linear(63, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 5)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# 📌 Cargar el modelo entrenado
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = HandGestureModel().to(device)
model.load_state_dict(torch.load("modelo_gestos.pth", map_location=device))
model.eval()

# 📌 Inicializar MediaPipe para detección de manos
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5)

# 📌 Captura de video
cap = cv2.VideoCapture(0)
class_labels = ["numero 1", "numero 2", "numero 3", "numero 4", "numero 5"]  # Nombres de las clases

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("No se pudo acceder a la cámara.")
        break

    # Convertir a RGB para MediaPipe
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    # 📌 Dibujar la mano y extraer coordenadas
    coords = None
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Obtener coordenadas x, y, z
            coords = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()

    # 📌 Realizar predicción si hay coordenadas
    if coords is not None and len(coords) == 63:
        input_tensor = torch.tensor(coords, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_tensor)
            probabilities = torch.softmax(output, dim=1).cpu().numpy()[0]

        # Obtener la clase con mayor probabilidad
        predicted_class = np.argmax(probabilities)
        predicted_label = class_labels[predicted_class]

        # 📌 Dibujar cuadro con las probabilidades
        bar_x = frame.shape[1] - 200
        bar_y = 50
        bar_width = 150
        bar_height = 20

        for i, (label, prob) in enumerate(zip(class_labels, probabilities)):
            bar_fill = int(prob * bar_width)  # Escalar al tamaño de la barra
            cv2.rectangle(frame, (bar_x, bar_y + i * 30), (bar_x + bar_fill, bar_y + i * 30 + bar_height), (0, 255, 0), -1)
            cv2.rectangle(frame, (bar_x, bar_y + i * 30), (bar_x + bar_width, bar_y + i * 30 + bar_height), (255, 255, 255), 2)
            cv2.putText(frame, f"{label}: {prob:.2f}", (bar_x + 5, bar_y + i * 30 + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

        # 📌 Mostrar el resultado en pantalla
        cv2.putText(frame, f"Prediccion: {predicted_label}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)

    # 📌 Mostrar la imagen con detecciones
    cv2.imshow('Detectar numero de dedos', frame)

    # Salir con 'ESC'
    if cv2.waitKey(1) & 0xFF == 27:
        break

# 📌 Liberar recursos
cap.release()
cv2.destroyAllWindows()
hands.close()


cuda


  model.load_state_dict(torch.load("modelo_gestos.pth", map_location=device))


## Con panel a la derecha

In [17]:


# 📌 Definir la Red Neuronal
class HandGestureModel(nn.Module):
    def __init__(self):
        super(HandGestureModel, self).__init__()
        self.fc1 = nn.Linear(63, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 5)  # 5 clases
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# 📌 Cargar el modelo entrenado
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HandGestureModel().to(device)
model.load_state_dict(torch.load("modelo_gestos.pth", map_location=device))
model.eval()

# 📌 Inicializar MediaPipe para detección de manos
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5)

# 📌 Captura de video
cap = cv2.VideoCapture(0)
class_labels = ["numero 1", "numero 2", "numero 3", "numero 4", "numero 5"]  # Nombres de las clases

# 📌 Tamaño del panel negro
panel_width = 250  # Ancho del panel de información

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("No se pudo acceder a la cámara.")
        break

    # Convertir a RGB para MediaPipe
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    # 📌 Dibujar la mano y extraer coordenadas
    coords = None
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Obtener coordenadas x, y, z
            coords = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()

    # 📌 Crear el panel negro a la derecha
    panel = np.zeros((frame.shape[0], panel_width, 3), dtype=np.uint8)

    # 📌 Realizar predicción si hay coordenadas
    if coords is not None and len(coords) == 63:
        input_tensor = torch.tensor(coords, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_tensor)
            probabilities = torch.softmax(output, dim=1).cpu().numpy()[0]

        # Obtener la clase con mayor probabilidad
        predicted_class = np.argmax(probabilities)
        predicted_label = class_labels[predicted_class]

        # 📌 Dibujar texto y barras en el panel negro
        bar_x = 20
        bar_y = 50
        bar_width = panel_width - 40  # Ancho de la barra
        bar_height = 30

        for i, (label, prob) in enumerate(zip(class_labels, probabilities)):
            bar_fill = int(prob * bar_width)  # Escalar al tamaño de la barra
            cv2.rectangle(panel, (bar_x, bar_y + i * 50), (bar_x + bar_fill, bar_y + i * 50 + bar_height), (0, 255, 0), -1)
            cv2.rectangle(panel, (bar_x, bar_y + i * 50), (bar_x + bar_width, bar_y + i * 50 + bar_height), (255, 255, 255), 2)
            cv2.putText(panel, f"{label}: {prob:.2f}", (bar_x + 5, bar_y + i * 50 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

        # 📌 Mostrar el nombre del gesto detectado
        cv2.putText(panel, f"Prediccion:", (bar_x, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
        cv2.putText(panel, predicted_label, (bar_x + 100, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    # 📌 Concatenar la imagen y el panel negro
    combined_frame = np.hstack((frame, panel))

    # 📌 Mostrar la imagen con detecciones y el panel de información
    cv2.imshow('Deteccion de numeros', combined_frame)

    # Salir con 'ESC'
    if cv2.waitKey(1) & 0xFF == 27:
        break

# 📌 Liberar recursos
cap.release()
cv2.destroyAllWindows()
hands.close()


  model.load_state_dict(torch.load("modelo_gestos.pth", map_location=device))
