In [1]:
# Mount Google Drive and make the dataset folder the working directory
import os

from google.colab import drive

# Expose the user's Drive under /content/drive inside the Colab runtime.
drive.mount('/content/drive')

# Relative CSV names passed to pd.read_csv later resolve against this folder.
os.chdir("/content/drive/MyDrive/Dataset/")

# Install pinned versions of the dependencies to avoid conflicts.
# The leading `!` is IPython/Colab syntax that runs the line as a shell
# command; it is not valid in a plain Python script.
# `--no-deps` makes pip install only the named package and skip its own
# dependencies.
# NOTE(review): these installs run after the runtime has already started;
# a runtime restart may be needed before the pinned versions are the ones
# actually imported below — confirm.
!pip install --no-deps numpy==1.24.3
!pip install --no-deps pandas==2.0.3
!pip install --no-deps matplotlib==3.7.1
!pip install --no-deps seaborn==0.12.2
!pip install --no-deps scikit-learn==1.3.0

# Importaciones
from typing import Tuple, Dict, Any
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, r2_score
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.impute import SimpleImputer

class CarPriceAnalysis:
    """Load, explore, preprocess and model used-car price data.

    Intended call order: ``load_data`` -> ``perform_eda`` ->
    ``preprocess_data`` -> ``train_models`` -> ``visualize_results``.
    """

    # CSV files read by load_data() when the caller gives no explicit list.
    # Relative names are resolved against the current working directory.
    DATASET_FILES = (
        'car data.csv',
        'CAR DETAILS FROM CAR DEKHO.csv',
        'Car details v3.csv',
        'car details v4.csv',
    )

    def __init__(self):
        """Create an empty analysis; nothing is read until ``load_data``."""
        self.data = None      # combined raw DataFrame, set by load_data()
        self.X = None         # feature DataFrame, set by preprocess_data()
        self.y = None         # regression target (Selling_Price column)
        self.y_binary = None  # 1 where the price is above the median, else 0
        # Column name -> fitted LabelEncoder, filled by preprocess_data() so
        # the integer codes can be mapped back to the original categories.
        self.label_encoders = {}
        # Columns that must exist and be non-null for a row to be kept.
        self.required_features = [
            'Car_Name', 'Selling_Price', 'Fuel_Type',
            'Transmission', 'Seller_Type'
        ]

    def load_data(self, files: Any = None) -> None:
        """Read the dataset CSV files and concatenate them into ``self.data``.

        Files that cannot be read are reported on stdout and skipped.

        Args:
            files: Iterable of CSV paths to read. When ``None`` (the
                default) the class-level ``DATASET_FILES`` list is used.

        Raises:
            ValueError: If none of the files could be loaded.
        """
        if files is None:
            files = self.DATASET_FILES

        datasets = []
        for file in files:
            try:
                datasets.append(pd.read_csv(file))
            except Exception as e:
                # Best effort: one unreadable file must not abort the rest.
                print(f"Error al cargar {file}: {str(e)}")

        if not datasets:
            raise ValueError("No se pudo cargar ningún dataset")

        # NOTE(review): concat aligns frames by column name, so files whose
        # headers differ yield NaN-filled columns — verify that the CSV
        # schemas agree with ``required_features``.
        self.data = pd.concat(datasets, ignore_index=True)

    def perform_eda(self) -> None:
        """Print descriptive statistics and show two exploratory plots.

        The plots are a histogram of ``Selling_Price`` and a scatter plot of
        ``Year`` against ``Selling_Price``.

        Raises:
            ValueError: If ``load_data`` has not been run.
            KeyError: If the ``Selling_Price`` or ``Year`` column is absent.
        """
        if self.data is None:
            raise ValueError("Datos no cargados. Ejecute load_data() primero.")

        print("=== Análisis Exploratorio de Datos ===")
        print("\nEstadísticas descriptivas:")
        print(self.data.describe())

        # The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and
        # later removed in favour of 'seaborn-v0_8'; use whichever name this
        # installation provides, or keep the default style if neither exists.
        for style_name in ('seaborn-v0_8', 'seaborn'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Price distribution
        self.data['Selling_Price'].hist(ax=ax1, bins=30)
        ax1.set_title('Distribución de Precios')
        ax1.set_xlabel('Precio')
        ax1.set_ylabel('Frecuencia')

        # Year vs price relationship
        ax2.scatter(self.data['Year'], self.data['Selling_Price'], alpha=0.5)
        ax2.set_title('Año vs Precio')
        ax2.set_xlabel('Año')
        ax2.set_ylabel('Precio')

        plt.tight_layout()
        plt.show()

    def preprocess_data(self) -> None:
        """Clean the data, encode categoricals and build features/targets.

        Rows with a null in any required column are dropped, the three
        categorical columns are label-encoded in place, and ``self.X``,
        ``self.y`` and ``self.y_binary`` are populated.

        Raises:
            ValueError: If data is not loaded, a required column is missing,
                or no rows remain after dropping nulls.
        """
        if self.data is None:
            raise ValueError("Datos no cargados. Ejecute load_data() primero.")

        # Check the required columns are present
        missing_cols = set(self.required_features) - set(self.data.columns)
        if missing_cols:
            raise ValueError(f"Columnas faltantes: {missing_cols}")

        # Copy so the column assignments below write to an independent frame
        # instead of a possible view of the pre-cleaning data.
        self.data = self.data.dropna(subset=self.required_features).copy()
        if self.data.empty:
            raise ValueError("DataFrame vacío después de la limpieza")

        # One encoder per column, kept so codes can be inverse-transformed.
        encoders = {}
        for col in ['Fuel_Type', 'Transmission', 'Seller_Type']:
            encoder = LabelEncoder()
            self.data[col] = encoder.fit_transform(self.data[col])
            encoders[col] = encoder
        self.label_encoders = encoders

        # Features are every column except the car name and the target.
        self.X = self.data.drop(['Car_Name', 'Selling_Price'], axis=1)
        self.y = self.data['Selling_Price']
        self.y_binary = (self.y > self.y.median()).astype(int)

    @staticmethod
    def _fit_and_score(model, X_train, y_train, X_test, y_test) -> Dict[str, Any]:
        """Fit ``model`` and return it with its test predictions and metrics."""
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        return {
            'model': model,
            'predictions': predictions,
            'metrics': {
                'mse': mean_squared_error(y_test, predictions),
                'r2': r2_score(y_test, predictions),
            },
        }

    def train_models(self) -> Tuple[Dict[str, Any], pd.Series]:
        """Train and evaluate a linear regression and a decision tree.

        Only numeric feature columns are used. The data is split 80/20
        first, and the imputer, feature selector and scaler are then fitted
        on the training split only, so no test-set statistics leak into the
        preprocessing.

        Returns:
            A ``(models, y_test)`` tuple. ``models`` maps ``'linear'`` and
            ``'tree'`` to dicts with keys ``'model'``, ``'predictions'`` and
            ``'metrics'`` (``'mse'`` and ``'r2'``); ``y_test`` holds the true
            targets of the test split.

        Raises:
            ValueError: If ``preprocess_data`` has not been run, or no usable
                numeric feature remains.
        """
        if self.X is None or self.y is None:
            raise ValueError("Datos no preparados. Ejecute preprocess_data() primero.")

        # Mean imputation and f_regression need numeric input; leftover text
        # columns would otherwise make the imputer raise.
        X_numeric = self.X.select_dtypes(include='number')
        if X_numeric.shape[1] == 0:
            raise ValueError("No hay características numéricas para entrenar")

        # Split before fitting any transformer to avoid data leakage.
        X_train, X_test, y_train, y_test = train_test_split(
            X_numeric, self.y, test_size=0.2, random_state=42
        )

        # Imputation (columns that are entirely NaN in training are dropped
        # by the imputer, so the feature count is re-read afterwards).
        imputer = SimpleImputer(strategy='mean')
        X_train_imputed = imputer.fit_transform(X_train)
        X_test_imputed = imputer.transform(X_test)

        n_features = X_train_imputed.shape[1]
        if n_features == 0:
            raise ValueError("No hay características numéricas para entrenar")

        # Keep at most 5 features; never ask for more than are available.
        selector = SelectKBest(f_regression, k=min(5, n_features))
        X_train_selected = selector.fit_transform(X_train_imputed, y_train)
        X_test_selected = selector.transform(X_test_imputed)

        # Scaling
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        X_test_scaled = scaler.transform(X_test_selected)

        # Training and evaluation
        models = {
            'linear': self._fit_and_score(
                LinearRegression(),
                X_train_scaled, y_train, X_test_scaled, y_test,
            ),
            'tree': self._fit_and_score(
                DecisionTreeRegressor(random_state=42),
                X_train_scaled, y_train, X_test_scaled, y_test,
            ),
        }

        return models, y_test

    def visualize_results(self, models: Dict[str, Any], y_test: np.ndarray) -> None:
        """Plot predicted against actual values, one panel per model.

        Args:
            models: Mapping of model name to a dict that contains a
                ``'predictions'`` entry, as returned by ``train_models``.
            y_test: True target values of the test split (an array or the
                Series returned by ``train_models``).
        """
        plt.figure(figsize=(15, 5))

        # One column per model rather than a fixed two-panel grid.
        n_panels = len(models)
        lowest, highest = y_test.min(), y_test.max()

        for i, (name, model_info) in enumerate(models.items(), 1):
            plt.subplot(1, n_panels, i)
            plt.scatter(y_test, model_info['predictions'], alpha=0.5)
            # Dashed diagonal marks where prediction equals the true value.
            plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
            plt.title(f'Predicciones vs Reales - {name.capitalize()}')
            plt.xlabel('Valores Reales')
            plt.ylabel('Predicciones')

        plt.tight_layout()
        plt.show()

def main():
    """Run the whole pipeline: load, explore, preprocess, train, report, plot.

    Any exception raised along the way is caught and reported on stdout as a
    single line, so the function itself never raises.
    """
    try:
        pipeline = CarPriceAnalysis()

        # Run the data stages in their required order.
        pipeline.load_data()
        pipeline.perform_eda()
        pipeline.preprocess_data()

        # Fit the models; keep the held-out targets for the plots.
        trained, held_out = pipeline.train_models()

        # Print each model's metrics.
        print("\n=== Resultados de los Modelos ===")
        for model_name in trained:
            print(f"\nModelo: {model_name.capitalize()}")
            scores = trained[model_name]['metrics']
            for score_name in scores:
                print(f"{score_name}: {scores[score_name]:.4f}")

        # Plot predicted against actual values.
        pipeline.visualize_results(trained, held_out)

    except Exception as exc:
        print(f"Error en la ejecución: {exc!s}")

# Run the pipeline only when this module is executed directly (its __name__
# is "__main__"), not when it is imported by other code.
if __name__ == "__main__":
    main()

ValueError: mount failed