<a href="https://colab.research.google.com/github/Persa053/CG/blob/main/TPC1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [210]:
from typing import Tuple, Sequence

import numpy as np
import pandas as pd


class Dataset:
    """
    Tabular machine learning dataset stored feature-major.

    ``X[i]`` holds every sample of feature ``features[i]``, so ``X`` has
    shape (n_features, n_samples). This is the layout ``from_dataframe``
    builds and the one every method of this class expects.
    """

    def __init__(self, X: np.ndarray = None, y: np.ndarray = None, features: Sequence[str] = None, types: Sequence[str] = None, label: str = None):
        """
        Dataset represents a machine learning tabular dataset.

        Parameters
        ----------
        X: numpy.ndarray (n_features, n_samples)
            The feature matrix, one row per feature
        y: numpy.ndarray (n_samples,)
            The label vector
        features: list of str (n_features)
            The feature names
        types: list of str (n_features)
            The dtype name of each feature (e.g. "int64", "object")
        label: str (1)
            The label name; defaults to "y" when ``y`` is given without one
        """
        if y is not None and label is None:
            label = "y"

        self.X = X
        self.y = y
        # Lists guarantee the .index() lookups used by get_column/get_type.
        self.features = list(features) if features is not None else None
        self.types = list(types) if types is not None else None
        self.label = label

    def _matrix(self) -> np.ndarray:
        """
        Returns the feature matrix

        Raises
        ------
        ValueError
            If the dataset has no feature matrix
        """
        if self.X is None:
            raise ValueError("Dataset does not have a feature matrix")
        return self.X

    def _feature_index(self, s: str) -> int:
        """
        Returns the row of ``X`` that stores feature ``s``

        Raises
        ------
        ValueError
            If the dataset has no feature names or ``s`` is not one of them
        """
        if self.features is None:
            raise ValueError("Dataset does not have feature names")
        try:
            return self.features.index(s)
        except ValueError:
            raise ValueError(f"Unknown feature: {s!r}") from None

    def _numeric_rows(self) -> list:
        """
        Returns one entry per feature: a float array for numeric features
        and None for the others.

        When ``types`` is available it decides which features are numeric;
        otherwise a feature counts as numeric if its values convert to float.
        """
        rows = []
        for index, row in enumerate(self._matrix()):
            declared = None if self.types is None else self.types[index]
            if declared is not None and not pd.api.types.is_numeric_dtype(declared):
                rows.append(None)
                continue
            try:
                rows.append(np.asarray(row, dtype=float))
            except (TypeError, ValueError):
                rows.append(None)
        return rows

    def _reduce_features(self, reducer) -> np.ndarray:
        """
        Applies ``reducer`` to every numeric feature.

        Non-numeric features, and features with no non-NaN value, yield NaN;
        skipping them also avoids NumPy's empty/all-NaN slice warnings.
        """
        rows = self._numeric_rows()
        result = np.full(len(rows), np.nan)
        for index, values in enumerate(rows):
            if values is not None and not np.isnan(values).all():
                result[index] = reducer(values)
        return result

    def shape(self) -> Tuple[int, int]:
        """
        Returns the shape of the dataset
        Returns
        -------
        tuple (n_samples, n_features)
        """
        # X is stored (n_features, n_samples), hence the reversal.
        return self._matrix().shape[::-1]

    def has_label(self) -> bool:
        """
        Returns True if the dataset has a label
        Returns
        -------
        bool
        """
        return self.y is not None

    def get_classes(self) -> np.ndarray:
        """
        Returns the sorted unique values of the label vector
        Returns
        -------
        numpy.ndarray (n_classes)

        Raises
        ------
        ValueError
            If the dataset has no label
        """
        if self.y is None:
            raise ValueError("Dataset does not have a label")
        return np.unique(self.y)

    def get_column(self, s: str) -> np.ndarray:
        """
        Returns all the samples of feature ``s``
        Returns
        -------
        numpy.ndarray (n_samples)

        Raises
        ------
        ValueError
            If ``s`` is not a feature of the dataset
        """
        return self._matrix()[self._feature_index(s)]

    def get_type(self, s: str) -> str:
        """
        Returns the dtype name recorded for feature ``s``

        Raises
        ------
        ValueError
            If the dataset has no types or ``s`` is not a feature
        """
        if self.types is None:
            raise ValueError("Dataset does not have feature types")
        return self.types[self._feature_index(s)]

    def null_counter(self, s: str) -> int:
        """
        Returns the number of missing values of feature ``s``

        NaN/None entries are always counted. Empty strings are counted too,
        because ``from_dataframe`` stores missing text values as ''.

        Returns
        -------
        int
        """
        column = np.asarray(self.get_column(s))
        missing = pd.isna(column)
        # Only object/string arrays can hold ''; comparing a numeric array
        # with a string is not a meaningful element-wise operation.
        if column.dtype.kind in "OUS":
            missing = missing | (column == "")
        return int(np.count_nonzero(missing))

    def get_mean(self) -> np.ndarray:
        """
        Returns the mean of each feature (NaN for non-numeric features)
        Returns
        -------
        numpy.ndarray (n_features)
        """
        return self._reduce_features(np.nanmean)

    def get_variance(self) -> np.ndarray:
        """
        Returns the variance of each feature (NaN for non-numeric features)
        Returns
        -------
        numpy.ndarray (n_features)
        """
        return self._reduce_features(np.nanvar)

    def get_median(self) -> np.ndarray:
        """
        Returns the median of each feature (NaN for non-numeric features)
        Returns
        -------
        numpy.ndarray (n_features)
        """
        return self._reduce_features(np.nanmedian)

    def get_min(self) -> np.ndarray:
        """
        Returns the minimum of each feature (NaN for non-numeric features)
        Returns
        -------
        numpy.ndarray (n_features)
        """
        return self._reduce_features(np.nanmin)

    def get_max(self) -> np.ndarray:
        """
        Returns the maximum of each feature (NaN for non-numeric features)
        Returns
        -------
        numpy.ndarray (n_features)
        """
        return self._reduce_features(np.nanmax)

    def summary(self) -> pd.DataFrame:
        """
        Returns a summary of the numeric features of the dataset

        Rows are the statistics ("mean", "median", "min", "max", "var") and
        columns are the numeric features; non-numeric features are left out.

        Returns
        -------
        pandas.DataFrame (5, n_numeric_features)
        """
        rows = self._numeric_rows()
        numeric = np.array(
            [index for index, values in enumerate(rows) if values is not None],
            dtype=int,
        )
        if self.features is not None:
            names = self.features
        else:
            names = list(range(len(rows)))

        stats = {
            "mean": self.get_mean(),
            "median": self.get_median(),
            "min": self.get_min(),
            "max": self.get_max(),
            "var": self.get_variance(),
        }
        table = np.array([values[numeric] for values in stats.values()])
        return pd.DataFrame(
            table,
            index=list(stats),
            columns=[names[index] for index in numeric],
        )

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame, label: str = None):
        """
        Creates a Dataset object from a pandas DataFrame

        The DataFrame passed in is never modified. Missing values of object
        columns are stored as '' in the resulting feature matrix.

        Parameters
        ----------
        df: pandas.DataFrame
            The DataFrame
        label: str
            The label name

        Returns
        -------
        Dataset
        """
        # "is not None" so that falsy column names (e.g. 0) still work.
        if label is not None:
            y = df[label].to_numpy()
            df = df.drop(label, axis=1)
        else:
            y = None
            # Work on a copy: the column assignments below would otherwise
            # overwrite the caller's DataFrame.
            df = df.copy()

        features = []
        types = []
        for name in df.columns:
            if df[name].dtype == object:
                df[name] = df[name].replace(np.nan, '')
            features.append(name)
            types.append(str(df[name].dtype))

        # Feature-major layout: one row per feature.
        X = df.transpose().to_numpy()

        return cls(X, y, features=features, types=types, label=label)

    def to_dataframe(self) -> pd.DataFrame:
        """
        Converts the dataset to a pandas DataFrame

        Columns are the features (cast back to their recorded types when
        possible), followed by the label column when the dataset has one.

        Returns
        -------
        pandas.DataFrame (n_samples, n_features [+ 1])
        """
        # Transpose back: DataFrames are sample-major.
        df = pd.DataFrame(np.asarray(self._matrix()).T, columns=self.features)

        if self.types is not None:
            for name, dtype in zip(df.columns, self.types):
                try:
                    df[name] = df[name].astype(dtype)
                except (TypeError, ValueError):
                    # Best effort: keep the values as stored when they no
                    # longer fit the recorded dtype.
                    continue

        if self.y is not None:
            df[self.label] = self.y
        return df


In [216]:
from typing import Tuple, Sequence

import numpy as np
import pandas as pd

# Load the Titanic passenger table from the Colab session's /content directory.
data = pd.read_csv('/content/titanic_dataset.csv')

# Wrap the table in a Dataset. No label name is passed, so every column is
# kept as a feature and the dataset has no label vector.
cena = Dataset.from_dataframe(data)


# Count the missing entries of the 'Cabin' feature. As the last expression of
# the cell its value is what the notebook displays (recorded output: 687).
cena.null_counter('Cabin')



687