<a href="https://colab.research.google.com/github/Saeidhoseinipour/100Data/blob/main/100Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install pydataset
!pip install lifelines

Collecting pydataset
  Downloading pydataset-0.2.0.tar.gz (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pydataset
  Building wheel for pydataset (setup.py) ... [?25l[?25hdone
  Created wheel for pydataset: filename=pydataset-0.2.0-py3-none-any.whl size=15939417 sha256=b4bdfe708ead782a6fa9e527c289698fb11edea431c48a7de449cc4edbb05067
  Stored in directory: /root/.cache/pip/wheels/29/93/3f/af54c413cecaac292940342c61882d2a8848674175d0bb0889
Successfully built pydataset
Installing collected packages: pydataset
Successfully installed pydataset-0.2.0


In [6]:
import pandas as pd
import numpy as np
import logging
from sklearn.datasets import make_classification, load_diabetes, load_breast_cancer
from pydataset import data
import seaborn as sns

# Set up logging.
# force=True replaces any handlers already attached to the root logger; without it
# basicConfig() is a silent no-op whenever the host (e.g. a notebook front end) has
# configured logging first, and the INFO messages emitted below would not be shown.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

class BioinformaticsDatasetCleaner:
    """
    Load built-in or file-based datasets, clean them, and keep the results with metadata.

    Names listed in ``self.builtin_datasets`` are loaded through their registered loader;
    any other name is treated as a file path whose extension selects the pandas reader.
    """

    # Lower-cased file suffix -> (label used in the log message, pandas reader function).
    _FILE_READERS = {
        '.csv': ('CSV', pd.read_csv),
        '.xlsx': ('Excel', pd.read_excel),
        '.xls': ('Excel', pd.read_excel),
        '.json': ('JSON', pd.read_json),
        '.parquet': ('Parquet', pd.read_parquet),
        '.dta': ('Stata', pd.read_stata),
    }

    def __init__(self, dataset_names=None, random_state=None):
        """
        Initialize the class with a list of dataset names or use built-in datasets.
        :param dataset_names: List of built-in dataset names and/or dataset filenames
            (e.g., ['iris', 'data1.csv', 'data2.xlsx']). None means "all built-in datasets".
        :param random_state: Optional integer seed for the synthetic loaders. When None
            (the default) they draw from NumPy's global random state, as before.
        """
        self.dataset_names = dataset_names
        self.cleaned_datasets = {}
        # np.random (module) and np.random.RandomState expose the same rand/choice/randint
        # API, so the loaders can use either one interchangeably.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.builtin_datasets = {
            'titanic': {
                'loader': self.load_titanic_dataset,
                'type': 'qualitative',
                'features': ['class', 'age', 'sex', 'survived']
            },
            'cancer': {
                'loader': self.load_cancer_dataset,
                'type': 'qualitative',
                'features': ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'target']
            },
            'synthetic': {
                'loader': self.load_synthetic_dataset,
                'type': 'quantitative',
                'features': [f'feature_{i}' for i in range(10)] + ['target']
            },
            'diabetes': {
                'loader': self.load_diabetes_dataset,
                'type': 'quantitative',
                'features': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6', 'target']
            },
            'iris': {
                'loader': self.load_iris_dataset,
                'type': 'quantitative',
                'features': ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
            },
            'tips': {
                'loader': self.load_tips_dataset,
                'type': 'quantitative',
                'features': ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
            },
            'flights': {
                'loader': self.load_flights_dataset,
                'type': 'quantitative',
                'features': ['year', 'month', 'passengers']
            },
            'mtcars': {
                'loader': self.load_mtcars_dataset,
                'type': 'quantitative',
                'features': ['mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
            },
            'diamonds': {
                'loader': self.load_diamonds_dataset,
                'type': 'quantitative',
                'features': ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
            },
            'sleep': {
                'loader': self.load_sleep_dataset,
                'type': 'quantitative',
                'features': ['extra', 'group', 'ID']
            },
            # NOTE(review): the loader returns one row per gene with columns
            # 'gene', 'sample_0'..'sample_19'; the names listed here are not columns
            # of that frame. Confirm which orientation is intended.
            'gene_expression': {
                'loader': self.load_gene_expression_dataset,
                'type': 'quantitative',
                'features': [f'gene_{i}' for i in range(100)] + ['sample']
            },
            'cell_cycle': {
                'loader': self.load_cell_cycle_dataset,
                'type': 'qualitative',
                'features': ['cell', 'phase']
            },
            # NOTE(review): the loader's frame also carries a leading 'protein' column
            # that is not listed here.
            'protein_interaction': {
                'loader': self.load_protein_interaction_dataset,
                'type': 'network',
                'features': [f'protein_{i}' for i in range(10)]
            },
            # NOTE(review): same orientation mismatch as 'gene_expression' -- the loader
            # returns columns 'gene', 'patient_0'..'patient_49'.
            'tcga_brca': {
                'loader': self.load_tcga_brca_dataset,
                'type': 'quantitative',
                'features': [f'gene_{i}' for i in range(100)] + ['patient']
            }
        }

    def read_dataset(self, dataset_name):
        """
        Read a dataset based on its file extension or load a built-in dataset.
        :param dataset_name: Name of a built-in dataset, or a path (str or path-like)
            ending in .csv, .xlsx, .xls, .json, .parquet or .dta (case-insensitive).
        :return: Pandas DataFrame.
        :raises ValueError: If the name is neither built-in nor a supported extension.
        """
        if dataset_name in self.builtin_datasets:
            logging.info(f"Loading built-in dataset: {dataset_name}")
            return self.builtin_datasets[dataset_name]['loader']()

        # Compare on a lower-cased string so 'DATA.CSV' and pathlib.Path inputs work too.
        lowered_name = str(dataset_name).lower()
        for suffix, (label, reader) in self._FILE_READERS.items():
            if lowered_name.endswith(suffix):
                logging.info(f"Reading {label} file: {dataset_name}")
                return reader(dataset_name)

        raise ValueError(f"Unsupported file format or dataset: {dataset_name}")

    def clean_dataset(self, df):
        """
        Clean a dataset by handling missing values, duplicates, and formatting.

        The input frame is left untouched; all work happens on a copy.
        :param df: Input Pandas DataFrame.
        :return: Cleaned Pandas DataFrame (a new object).
        """
        logging.info("Cleaning dataset...")
        # Forward fill, then backward fill whatever is still missing at the top,
        # then drop exact duplicate rows. The trailing copy() detaches the result from
        # its parent so the column assignments below do not warn.
        cleaned = df.ffill().bfill().drop_duplicates().copy()

        # Convert columns to numeric where every value parses as a number.
        for col, dtype in cleaned.dtypes.items():
            # to_numeric would turn datetime/timedelta columns into raw integers.
            if pd.api.types.is_datetime64_any_dtype(dtype) or pd.api.types.is_timedelta64_dtype(dtype):
                continue
            try:
                cleaned[col] = pd.to_numeric(cleaned[col])
            except (ValueError, TypeError):
                pass  # Leave columns that cannot be converted to numeric as they are

        logging.info("Dataset cleaned successfully.")
        return cleaned

    def process_all_datasets(self):
        """
        Process all datasets in the list.

        Built-in datasets get the 'type' and 'features' registered for them. File-based
        datasets have no registered metadata, so they are stored with type 'unknown'
        and the column names of the cleaned frame as features.
        :return: Dictionary of cleaned datasets with metadata.
        """
        if self.dataset_names is None:
            self.dataset_names = list(self.builtin_datasets.keys())

        # A bare string would otherwise be iterated character by character.
        names = [self.dataset_names] if isinstance(self.dataset_names, str) else self.dataset_names

        for dataset_name in names:
            logging.info(f"Processing dataset: {dataset_name}")
            df = self.read_dataset(dataset_name)
            cleaned_df = self.clean_dataset(df)

            builtin_meta = self.builtin_datasets.get(dataset_name)
            if builtin_meta is not None:
                dataset_type = builtin_meta['type']
                features = builtin_meta['features']
            else:
                dataset_type = 'unknown'
                features = list(cleaned_df.columns)

            self.cleaned_datasets[dataset_name] = {
                'data': cleaned_df,
                'type': dataset_type,
                'features': features
            }
            logging.info(f"Cleaned dataset: {dataset_name}\n{cleaned_df.head()}\n")

        return self.cleaned_datasets

    def get_cleaned_dataset(self, dataset_name):
        """
        Get a specific cleaned dataset by name.
        :param dataset_name: Name of the dataset.
        :return: Dictionary with the cleaned DataFrame ('data'), 'type', and 'features'.
        :raises KeyError: If the dataset has not been processed.
        """
        if dataset_name in self.cleaned_datasets:
            return self.cleaned_datasets[dataset_name]
        else:
            raise KeyError(f"Dataset {dataset_name} not found in cleaned datasets.")

    def get_all_datasets(self):
        """
        Return all cleaned datasets in dictionary format.
        :return: Dictionary of all cleaned datasets.
        """
        return self.cleaned_datasets

    # Built-in dataset loaders
    def load_titanic_dataset(self):
        """Load the Titanic dataset from pydataset."""
        return data('titanic')

    def load_cancer_dataset(self):
        """Load the Breast Cancer dataset from scikit-learn."""
        # Local name deliberately differs from the imported pydataset `data` function.
        cancer_bunch = load_breast_cancer(as_frame=True)
        return cancer_bunch.frame

    def load_synthetic_dataset(self):
        """Generate a synthetic dataset for classification."""
        X, y = make_classification(n_samples=100, n_features=10, random_state=42)
        return pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)]).assign(target=y)

    def load_diabetes_dataset(self):
        """Load the Diabetes dataset from scikit-learn."""
        return load_diabetes(as_frame=True).frame

    def load_iris_dataset(self):
        """Load the Iris dataset from seaborn."""
        return sns.load_dataset('iris')

    def load_tips_dataset(self):
        """Load the Tips dataset from seaborn."""
        return sns.load_dataset('tips')

    def load_flights_dataset(self):
        """Load the Flights dataset from seaborn."""
        return sns.load_dataset('flights')

    def load_mtcars_dataset(self):
        """Load the mtcars dataset from pydataset."""
        return data('mtcars')

    def load_diamonds_dataset(self):
        """Load the Diamonds dataset from pydataset."""
        return data('diamonds')

    def load_sleep_dataset(self):
        """Load the Sleep dataset from pydataset."""
        return data('sleep')

    def load_gene_expression_dataset(self):
        """Generate a synthetic gene expression dataset (100 gene rows x 20 sample columns)."""
        genes = [f'gene_{i}' for i in range(100)]
        samples = [f'sample_{i}' for i in range(20)]
        expression = self._rng.rand(100, 20)  # Random gene expression values
        return pd.DataFrame(expression, index=genes, columns=samples).reset_index().rename(columns={'index': 'gene'})

    def load_cell_cycle_dataset(self):
        """Generate a synthetic cell cycle dataset (50 cells, one random phase each)."""
        cells = [f'cell_{i}' for i in range(50)]
        phases = ['G1', 'S', 'G2', 'M']
        assigned_phases = self._rng.choice(phases, size=50)  # Random cell cycle phases
        return pd.DataFrame({'cell': cells, 'phase': assigned_phases})

    def load_protein_interaction_dataset(self):
        """Generate a synthetic protein-protein interaction dataset (10 x 10 binary matrix)."""
        proteins = [f'protein_{i}' for i in range(10)]
        interactions = self._rng.randint(0, 2, size=(10, 10))  # Random binary interactions
        return pd.DataFrame(interactions, index=proteins, columns=proteins).reset_index().rename(columns={'index': 'protein'})

    def load_tcga_brca_dataset(self):
        """Generate a synthetic TCGA BRCA dataset (100 gene rows x 50 patient columns)."""
        genes = [f'gene_{i}' for i in range(100)]
        patients = [f'patient_{i}' for i in range(50)]
        expression = self._rng.rand(100, 50)  # Random gene expression values
        return pd.DataFrame(expression, index=genes, columns=patients).reset_index().rename(columns={'index': 'gene'})

# Example usage
if __name__ == "__main__":
    # With no names given, the cleaner falls back to its built-in dataset catalogue
    cleaner = BioinformaticsDatasetCleaner()

    # Load and clean every catalogued dataset
    cleaned_datasets = cleaner.process_all_datasets()

    # Look up a single cleaned entry and report its preview and metadata
    cleaned_gene_expression = cleaner.get_cleaned_dataset('gene_expression')
    print(f"Cleaned Gene Expression Dataset:\n {cleaned_gene_expression['data'].head()}")
    print(f"Dataset Type: {cleaned_gene_expression['type']}")
    print(f"Dataset Features: {cleaned_gene_expression['features']}")

Cleaned Gene Expression Dataset:
      gene  sample_0  sample_1  sample_2  sample_3  sample_4  sample_5  sample_6  sample_7  \
0  gene_0  0.771814  0.825179  0.048663  0.566476  0.760438  0.190926  0.866722  0.565902   
1  gene_1  0.169331  0.106842  0.165393  0.606631  0.062212  0.518681  0.761699  0.417028   
2  gene_2  0.811991  0.593594  0.395272  0.768286  0.516349  0.525414  0.241548  0.881177   
3  gene_3  0.572478  0.690040  0.103846  0.078084  0.106014  0.180196  0.434550  0.692425   
4  gene_4  0.873184  0.621802  0.396909  0.371167  0.242977  0.916439  0.050170  0.648201   

   sample_8  ...  sample_10  sample_11  sample_12  sample_13  sample_14  sample_15  sample_16  \
0  0.074470  ...   0.387757   0.623801   0.041448   0.995097   0.115277   0.142609   0.795275   
1  0.748868  ...   0.478421   0.117448   0.659518   0.749848   0.859487   0.487745   0.694591   
2  0.037793  ...   0.566892   0.504087   0.989449   0.788174   0.403537   0.028092   0.011691   
3  0.998204  ...   

In [8]:
import pandas as pd
import numpy as np
import logging
from sklearn.datasets import make_classification, load_diabetes, load_breast_cancer
from pydataset import data
import seaborn as sns

# Set up logging.
# force=True replaces any handlers already attached to the root logger; without it
# basicConfig() is a silent no-op whenever logging has been configured before this
# cell runs, and the INFO messages emitted below would not be shown.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

class BioinformaticsDatasetCleaner:
    """
    Load built-in or file-based datasets, clean them, and keep the results with metadata.

    Names listed in ``self.builtin_datasets`` are loaded through their registered loader;
    any other name is treated as a file path whose extension selects the pandas reader.
    """

    # Lower-cased file suffix -> (label used in the log message, pandas reader function).
    _FILE_READERS = {
        '.csv': ('CSV', pd.read_csv),
        '.xlsx': ('Excel', pd.read_excel),
        '.xls': ('Excel', pd.read_excel),
        '.json': ('JSON', pd.read_json),
        '.parquet': ('Parquet', pd.read_parquet),
        '.dta': ('Stata', pd.read_stata),
    }

    def __init__(self, dataset_names=None, random_state=None):
        """
        Initialize the class with a list of dataset names or use built-in datasets.
        :param dataset_names: List of built-in dataset names and/or dataset filenames
            (e.g., ['iris', 'data1.csv', 'data2.xlsx']). None means "all built-in datasets".
        :param random_state: Optional integer seed for the synthetic loaders. When None
            (the default) they draw from NumPy's global random state, as before.
        """
        self.dataset_names = dataset_names
        self.cleaned_datasets = {}
        # np.random (module) and np.random.RandomState expose the same rand/choice/randint
        # API, so the loaders can use either one interchangeably.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.builtin_datasets = {
            'titanic': {
                'loader': self.load_titanic_dataset,
                'type': 'qualitative',
                'features': ['class', 'age', 'sex', 'survived']
            },
            'cancer': {
                'loader': self.load_cancer_dataset,
                'type': 'qualitative',
                'features': ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'target']
            },
            'synthetic': {
                'loader': self.load_synthetic_dataset,
                'type': 'quantitative',
                'features': [f'feature_{i}' for i in range(10)] + ['target']
            },
            'diabetes': {
                'loader': self.load_diabetes_dataset,
                'type': 'quantitative',
                'features': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6', 'target']
            },
            'iris': {
                'loader': self.load_iris_dataset,
                'type': 'quantitative',
                'features': ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
            },
            'tips': {
                'loader': self.load_tips_dataset,
                'type': 'quantitative',
                'features': ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
            },
            'flights': {
                'loader': self.load_flights_dataset,
                'type': 'quantitative',
                'features': ['year', 'month', 'passengers']
            },
            'mtcars': {
                'loader': self.load_mtcars_dataset,
                'type': 'quantitative',
                'features': ['mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
            },
            'diamonds': {
                'loader': self.load_diamonds_dataset,
                'type': 'quantitative',
                'features': ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
            },
            'sleep': {
                'loader': self.load_sleep_dataset,
                'type': 'quantitative',
                'features': ['extra', 'group', 'ID']
            },
            # NOTE(review): the loader returns one row per gene with columns
            # 'gene', 'sample_0'..'sample_19'; the names listed here are not columns
            # of that frame. Confirm which orientation is intended.
            'gene_expression': {
                'loader': self.load_gene_expression_dataset,
                'type': 'quantitative',
                'features': [f'gene_{i}' for i in range(100)] + ['sample']
            },
            'cell_cycle': {
                'loader': self.load_cell_cycle_dataset,
                'type': 'qualitative',
                'features': ['cell', 'phase']
            },
            # NOTE(review): the loader's frame also carries a leading 'protein' column
            # that is not listed here.
            'protein_interaction': {
                'loader': self.load_protein_interaction_dataset,
                'type': 'network',
                'features': [f'protein_{i}' for i in range(10)]
            },
            # NOTE(review): same orientation mismatch as 'gene_expression' -- the loader
            # returns columns 'gene', 'patient_0'..'patient_49'.
            'tcga_brca': {
                'loader': self.load_tcga_brca_dataset,
                'type': 'quantitative',
                'features': [f'gene_{i}' for i in range(100)] + ['patient']
            }
        }

    def read_dataset(self, dataset_name):
        """
        Read a dataset based on its file extension or load a built-in dataset.
        :param dataset_name: Name of a built-in dataset, or a path (str or path-like)
            ending in .csv, .xlsx, .xls, .json, .parquet or .dta (case-insensitive).
        :return: Pandas DataFrame.
        :raises ValueError: If the name is neither built-in nor a supported extension.
        """
        if dataset_name in self.builtin_datasets:
            logging.info(f"Loading built-in dataset: {dataset_name}")
            return self.builtin_datasets[dataset_name]['loader']()

        # Compare on a lower-cased string so 'DATA.CSV' and pathlib.Path inputs work too.
        lowered_name = str(dataset_name).lower()
        for suffix, (label, reader) in self._FILE_READERS.items():
            if lowered_name.endswith(suffix):
                logging.info(f"Reading {label} file: {dataset_name}")
                return reader(dataset_name)

        raise ValueError(f"Unsupported file format or dataset: {dataset_name}")

    def clean_dataset(self, df):
        """
        Clean a dataset by handling missing values, duplicates, and formatting.

        The input frame is left untouched; all work happens on a copy.
        :param df: Input Pandas DataFrame.
        :return: Cleaned Pandas DataFrame (a new object).
        """
        logging.info("Cleaning dataset...")
        # Forward fill, then backward fill whatever is still missing at the top,
        # then drop exact duplicate rows. The trailing copy() detaches the result from
        # its parent so the column assignments below do not warn.
        cleaned = df.ffill().bfill().drop_duplicates().copy()

        # Convert columns to numeric where every value parses as a number.
        for col, dtype in cleaned.dtypes.items():
            # to_numeric would turn datetime/timedelta columns into raw integers.
            if pd.api.types.is_datetime64_any_dtype(dtype) or pd.api.types.is_timedelta64_dtype(dtype):
                continue
            try:
                cleaned[col] = pd.to_numeric(cleaned[col])
            except (ValueError, TypeError):
                pass  # Leave columns that cannot be converted to numeric as they are

        logging.info("Dataset cleaned successfully.")
        return cleaned

    def process_all_datasets(self):
        """
        Process all datasets in the list.

        Built-in datasets get the 'type' and 'features' registered for them. File-based
        datasets have no registered metadata, so they are stored with type 'unknown'
        and the column names of the cleaned frame as features.
        :return: Dictionary of cleaned datasets with metadata.
        """
        if self.dataset_names is None:
            self.dataset_names = list(self.builtin_datasets.keys())

        # A bare string would otherwise be iterated character by character.
        names = [self.dataset_names] if isinstance(self.dataset_names, str) else self.dataset_names

        for dataset_name in names:
            logging.info(f"Processing dataset: {dataset_name}")
            df = self.read_dataset(dataset_name)
            cleaned_df = self.clean_dataset(df)

            builtin_meta = self.builtin_datasets.get(dataset_name)
            if builtin_meta is not None:
                dataset_type = builtin_meta['type']
                features = builtin_meta['features']
            else:
                dataset_type = 'unknown'
                features = list(cleaned_df.columns)

            self.cleaned_datasets[dataset_name] = {
                'data': cleaned_df,
                'type': dataset_type,
                'features': features
            }
            logging.info(f"Cleaned dataset: {dataset_name}\n{cleaned_df.head()}\n")

        return self.cleaned_datasets

    def get_all_datasets(self):
        """
        Return all cleaned datasets in dictionary format.
        :return: Dictionary of all cleaned datasets.
        """
        return self.cleaned_datasets

    def get_cleaned_dataset(self, dataset_name):
        """
        Get a specific cleaned dataset by name.
        :param dataset_name: Name of the dataset.
        :return: Dictionary with the cleaned DataFrame ('data'), 'type', and 'features'.
        :raises KeyError: If the dataset has not been processed.
        """
        if dataset_name in self.cleaned_datasets:
            return self.cleaned_datasets[dataset_name]
        else:
            raise KeyError(f"Dataset {dataset_name} not found in cleaned datasets.")

    # Built-in dataset loaders
    def load_titanic_dataset(self):
        """Load the Titanic dataset from pydataset."""
        return data('titanic')

    def load_cancer_dataset(self):
        """Load the Breast Cancer dataset from scikit-learn."""
        # Local name deliberately differs from the imported pydataset `data` function.
        cancer_bunch = load_breast_cancer(as_frame=True)
        return cancer_bunch.frame

    def load_synthetic_dataset(self):
        """Generate a synthetic dataset for classification."""
        X, y = make_classification(n_samples=100, n_features=10, random_state=42)
        return pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)]).assign(target=y)

    def load_diabetes_dataset(self):
        """Load the Diabetes dataset from scikit-learn."""
        return load_diabetes(as_frame=True).frame

    def load_iris_dataset(self):
        """Load the Iris dataset from seaborn."""
        return sns.load_dataset('iris')

    def load_tips_dataset(self):
        """Load the Tips dataset from seaborn."""
        return sns.load_dataset('tips')

    def load_flights_dataset(self):
        """Load the Flights dataset from seaborn."""
        return sns.load_dataset('flights')

    def load_mtcars_dataset(self):
        """Load the mtcars dataset from pydataset."""
        return data('mtcars')

    def load_diamonds_dataset(self):
        """Load the Diamonds dataset from pydataset."""
        return data('diamonds')

    def load_sleep_dataset(self):
        """Load the Sleep dataset from pydataset."""
        return data('sleep')

    def load_gene_expression_dataset(self):
        """Generate a synthetic gene expression dataset (100 gene rows x 20 sample columns)."""
        genes = [f'gene_{i}' for i in range(100)]
        samples = [f'sample_{i}' for i in range(20)]
        expression = self._rng.rand(100, 20)  # Random gene expression values
        return pd.DataFrame(expression, index=genes, columns=samples).reset_index().rename(columns={'index': 'gene'})

    def load_cell_cycle_dataset(self):
        """Generate a synthetic cell cycle dataset (50 cells, one random phase each)."""
        cells = [f'cell_{i}' for i in range(50)]
        phases = ['G1', 'S', 'G2', 'M']
        assigned_phases = self._rng.choice(phases, size=50)  # Random cell cycle phases
        return pd.DataFrame({'cell': cells, 'phase': assigned_phases})

    def load_protein_interaction_dataset(self):
        """Generate a synthetic protein-protein interaction dataset (10 x 10 binary matrix)."""
        proteins = [f'protein_{i}' for i in range(10)]
        interactions = self._rng.randint(0, 2, size=(10, 10))  # Random binary interactions
        return pd.DataFrame(interactions, index=proteins, columns=proteins).reset_index().rename(columns={'index': 'protein'})

    def load_tcga_brca_dataset(self):
        """Generate a synthetic TCGA BRCA dataset (100 gene rows x 50 patient columns)."""
        genes = [f'gene_{i}' for i in range(100)]
        patients = [f'patient_{i}' for i in range(50)]
        expression = self._rng.rand(100, 50)  # Random gene expression values
        return pd.DataFrame(expression, index=genes, columns=patients).reset_index().rename(columns={'index': 'gene'})

# Example usage
if __name__ == "__main__":
    # With no names given, the cleaner falls back to its built-in dataset catalogue
    cleaner = BioinformaticsDatasetCleaner()

    # Load and clean every catalogued dataset
    cleaned_datasets = cleaner.process_all_datasets()

    # Fetch the full name -> {data, type, features} mapping
    all_datasets = cleaner.get_all_datasets()

    # Report each entry: name, type, feature list, then a preview of the data
    for dataset_name, dataset_info in all_datasets.items():
        print(
            f"Dataset: {dataset_name}",
            f"Type: {dataset_info['type']}",
            f"Features: {dataset_info['features']}",
            f"Data:\n{dataset_info['data'].head()}\n",
            sep="\n",
        )

Dataset: titanic
Type: qualitative
Features: ['class', 'age', 'sex', 'survived']
Data:
         class     age    sex survived
1    1st class  adults    man      yes
58   1st class  adults    man       no
176  1st class  adults  women      yes
316  1st class  adults  women       no
320  1st class   child    man      yes

Dataset: cancer
Type: qualitative
Features: ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'target']
Data:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  \
0        17.99         10.38          122.80     1001.0          0.11840           0.27760   
1        20.57         17.77          132.90     1326.0          0.08474           0.07864   
2        19.69         21.25          130.00     1203.0          0.10960           0.15990   
3        11.42         20.38           77.58      386.1 