In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline
import seaborn as sns

In [2]:
df_train = pd.read_csv('dataset_train.csv', index_col='Index')

In [7]:
df_train.describe()

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
count,1566.0,1568.0,1567.0,1569.0,1561.0,1565.0,1565.0,1557.0,1566.0,1570.0,1560.0,1600.0,1600.0
mean,49634.570243,39.797131,1.14102,-0.387863,3.15391,-224.589915,495.74797,2.963095,1030.096946,5.950373,-0.053427,-243.374409,21.958012
std,16679.806036,520.298268,5.219682,5.212794,4.155301,486.34484,106.285165,4.425775,44.125116,3.147854,0.971457,8.78364,97.631602
min,-24370.0,-966.740546,-10.295663,-10.162119,-8.727,-1086.496835,283.869609,-8.858993,906.62732,-4.697484,-3.313676,-261.04892,-181.47
25%,38511.5,-489.551387,-4.308182,-5.259095,3.099,-577.580096,397.511047,2.218653,1026.209993,3.646785,-0.671606,-250.6526,-41.87
50%,49013.5,260.289446,3.469012,-2.589342,4.624,-419.164294,463.918305,4.378176,1045.506996,5.874837,-0.044811,-244.867765,-2.515
75%,60811.25,524.771949,5.419183,4.90468,5.667,254.994857,597.49223,5.825242,1058.43641,8.248173,0.589919,-232.552305,50.56
max,104956.0,1016.21194,11.612895,9.667405,10.032,1092.388611,745.39622,11.889713,1098.958201,13.536762,3.056546,-225.42814,279.07


#### Description class

In [6]:
# TODO: Implement percentiles

class Description:
    indexes = ['count',
#                'nan', 'Non nan',
               'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    
    def __init__(self, df: pd.DataFrame):
        self.dataframe = df.copy()
        self.numeric_df = self.dataframe._get_numeric_data()
        self.clean_df = self.numeric_df.dropna(how='any', inplace=False)
        self.numeric_features = list(self.numeric_df.columns)

    def describe(self) -> pd.DataFrame:
        descrtiption_df = pd.DataFrame(index=self.indexes, columns=self.numeric_features)
        # MAKE NAN as bonus
        descrtiption_df.loc['count', :] = self._count()
        descrtiption_df.loc['mean', :] = self._mean()
        descrtiption_df.loc['std', :] = self._std()
        descrtiption_df.loc['min', :] = self._min()
        descrtiption_df.loc['max', :] = self._max()
        return descrtiption_df
    
    @staticmethod
    def __filter_nan(df, feature) -> pd.Series:
        not_null_indexes = df[feature].notnull()
        filtered_series = df[feature][not_null_indexes]
        return filtered_series
        
    def _count(self):
        data = dict()
        for feature in self.numeric_features:
            not_null_values = self.numeric_df[feature].notnull()
            data[feature] = len(self.numeric_df[not_null_values])
        return pd.Series(data=data)
    
    def _mean(self):
        data = dict()
        for feature in self.numeric_features:
            filtered_series = self.__filter_nan(self.numeric_df, feature)
            data[feature] = filtered_series.sum() / len(filtered_series)
        return pd.Series(data=data)

    @staticmethod
    def __count_std(values: np.ndarray, mean):
        std = (np.sum((values - mean) ** 2) / len(values)) ** 0.5
        return std
    
    @staticmethod
    def __count__min_max(values: np.ndarray) -> tuple:
        values = values.copy()
        values.sort()
        _min, *_, _max = values
        return _min, _max

    def _std(self):
        mean_series = self._mean()
        data = dict()
        for feature in self.numeric_features:
            filtered_series = self.__filter_nan(self.numeric_df, feature)
            data[feature] = self.__count_std(filtered_series.values, mean_series[feature])
        return pd.Series(data=data)
    
    # TODO: Make _min, _max in one method
    def _min(self):
        data = dict()
        for feature in self.numeric_features:
            filtered_series = self.__filter_nan(self.numeric_df, feature)
            data[feature], _ = self.__count__min_max(filtered_series.values)
        return pd.Series(data=data)
    
    def _max(self):
        data = dict()
        for feature in self.numeric_features:
            filtered_series = self.__filter_nan(self.numeric_df, feature)
            _, data[feature] = self.__count__min_max(filtered_series.values)
        return pd.Series(data=data)

In [4]:
description = Description(df_train)
description.describe()

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
count,1566.0,1568.0,1567.0,1569.0,1561.0,1565.0,1565.0,1557.0,1566.0,1570.0,1560.0,1600.0,1600.0
mean,49634.6,39.7971,1.14102,-0.387863,3.15391,-224.59,495.748,2.96309,1030.1,5.95037,-0.0534271,-243.374,21.958
std,16674.5,520.132,5.21802,5.21113,4.15397,486.189,106.251,4.42435,44.111,3.14685,0.971146,8.78089,97.6011
min,-24370.0,-966.741,-10.2957,-10.1621,-8.727,-1086.5,283.87,-8.85899,906.627,-4.69748,-3.31368,-261.049,-181.47
25%,,,,,,,,,,,,,
50%,,,,,,,,,,,,,
75%,,,,,,,,,,,,,
max,104956.0,1016.21,11.6129,9.66741,10.032,1092.39,745.396,11.8897,1098.96,13.5368,3.05655,-225.428,279.07
