In [None]:
# Dataset overview function
def check_df(dataframe, head=5):
    """Print a quick structural and statistical overview of a DataFrame.

    The sections are printed in this order: shape, dtypes, null count per
    column, ``info()`` report, first ``head`` rows, last ``head`` rows and the
    transposed ``describe()`` table.

    Args:
        dataframe: The pandas DataFrame to summarise.
        head: Number of rows shown in both the HEAD and the TAIL section.

    Returns:
        None. Everything is written to standard output.
    """
    print("######################### SHAPE #########################")
    print(dataframe.shape)
    print("######################### DTYPES #########################")
    print(dataframe.dtypes)
    print("######################### NULL VALUES #########################")
    print(dataframe.isnull().sum())
    print("######################### INFO #########################")
    # info() writes its report itself and returns None; wrapping it in
    # print() would add a stray "None" line after the report.
    dataframe.info()
    print("######################### HEAD #########################")
    print(dataframe.head(head))
    print("######################### TAIL #########################")
    print(dataframe.tail(head))
    print("######################### DESCRIBE #########################")
    print(dataframe.describe().T)
    
# Print the overview for `dataframe`. That name is not defined in this cell --
# presumably it is loaded by an earlier notebook cell; verify before running.
check_df(dataframe)

In [None]:
# Function to print the unique values of every column
def check_uniques(dataframe):
    """Print the distinct values of every column of a DataFrame.

    For each column a spacer line is printed, then the column label in upper
    case, then the result of ``unique()`` for that column.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        None. Everything is written to standard output.
    """
    for col in dataframe.columns:
        print(' '*20)
        # Column labels are not necessarily strings (e.g. integer labels), so
        # convert before upper-casing instead of failing with AttributeError.
        print(str(col).upper())
        print(dataframe[col].unique())
# List the unique values per column of `dataframe`. That name is not defined in
# this cell -- presumably it comes from an earlier notebook cell; verify.
check_uniques(dataframe)

In [None]:
# Plot histogram function
def view_histogram(dataframe, columns=None):
    """Plot a histogram with a KDE curve per column and print its statistics.

    For every selected column a 5x5 figure is shown with the histogram and a
    dashed red vertical line at the mean. After the figure, the minimum,
    maximum, mean, standard deviation, skewness and kurtosis are printed.

    Args:
        dataframe: The pandas DataFrame holding the data.
        columns: Iterable of column labels to plot. When None, the columns
            with dtype ``int64`` or ``float64`` are used.

    Returns:
        None.
    """
    if columns is None:
        selected = dataframe.select_dtypes(include=['int64', 'float64']).columns
    else:
        selected = columns

    for name in selected:
        series = dataframe[name]
        average = series.mean()

        plt.figure(figsize=(5, 5))
        plt.grid(True)
        plt.title(series.name)
        sns.histplot(series, kde=True)
        plt.axvline(average, color='red', linestyle='dashed', linewidth=2, label='Mean')
        plt.legend()
        plt.show()

        # Each statistic is computed right before it is printed, one per line.
        report = (
            ('Min:', series.min),
            ('Max:', series.max),
            ('Mean:', lambda: average),
            ('Std:', series.std),
            ('Skewness:', series.skew),
            ('Kurtosis', series.kurt),
        )
        for caption, compute in report:
            print(caption, compute())
        
# Plot histograms for `dataframe` using the default column selection. That name
# is not defined in this cell -- presumably set by an earlier cell; verify.
view_histogram(dataframe)

In [None]:
# Plot countplot function
def show_countplot(dataframe, feature, labels_rotation=0):
    """Draw a count plot of ``feature`` with a percentage label above each bar.

    Each label is the bar height divided by the number of rows in
    ``dataframe``, times 100, formatted with two decimals.

    Args:
        dataframe: The pandas DataFrame holding the data.
        feature: Column label plotted on the x axis.
        labels_rotation: Rotation, in degrees, applied to the x tick labels.

    Returns:
        None.
    """
    plt.figure(figsize=(15, 4))
    n_rows = float(len(dataframe))
    axes = sns.countplot(x=feature, data=dataframe)

    for bar in axes.patches:
        bar_height = bar.get_height()
        # Horizontal centre of the bar; the label sits 3 data units above it.
        x_center = bar.get_x() + bar.get_width() / 2.
        share = (bar_height / n_rows) * 100
        axes.text(x_center, bar_height + 3, f'{share:1.2f}', ha="center")

    plt.xticks(rotation=labels_rotation)

In [None]:
# Bin a numerical feature and count the rows in each bin per category of a categorical feature
def create_group_dataframe(dataframe, feature1, feature2, bins, labels):
    """Count rows per bin of ``feature1`` and per distinct value of ``feature2``.

    Bin ``i`` is named ``labels[i]`` and covers the interval
    ``bins[i] < value <= bins[i + 1]``.

    Args:
        dataframe: The pandas DataFrame holding the data.
        feature1: Label of the column that is binned.
        feature2: Label of the column whose distinct values become the count
            columns.
        bins: Sequence of bin edges; needs at least ``len(labels) + 1`` entries.
        labels: Sequence with one name per bin.

    Returns:
        A DataFrame with one row per label. The first column is named
        ``feature1`` and holds the labels; it is followed by one count column
        per value returned by ``dataframe[feature2].unique()``. Hyphens in
        string column names are replaced by underscores; non-string column
        names are kept as they are.

    Raises:
        IndexError: If ``bins`` has fewer than ``len(labels) + 1`` entries.
    """
    # The distinct categories do not change between bins, so compute them once.
    categories = dataframe[feature2].unique()

    counts = {feature1: []}
    for category in categories:
        counts.setdefault(category, [])

    values = dataframe[feature1]
    groups = dataframe[feature2]

    for i, label in enumerate(labels):
        lower, upper = bins[i], bins[i + 1]
        counts[feature1].append(label)

        # The bin mask is shared by every category of this bin.
        in_bin = (values > lower) & (values <= upper)
        for category in categories:
            counts[category].append(len(dataframe.loc[in_bin & (groups == category)]))

    result = pd.DataFrame(counts)
    # Only string names can contain hyphens; leaving other labels untouched
    # avoids an AttributeError for e.g. integer-coded categories.
    result.columns = [
        col.replace('-', '_') if isinstance(col, str) else col
        for col in result.columns
    ]

    return result

In [None]:
# Shows percentages barplot for two categorical features 
def show_cat_features_barplot(dataframe, feature1, feature2, figsize = (10, 6), fontsize = 9):
    """Draw a grouped count bar plot of two categorical features with percentages.

    The bars show the cross-tabulated counts of ``feature1`` (x axis) and
    ``feature2`` (hue). Each bar is annotated with its share of the total
    count of its ``feature1`` group, formatted as ``xx.xx%``.

    Args:
        dataframe: The pandas DataFrame holding the data.
        feature1: Column label used for the x axis and as percentage base.
        feature2: Column label used for the hue.
        figsize: Figure size in inches passed to ``plt.figure``.
        fontsize: Font size of the percentage annotations.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    counts = pd.crosstab(dataframe[feature1], dataframe[feature2])
    long_counts = counts.reset_index().melt(id_vars=feature1, var_name=feature2, value_name='count')

    # Share of every (feature1, feature2) cell within its feature1 group.
    group_totals = long_counts.groupby(feature1)['count'].transform('sum')
    percentages = (long_counts['count'] / group_totals) * 100

    # Honour the caller-supplied size instead of a hard-coded one.
    plt.figure(figsize=figsize)
    ax = sns.barplot(x=feature1, y='count', hue=feature2, data=long_counts, palette='muted', ci=None)

    # NOTE(review): the labels are matched to bars purely by position, which
    # assumes the bars appear in ax.patches in the row order of the melted
    # table -- confirm for the seaborn version in use. zip() stops at the
    # shorter sequence, so surplus patches are left unlabelled rather than
    # raising StopIteration.
    for bar, percentage in zip(ax.patches, percentages):
        x, y = bar.get_xy()
        label_position = (x + bar.get_width() / 2, y + bar.get_height() + 1)
        ax.annotate(f'{percentage:.2f}%', label_position, ha='center', fontsize=fontsize)

    plt.legend(title=feature2, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(f'{feature2} distribution by {feature1}')
    plt.xlabel(feature1)
    plt.ylabel('Count')

    plt.show()

In [None]:
def create_scatterplot(dataframe, column):
    """Plot every other column of ``dataframe`` against ``column``.

    Each remaining column is drawn as a line with circle markers plus a
    scatter layer, in its own colour from a "husl" palette, and is listed in
    the legend under its column label.

    Args:
        dataframe: The pandas DataFrame holding the data; ``column`` supplies
            the x values and every other column supplies one series.
        column: Label of the column used for the x axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _fig, ax = plt.subplots(figsize=(7, 7))

    # Compute the series list and the palette once instead of per plot call.
    categories = dataframe.columns.unique().drop(column)
    colors = sns.color_palette("husl", len(categories))

    for color, category in zip(colors, categories):
        # `palette` only applies together with `hue`; without it the palette
        # is not used, so pass each series its colour explicitly.
        sns.lineplot(x=column, y=category, data=dataframe, marker='o', markersize=8,
                     label=category, color=color, ax=ax)
        sns.scatterplot(x=column, y=category, data=dataframe, color=color, ax=ax)

    ax.set_ylabel('Count')
    ax.set_title('Count by Category')
    plt.xticks(rotation=45)
    ax.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

In [None]:
def print_percentages(dataframe, column):
    """Print every row's values as percentages of that row's total.

    Args:
        dataframe: DataFrame with the label column ``column``; all other
            columns are summed per row, so they are assumed to be numeric
            counts -- TODO confirm against the callers.
        column: Label of the column that identifies each row. It is left out
            of the totals and re-attached as the first column of the printed
            table.

    Returns:
        None. The header line and the table are written to standard output.
    """
    print(f'Percentage Ratios by {column}')
    # Select the value columns by name rather than by position so the label
    # column does not have to be the first column of the frame.
    values = dataframe.drop(columns=column)
    row_totals = values.sum(axis=1)
    percentage_by_group = values.div(row_totals, axis=0) * 100
    percentage_by_group.insert(0, column, dataframe[column])
    print(percentage_by_group)