# Machine Learning task: Predicting response to cancer immunotherapy
by: __Pawel Rosikiewicz__ www.SimpleAI.ch   
provided by: __Ardigen__  

## Notebook purpose: Set up the working environment
* it contains a set of standard functions that I use to organize the files in the project

In [4]:
# imports,
import os # allow changing, and navigating files and folders, 
import sys
import re # module to use regular expressions, 
import glob # lists names in folders that match Unix shell patterns
import pandas as pd

In [7]:
# Scratch check: does pandas treat an empty string as a missing value?
# (the recorded cell output is False, i.e. "" is not considered null)
a = ""
pd.isnull(a)

False

In [None]:
# Project root = parent of the current working directory
# (presumably the notebook is started from a sub-folder of the project,
#  e.g. notebooks/ -- TODO confirm).
basedir = os.path.dirname(os.getcwd())
# Make the project root the working directory.
# NOTE(review): basedir is derived from the *current* working directory, so
# re-running this cell in the same kernel moves one more level up each time.
os.chdir(basedir)
# Put the project root on the module search path so that code located under it
# can be imported.
sys.path.append(basedir)
print(basedir) # should be ../myproject/

In [None]:
# Folders holding the different types of data/code used by the notebooks.
# Maps a short description of each folder's purpose (key) to the folder's
# path under basedir (value). Sub-folders are listed after their parent folder.
files_to_create = {
          "bin": os.path.join(basedir, "bin"),
          "dev notes": os.path.join(basedir, "notes"),
          # notebooks and models
          "for jupyter notebooks": os.path.join(basedir, "notebooks"),
          "to store models, pretrained NNs etc. used in the project": os.path.join(basedir, "models"),
          # source code
          "for tools in .py format": os.path.join(basedir, "src"),
          "custom funcitons and classes created for the project": os.path.join(basedir, "src/utils"),
          "config files": os.path.join(basedir, "src/configs"),
          # data
          "to store data and resuls": os.path.join(basedir, "data"),
          "raw data from external providers": os.path.join(basedir, "data/raw"),
          "intermediate data": os.path.join(basedir, "data/interim"),
          "for final results, predictions, summaries, reports, etc..": os.path.join(basedir, "data/results")
      }

In [None]:
# Create the folder structure for the project.
# Each entry of files_to_create maps a description (key) to a folder path (value).
for file_function, folder_path in files_to_create.items():
    try:
        # makedirs also creates missing parent folders, so the result does not
        # depend on the order of the entries in files_to_create.
        os.makedirs(folder_path)
    except FileExistsError:
        # Folder is already there - keep going, this is the expected case on re-runs.
        print("file", file_function, " - - - was already created")
    except OSError as err:
        # Any other failure (permissions, invalid path, ...) is reported as such
        # instead of being mislabelled as "already created".
        print("file", file_function, " - - - could not be created:", err)

In [1]:
# Function, ......................... 
def annotated_barplot(*, 
    # input data
    data_examples, 
    top_val_perc,                                                                        
    df_filter, 
    
    # plot aesthetics
    plot_title="", 
    fig_size=(12,12), 
    fontsize_scale=1,
    examples_fontsize_scale=1,
    group_size=5,
    barplot_cmap="tab10",
    cmap_from=0, 
    cmap_to=0.5,
    adjust_top=0.8
):
    '''
        Draws a horizontal stacked bar plot giving a quick overview of the 
        columns of a large DataFrame: one bar per selected row of data_examples, 
        annotated with the column name and dtype, the % of non-missing data, 
        a short summary and examples of values.
        
        The function uses the module-level names np, pd, re, plt, mpatches and colors
        (presumably numpy, pandas, re, matplotlib.pyplot, matplotlib.patches and 
        matplotlib.colors - they must be imported by the surrounding code).
        
        Parameters/Input              
        _________________   _______________________________________________________________________________ 
        
        . Input .
        * data_examples     DataFrame with large Dataframe summary, 
                            generated with  summarize_data_and_give_examples()
                            Columns read here: name, dtype, NaN_perc, summary, examples, 
                            nr_of_all_rows_in_original_df
        * top_val_perc      DataFrame with % of the most frequent records in each column 
                            of the summarized dataframe; one row per row in data_examples, 
                            each column becomes one coloured segment of the stacked bar
        * df_filter         list/Series, with True/False for each row in data_examples & top_val_perc 
                            if True, the row will be displayed on barplot
                            
        . Plot aesthetics .
        * plot_title        str, figure title (fig.suptitle)
        * fig_size          tuple, passed as figsize to plt.subplots, in inches
        * fontsize_scale    number, multiplier applied to the base font size (8) of all texts
        * examples_fontsize_scale
                            number, additional multiplier applied only to the text with examples
        * group_size        int, how many bars are displayed as one group on y axis, 
                            groups are divided by space == to one bar.
        * barplot_cmap      str, name of the colormap used for bar segments
        * cmap_from,        numbers, range of the colormap that is sampled to color the segments,
          cmap_to           the last segment (remaining values) is always light grey
        * adjust_top        number, passed to fig.subplots_adjust(top=...)
                            
        Returns             
        _________________   _______________________________________________________________________________
        
        * None              the figure is displayed with plt.show()
        
        Raises
        _________________   _______________________________________________________________________________
        
        * ValueError        if df_filter selects no rows
        
        Comments
        _________________   _______________________________________________________________________________
        
        * calls plt.style.use("classic"), i.e. it changes the global matplotlib style
    '''
    
    # basic fontsize: scaled with fontsize_scale / examples_fontsize_scale
    font_size = 8
    
    # helper, draws one level (segment) of the stacked horizontal bars
    def stacked_barh_one_level(*, f_ax, bar_pos, top, bottom, color, edgecolor, labels):
        f_ax.barh(bar_pos, top, left=bottom, color=color, edgecolor=edgecolor, label=labels, linewidth=0.5, height=0.6)
        return f_ax    
    
    
    # ............................................................
    # Names and group filtering

    # group names,
    group_names = list(data_examples.name.loc[df_filter])
    if not group_names:
        # without this check the code fails later with an unrelated numpy error
        raise ValueError("df_filter selects no rows; nothing to plot")

    # data for plot,
    data_completness  = 100-np.array(data_examples.NaN_perc[df_filter]).flatten()
    top_data_examples = list(data_examples.examples[df_filter])
    group_description = list(data_examples.summary[df_filter])
    
    # float copy: integer percentages are not truncated when rescaled, 
    # and the data in top_val_perc are never modified
    top_values = np.array(top_val_perc.values[df_filter, :], dtype=float)

    # rescale top values, so they are part of non-missing data
    top_values = (top_values*data_completness[:, np.newaxis])/100
    all_remaining_values = data_completness-top_values.sum(axis=1)

    # join the data in one array: columns with top values + one column with the rest
    data_for_plot = np.round(np.c_[np.round(top_values,1), all_remaining_values],1)
    n_bars = data_completness.shape[0]
    
    

    # ............................................................
    # order the bars, and add an empty position between every group_size-th bar

    space_between_groups = 1
    bar_order    = np.arange(n_bars)
    group_bottom = [0]  # position of the first bar in each group
    add_to_list  = 0    # total space inserted below the current bar
    counter      = 0    # nr of bars already placed in the current group

    for i in range(n_bars):

        if i>0 and counter<group_size: 
            counter += 1       

        if counter==group_size:
            # current bar starts a new group, shifted by one more spacer
            group_bottom.append(i+add_to_list+1)
            counter = 0
            add_to_list += space_between_groups

        bar_order[i] = i+add_to_list

    group_top = [x+group_size-1 for x in group_bottom]    
    group_top[-1] = (n_bars-1)+add_to_list # the last group may be incomplete

        
        
    # ............................................................
    # barplot parameters
    bar_position         = bar_order + 1
    bar_related_fontsize = font_size
    
    # bar_names, (ytick labels): "<column name>, <dtype left-padded with dots>"
    dtype_names = pd.Series(list(data_examples.dtype[df_filter])).str.pad(width=20, side="left", fillchar=".")
    bar_names   = list(pd.Series(group_names).str.cat(dtype_names, sep=", "))
        
  

    # ............................................................   
    # barplot,

    #### prepare data and figure, 
    
    # Set style and colors, one color per segment, the last segment (remaining values) is grey
    plt.style.use("classic")
    bar_colors = plt.get_cmap(barplot_cmap)(np.linspace(cmap_from, cmap_to, data_for_plot.shape[1]))
    bar_colors[-1,0:3] = colors.to_rgb("lightgrey")

    # fig
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=fig_size, facecolor="white")
    fig.suptitle(plot_title)
    top_bar_position = np.max(bar_position)
    ax.set_ylim(0, top_bar_position+15)
    ax.set_xlim(0, 300)

    # add stacked segments, starting with the last column (remaining values),
    # each next segment starts where the previous one ended
    bar_start = np.zeros(n_bars)
    for i in reversed(range(data_for_plot.shape[1])):
        bar_end = data_for_plot[:,i] # lenght of an individual segment, not its end point on the plot
        ax = stacked_barh_one_level(
             f_ax=ax, 
             bar_pos=bar_position, top=bar_end, bottom=bar_start, 
             color=bar_colors[i], edgecolor=bar_colors[i], labels="test",
             )
        bar_start = bar_start+bar_end


            
    #### add legend on y axis        

    # Add ticks on y axis, and names for each bar,
    ax.set_yticks(bar_position)
    ax.set_yticklabels(bar_names, fontsize=bar_related_fontsize*fontsize_scale, color="black")
    ax.set_xticks([0, 25,50,75,100])
    ax.set_xticklabels(["0%", "25%", "50%", "75%", "100%"], fontsize=bar_related_fontsize*fontsize_scale, color="black")

    # Format ticks,
    ax.tick_params(axis='x', colors='black', direction='out', length=4, width=2) # tick only
    ax.tick_params(axis='y', colors='black', direction='out', length=4, width=2) # tick only    
    ax.yaxis.set_ticks_position('left')# shows only that
    ax.xaxis.set_ticks_position('bottom')# shows only that

    # Remove axes that are not needed, format the other ones,
    ax.spines['top'].set_visible(False) # remove ...
    ax.spines['right'].set_visible(False) # remove ...  
    ax.spines['bottom'].set_linewidth(2) # x axis width
    ax.spines['bottom'].set_bounds(0,100) # Now the x axis do not go under the legend
    ax.spines['left'].set_linewidth(2) # y axis width 

    # Add vertical grid lines at x ticks,
    ax.xaxis.grid(color='grey', linestyle='--', linewidth=1)

    # add white patch on top to hide surplus gridlines above the bars
    rect = mpatches.Rectangle(
        xy=(-100, top_bar_position+1.4), # add a bit to not cut text or boxes,
        width=500,
        height=500,
        linewidth=0,
        edgecolor='white',
        facecolor='white',
        alpha=1, 
        zorder=10
    )
    ax.add_patch(rect)
    ax.set_ylim(top=top_bar_position+1.4)

    # axes desciption    
    ax.set_xlabel(f"Percentage of non-missing data, rows in total={data_examples.nr_of_all_rows_in_original_df.iloc[0]}                           ", ha="right") # I intentionally, added these spaces here!
    ax.set_ylabel("Column name, datatype", ha="center")


    
    #### add, numbers and examples on a right side of the barplot, 

    # add rectagles around summary and examples of each group of bars
    for bottom, top in zip(group_bottom, group_top):
        rect = mpatches.Rectangle(
            xy=(113, bottom+0.2),
            width=186,
            height=top-bottom+1.5,
            linewidth=1,
            edgecolor="darkgreen",
            facecolor='yellow',
            alpha=0.3
        )
        ax.add_patch(rect)

    # add text with data completness next to each bar,
    row_totals = data_for_plot.sum(axis=1).tolist() # computed once, not in each iteration
    for i, row_total in enumerate(row_totals):
        ax.text(
            row_total+2, bar_position[i]-0.3,  
            f"{int(np.round(row_total,0))}%", 
            fontsize=bar_related_fontsize*fontsize_scale, color="darkred"
        )

    # Add table, with summary of each column,
    for i, summary_text in enumerate(group_description):
        ax.text(
            115, bar_position[i]-0.3,  
            summary_text, 
            fontsize=bar_related_fontsize*fontsize_scale, color="black"
        )

    # add examples,   
    for i, example in enumerate(top_data_examples):
        if re.search("all nonnull", str(group_description[i])):
            # f-string, so that non-string examples do not raise TypeError
            text_to_display = f"- - - > {example}"
        else: 
            text_to_display = example
        ax.text(
            170, bar_position[i]-0.3,  
            text_to_display, 
            fontsize=(bar_related_fontsize)*fontsize_scale*examples_fontsize_scale, color="black"
        )

        
        
    #### add plot legend, three boxes above the bars 
    box_color       = "yellowgreen"
    box_edge_color  = "darkgreen"
    text_color      = "black" 
    text_size       = bar_related_fontsize
    legend_boxes    = [
        (3,   '''BAR DESCRIPTION\n- each bar shows % of non-missing data in a given columns\n- Colour bars on top, shows the % of the most frequent classes'''),
        (115, '''FEATURE SUMMARY \n - numeric.: min; mean; max \n - string/time: nr of classes'''),
        (175, '''EXAMPLES of the most Frequent Non-Missing Values:\n - first: %of rows, with a given class, \n - second: class value, or the first 15-th characters'''),
    ]
    for text_x_position, text_to_display in legend_boxes:
        t = ax.text(
            text_x_position, top_bar_position+2.5,  
            text_to_display, 
            fontsize=text_size*fontsize_scale, color=text_color, ha="left"
        )
        t.set_bbox(dict(facecolor=box_color, alpha=1, edgecolor=box_edge_color))
    
    fig.subplots_adjust(top=adjust_top)
    plt.show()
    


 
    
# Function, ............................................................................    
def _default_barplot_layout(n_rows):
    """Return default (fig_size, group_size) of the barplot for a given number of bars."""
    if n_rows > 50:
        return (12, 22), 8
    if n_rows > 10:
        return (16, 16), 5
    return (12, 5), n_rows


def df_summary_plot(*, 
    # input data (either df or data_examples & top_values_perc)
    df=None,
    df_top_n=3,
    data_examples=None, 
    top_values_perc=None, 
    
    # options on data to display, 
    groups_to_display=None, 
    barPlot=True, # legacy term, 
                          
    # settings, 
    barPlot_figsize=None,
    barPlot_groupSize=None,
    barPlot_dct=None,
    verbose=False      
               
):
 
    """
        Plots annotated barplot(s) summarizing data in large dataFrame, 
        one barplot for each group in groups_to_display
        
        Parameters/Input              
        _________________   _______________________________________________________________________________  
        
        . Input .
        
        * df                DataFrame to summarize; used only if data_examples or top_values_perc 
                            is missing, then both are created with summarize_df()
        * df_top_n          int, passed to summarize_df() as nr_of_examples_per_category
        * data_examples     DataFrame with large Dataframe summary, 
                            with column "dtype" used to select the groups
        * top_values_perc   DataFrame with % of the most frequent records in each column 
                            of the summarized dataframe
        * groups_to_display str, or list with strings, eg: {"all", "text", "numeric", "datetime"}
                            "all", (default, also used with None), displays all rows in data_examples
                            any other value selects rows with that value in data_examples.dtype, 
                            each item in the list is plotted on a separate barplot
                
        . Parameters . 
        
        * barPlot           if True (default), the barplots are displayed, 
                            if False, nothing is plotted
        * barPlot_figsize   tuple, or None; if not None, replaces figure size that is selected 
                            automatically from the number of bars
        * barPlot_groupSize int, or None; if not None, replaces the number of bars in one group 
                            that is selected automatically from the number of bars
        * barPlot_dct       dict, or None; additional keyword arguments passed to annotated_barplot()
        * verbose           if True, prints message when a group has no rows, 
                            also passed to summarize_df()
                            
        Returns               
        _________________   _______________________________________________________________________________
        
        * None              barplots are displayed by annotated_barplot()
        
        Raises
        _________________   _______________________________________________________________________________
        
        * ValueError        if neither df, nor both data_examples and top_values_perc were provided
        
        Example
        _________________   _______________________________________________________________________________
        
        df_summary_plot(
            data_examples=data_examples, 
            top_values_perc=top_values_perc, 
            groups_to_display=['text', 'numeric', 'datetime']
        )
        
    """    

    # set up groups to be selected, always as a list
    if groups_to_display is None:
        groups_to_display = ["all"]
    elif isinstance(groups_to_display, str):
        groups_to_display = [groups_to_display]
    
    # None instead of mutable default argument
    if barPlot_dct is None:
        barPlot_dct = {}
    
    # prepare data directly from dataframe, or use summary df elements calculated separately, 
    if data_examples is None or top_values_perc is None: 
        if df is None:
            raise ValueError(
                "provide either df, or both data_examples and top_values_perc"
            )
        data_examples, _, top_values_perc = summarize_df(  
            df = df, 
            nr_of_examples_per_category = df_top_n,
            csv_file_name = None, 
            save_dir = None,
            verbose=verbose
        )

    if not barPlot:
        return

    # .. one barplot for each group, 
    for group_name in groups_to_display: 
        
        # filter the data; the filter shares the index with data_examples, 
        # so it can be used for label-based selection of rows
        if group_name=="all":
            df_filter = pd.Series(
                [True]*data_examples.shape[0], index=data_examples.index, dtype=bool
            )
        else:
            df_filter = data_examples['dtype']==group_name

        # test, if the given group was present:  
        n_selected = int(df_filter.sum())
        if n_selected==0:
            if verbose:
                print("- - -THERE WERE NO COLUMNS WITH THAT DATA TYPE IN SEARCHED DataFrame - - -", end="\n\n")
            continue
            
        # set size for the figure with barplot, and replace defaults if requested
        figSize, groupSize = _default_barplot_layout(n_selected)
        if barPlot_figsize is not None:
            figSize = barPlot_figsize
        if barPlot_groupSize is not None:
            groupSize = barPlot_groupSize
            
        ##    
        annotated_barplot(
            data_examples = data_examples, 
            top_val_perc  = top_values_perc, 
            df_filter     = df_filter, 
            fig_size      = figSize,
            group_size    = groupSize,
            **barPlot_dct
        )