# Analysis of readability of PR descriptions and acceptance time

## Imports and loading data

In [None]:
# Imports
import pandas as pd
import numpy as np
import scipy
import json
import seaborn as sns
import matplotlib.pyplot as plt
# Plot settings for retina
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Create dataframe with PR data
def load_data(f_name: str = 'raw_data.json') -> pd.DataFrame:
    """Load the PR records from a JSON file into a flat dataframe.

    Nested fields are flattened with ``pd.json_normalize``, a ``textLen``
    column holding the character count of ``text`` is added, and the
    readability metrics are renamed to their short codes.

    Args:
        f_name: Path of the JSON file to read. Defaults to
            ``'raw_data.json'``, the file name that used to be hard-coded.

    Returns:
        A dataframe restricted to the columns ``_id``, ``text``, ``textLen``,
        ``closingTime``, ``isMerged``, ``FRE``, ``FKG``, ``DCS`` and ``PRDS``
        (in that order).

    Raises:
        KeyError: If ``text`` or any of the selected columns is missing from
            the loaded data.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(f_name, encoding='utf-8') as f:
        data = json.load(f)

    # The file handle is no longer needed once the JSON has been parsed.
    df = pd.json_normalize(data)
    df['textLen'] = df['text'].str.len()

    # json_normalize names nested fields "<parent>.<child>".
    short_names = {
        'readability.daleChallScore': 'DCS',
        'readability.fleschKincaidGradeLevel': 'FKG',
        'readability.fleschReadingEase': 'FRE',
        'readability.pullRequestDomainScore': 'PRDS',
    }
    cols = ['_id', 'text', 'textLen', 'closingTime', 'isMerged', 'FRE', 'FKG', 'DCS', 'PRDS']
    return df.rename(columns=short_names)[cols]

In [None]:
# Load the PR data and show the resulting dataframe as the cell output
df = load_data()
df

In [None]:
def remove_low_values(_df: pd.DataFrame, *, min_text_len: int = 50,
                      min_closing_time: float = 30) -> pd.DataFrame:
    """Drop rows whose text length or closing time is below a threshold.

    A row is removed when ``textLen`` is below ``min_text_len`` or
    ``closingTime`` is below ``min_closing_time``; values equal to a
    threshold are kept. The input dataframe is not modified.

    Args:
        _df: Dataframe with ``textLen`` and ``closingTime`` columns.
        min_text_len: Smallest ``textLen`` that is kept (default 50).
        min_closing_time: Smallest ``closingTime`` that is kept (default 30).

    Returns:
        A new dataframe with the remaining rows and a fresh 0..n-1 index.
    """
    low_text_len = _df['textLen'] < min_text_len
    low_closing_time = _df['closingTime'] < min_closing_time
    # Keep a row only when neither of the two "too low" conditions holds.
    return _df[~(low_text_len | low_closing_time)].reset_index(drop=True)

In [None]:
def remove_non_merged(_df: pd.DataFrame) -> pd.DataFrame:
    """Return only the rows whose ``isMerged`` flag is set, re-indexed from 0."""
    merged_mask = _df['isMerged']
    merged_only = _df[merged_mask]
    return merged_only.reset_index(drop=True)

In [None]:
# Apply the low-value filter first, then keep only the merged PRs
clean_df = remove_non_merged(remove_low_values(df))

In [None]:
# Preview the first rows of the cleaned data
clean_df.head()

In [None]:
# Summary statistics of the cleaned data
clean_df.describe()

In [None]:
def distribution_of_col(_df: pd.DataFrame, col: "str | list[str]", fname: "str | None" = None):
    """Draw a horizontal box plot of one or several columns.

    Args:
        _df: Dataframe holding the data to plot.
        col: A single column name, or a list of column names to plot
            together in one figure.
        fname: Optional file name; when given, the figure is also saved as
            ``figures/<fname>`` (the directory is created if it is missing).

    Note:
        Sets the global ``figure.figsize`` rcParam to 16x5.
    """
    import os  # only needed for creating the output directory

    plt.rcParams["figure.figsize"] = [16,5]
    # A list of columns would otherwise show up in the title as its raw repr.
    label = col if isinstance(col, str) else ", ".join(map(str, col))
    sns.boxplot(orient="h", data=_df[col]).set_title(f"Distribution of {label}", fontsize=15)
    if fname:
        out_path = f'figures/{fname}'
        # savefig does not create missing directories itself.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        plt.savefig(out_path, bbox_inches='tight')
    plt.show()

In [None]:
# Box plot of the FRE column, also saved to the figures directory
distribution_of_col(clean_df, 'FRE', 'fre-distribution.png')

In [None]:
# Box plots of the FKG, DCS and PRDS columns in a single figure
distribution_of_col(clean_df, ['FKG', 'DCS', 'PRDS'], 'fkg-dcs-prds-distribution.png')

In [None]:
# Scatterplot of two columns, with the first one on a logarithmic x-axis
def scatterplot(_df: pd.DataFrame, col1: str, col2: str, fname: "str | None" = None):
    """Draw a scatter plot of ``col2`` against ``col1`` with a log-scaled x-axis.

    Args:
        _df: Dataframe holding both columns.
        col1: Column plotted on the (logarithmic) x-axis.
        col2: Column plotted on the y-axis.
        fname: Optional file name; when given, the figure is also saved as
            ``figures/<fname>`` (the directory is created if it is missing).

    Note:
        Sets the global ``figure.figsize`` rcParam to 16x7.
    """
    import os  # only needed for creating the output directory

    plt.rcParams["figure.figsize"] = [16,7]
    fig = plt.figure()
    # (left, bottom, width, height) in figure fractions: axes fill the figure.
    rect = 0,0,1,1
    log_ax = fig.add_axes(rect)
    # The x-scale is set before seaborn draws onto the axes.
    log_ax.set_xscale("log")
    ax = sns.scatterplot(x=col1, y=col2, data=_df, ax = log_ax)
    ax.set_title(f'Relationship between {col1} and {col2}', fontsize=20)
    if fname:
        out_path = f'figures/{fname}'
        # savefig does not create missing directories itself.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        plt.savefig(out_path, bbox_inches='tight')
    plt.show()

In [None]:
# closingTime (log x-axis) against the FRE column
scatterplot(clean_df, 'closingTime', 'FRE', 'prs-closingtime-fre.png')

In [None]:
# closingTime (log x-axis) against the FKG column
scatterplot(clean_df, 'closingTime', 'FKG', 'prs-closingtime-fkg.png')

In [None]:
# closingTime (log x-axis) against the DCS column
scatterplot(clean_df, 'closingTime', 'DCS', 'prs-closingtime-dcs.png')

In [None]:
# closingTime (log x-axis) against the PRDS column
scatterplot(clean_df, 'closingTime', 'PRDS', 'prs-closingtime-prds.png')