In [1]:
# Imports
import json
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy
import scipy.stats
import seaborn as sns
# Plot settings for retina
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Filenames
f_name = 'single-proj_scenario.csv'  # input CSV read by load_pr_data()
augmented_f_name = 'single-proj_scenario_with_text.csv'  # written by fetch_pr_texts(), read back by load_local_prs_with_text()
out_f_name = 'pr-dataset.json'  # JSON file written by write_json()

In [3]:
# Create dataframe with PR data
def load_pr_data(pr_file: str = None, bots_file: str = 'bots.txt') -> pd.DataFrame:
    """Load the PR table and drop every row whose author is a known bot.

    Args:
        pr_file: Path of the CSV holding the PR data. Defaults to the
            notebook-level ``f_name``.
        bots_file: Path of a text file listing one bot account per line.

    Returns:
        An independent copy of the non-bot rows (original row labels are
        kept), so new columns can be added without a SettingWithCopyWarning.
    """
    if pr_file is None:
        pr_file = f_name
    # read_csv opens and closes the file itself; no extra handle is needed.
    raw_df = pd.read_csv(pr_file)
    with open(bots_file) as f:
        # Ignore blank lines and stray whitespace around the account names.
        bots = [line.strip() for line in f.read().splitlines() if line.strip()]
    return raw_df[~raw_df['Author'].isin(bots)].copy()

In [4]:
# Load the PR table; load_pr_data() already drops rows authored by the accounts listed in bots.txt.
raw_df = load_pr_data()

In [5]:
# Add empty "Text" column.
# assign() builds a new frame instead of writing into the filtered result of
# load_pr_data(), which avoids pandas' SettingWithCopyWarning.
raw_df = raw_df.assign(Text='')

In [6]:
# Handle API tokens

class APIToken:
    """Round-robin provider of GitHub API tokens.

    Tokens are never stored in the notebook. They are taken from the
    ``tokens`` argument or, when it is omitted, from the ``GITHUB_TOKENS``
    environment variable (a comma-separated list).

    SECURITY: any token that was ever committed in this notebook must be
    considered leaked and should be revoked on GitHub.
    """

    # Name of the environment variable holding the comma-separated tokens.
    ENV_VAR = 'GITHUB_TOKENS'

    def __init__(self, tokens=None):
        """Collect the tokens to rotate through.

        Args:
            tokens: Optional iterable of token strings. When ``None`` the
                tokens are read from the ``GITHUB_TOKENS`` environment variable.

        Raises:
            ValueError: If no non-empty token is available.
        """
        if tokens is None:
            tokens = os.environ.get(self.ENV_VAR, '').split(',')
        # Drop empty entries such as the ones produced by a trailing comma.
        self.tokens = [t.strip() for t in tokens if t and t.strip()]
        if not self.tokens:
            raise ValueError(
                f"No GitHub API token available: pass tokens explicitly or set "
                f"the {self.ENV_VAR} environment variable (comma-separated)."
            )
        self.token_index = 0

    def next(self):
        """Return the current token and advance to the next one, wrapping around."""
        tok = self.tokens[self.token_index]
        self.token_index = (self.token_index + 1) % len(self.tokens)
        return tok

In [7]:
# Ask GitHub api for PR text

# Construct API url
def get_api_url(repo_url: str, pr_num: int) -> str:
    """Build the GitHub REST API URL of a pull request.

    Args:
        repo_url: Repository URL of the form ``https://github.com/<owner>/<repo>``.
            Surrounding whitespace and trailing slashes are tolerated.
        pr_num: Number of the pull request inside that repository.

    Returns:
        ``https://api.github.com/repos/<owner>/<repo>/pulls/<pr_num>``.
    """
    prefix = "https://github.com/"
    # Strip slashes so "…/owner/repo/" does not produce "owner/repo//pulls/…".
    owner_repo = repo_url.strip()[len(prefix):].strip("/")
    return f"https://api.github.com/repos/{owner_repo}/pulls/{pr_num}"

# Request pr and return PR title and body
def get_pr_text(full_url: str, tok: APIToken, retries: int) -> str:
    """Fetch one pull request from the GitHub API and return its text.

    Args:
        full_url: API URL of the pull request (see ``get_api_url``).
        tok: Token provider; one token is consumed per attempt.
        retries: How many more attempts may be made after this one when the
            connection fails or the server answers with ``Retry-After``.

    Returns:
        ``"<title>\\n<body>"`` of the pull request (a missing title or body
        counts as an empty string), or ``''`` when the request ultimately fails.
    """
    headers = {
        'Accept': "application/vnd.github.v3+json",
        # GitHub no longer accepts tokens in the `access_token` query
        # parameter; they have to be sent in the Authorization header.
        'Authorization': f"token {tok.next()}",
    }
    try:
        # A timeout keeps one stalled connection from blocking the whole run.
        r = requests.get(url=full_url, headers=headers, timeout=30)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # requests raises its own exception classes, not the builtin ConnectionError.
        if retries > 0:
            print(f"\tConnection error in request for {full_url}. Retrying ({retries} retries left).")
            time.sleep(10)
            return get_pr_text(full_url, tok, retries - 1)
        print(f"\tConnection error in request for {full_url}. Returning ''.")
        return ''
    except requests.exceptions.RequestException as err:
        print(f"\tUnknown error in request for {full_url}: {err!r}. Returning ''.")
        return ''

    if r.status_code != 200:
        print(f"\tError in request: {r.text}")
        retry_after = r.headers.get('Retry-After')
        # Only wait and retry while the retry budget lasts (no unbounded recursion).
        if retry_after is not None and retries > 0:
            try:
                # Header values are strings; time.sleep needs a number.
                delay = max(0.0, float(retry_after))
            except ValueError:
                # Retry-After may also be an HTTP date; fall back to a fixed wait.
                delay = 60.0
            print(f"\tRetrying after: {retry_after}")
            time.sleep(delay)
            return get_pr_text(full_url, tok, retries - 1)
        print(f"\tRate limit remaining: {r.headers.get('X-RateLimit-Remaining', 'unknown')}")
        return ''

    try:
        j = r.json()
    except ValueError:
        print(f"\tInvalid JSON in response for {full_url}. Returning ''.")
        return ''
    # The API returns null for the body of a PR without description.
    title = j.get('title') or ''
    body = j.get('body') or ''
    return title + '\n' + body
    
def fetch_pr_texts(_df: pd.DataFrame, limit: int = 4001) -> pd.DataFrame:
    """Download title and body for the PRs in ``_df`` and store them in ``Text``.

    The result is also written to ``augmented_f_name`` as CSV (without index).

    Args:
        _df: Frame with at least the columns ``Repo`` (repository URL) and
            ``PR_Number``. It is not modified; the work happens on a copy.
        limit: Maximum number of rows (taken from the top) to request.
            ``None`` requests every row. The default of 4001 matches the
            number of rows this notebook has always requested.

    Returns:
        A copy of ``_df`` whose ``Text`` column holds the retrieved text for
        the requested rows (``''`` where a request failed).
    """
    api_token = APIToken()
    prs = _df.copy()
    if 'Text' not in prs.columns:
        prs['Text'] = ''
    text_col = prs.columns.get_loc('Text')
    n_rows = len(prs) if limit is None else min(max(limit, 0), len(prs))
    # Iterate by position: after filtering, row labels are no longer 0..n-1,
    # so label-based .loc[i] would hit missing keys.
    for i in range(n_rows):
        row = prs.iloc[i]
        url = get_api_url(row['Repo'], row['PR_Number'])
        print(f"{i}: Requesting {url}")
        prs.iloc[i, text_col] = get_pr_text(url, api_token, retries=5)
    prs.to_csv(augmented_f_name, index=False)
    return prs

In [8]:
def load_local_prs_with_text() -> pd.DataFrame:
    """Read the CSV at ``augmented_f_name`` and keep only rows whose ``Text`` is present."""
    with open(augmented_f_name) as csv_file:
        prs = pd.read_csv(csv_file)
    has_text = prs['Text'].notna()
    return prs[has_text]

In [10]:
# Either load the locally saved file, or re-fetch the PR texts from GitHub
# (uncomment the fetch_pr_texts line below to query the API again).

# df = fetch_pr_texts(raw_df)
df = load_local_prs_with_text()

In [None]:
def add_text_len(_df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame equal to ``_df`` plus a ``TextLen`` column.

    ``TextLen`` is the string length of the corresponding ``Text`` entry.
    """
    text_lengths = _df['Text'].str.len()
    return _df.assign(TextLen=text_lengths)

In [None]:
# Add a TextLen column holding the length of each PR text.
textlen_df = add_text_len(df)

In [None]:
# Preview the first rows, including the new TextLen column.
textlen_df.head()

In [None]:
def textlen_distribution(_df: pd.DataFrame, title: str):
    """Draw a horizontal box plot of the ``TextLen`` column with the given title.

    Also sets matplotlib's global default figure size to 16x10.
    """
    plt.rcParams["figure.figsize"] = [16, 10]
    lengths_only = _df[['TextLen']]
    axes = sns.boxplot(data=lengths_only, orient="h")
    axes.set_title(title)
    
# Box plot of the text lengths before any outlier filtering.
textlen_distribution(textlen_df, 'Distribution of raw PR text length')

In [None]:
def remove_outliers(_df: pd.DataFrame, use_z_score: bool = True,
                    z_threshold: float = 3.0, iqr_factor: float = 1.5) -> pd.DataFrame:
    """Drop the rows whose ``TextLen`` is an outlier.

    Args:
        _df: Frame with a numeric ``TextLen`` column.
        use_z_score: If True, a row is an outlier when the absolute z-score of
            its ``TextLen`` is at least ``z_threshold``. Otherwise the IQR rule
            is used: outliers lie below ``Q1 - iqr_factor * IQR`` or above
            ``Q3 + iqr_factor * IQR``.
        z_threshold: Cut-off for the z-score rule.
        iqr_factor: Whisker length for the IQR rule, in multiples of the IQR.

    Returns:
        The rows of ``_df`` that are not outliers (row labels unchanged).
    """
    lengths = _df['TextLen']
    if use_z_score:
        # A constant column has zero spread, which makes every z-score NaN.
        with np.errstate(divide='ignore', invalid='ignore'):
            abs_z_scores = np.abs(scipy.stats.zscore(np.asarray(lengths, dtype=float)))
        # NaN compares False here, so rows without a usable z-score are kept
        # instead of the whole frame being thrown away.
        is_outlier = abs_z_scores >= z_threshold
        return _df[~is_outlier]
    q1 = lengths.quantile(0.25)
    q3 = lengths.quantile(0.75)
    iqr = q3 - q1
    is_outlier = (lengths < (q1 - iqr_factor * iqr)) | (lengths > (q3 + iqr_factor * iqr))
    return _df[~is_outlier]

In [None]:
def inspect_long_text(_df: pd.DataFrame, above: int = 2000):
    """Print every ``Text`` whose ``TextLen`` exceeds ``above``.

    Each text is preceded by a line of 100 underscores as separator.

    Args:
        _df: Frame with the columns ``Text`` and ``TextLen``.
        above: Only texts strictly longer than this are printed.
    """
    # Iterate over the values directly: Series.iteritems() no longer exists
    # in pandas 2.x and the index was never used.
    for text in _df.loc[_df['TextLen'] > above, 'Text']:
        print('_' * 100)
        print(text)
    
# Print the texts longer than the default 2000 characters that survive the z-score outlier filter.
inspect_long_text(remove_outliers(textlen_df))

In [None]:
def remove_short_and_dirty_text(_df: pd.DataFrame, min_len: int = 50,
                                dirty_text=None) -> pd.DataFrame:
    """Drop rows whose text is too short or contains a known noise marker.

    Args:
        _df: Frame with the columns ``Text`` and ``TextLen``.
        min_len: Rows with ``TextLen`` below this value are removed. Rows
            without a length (missing text) are removed as well.
        dirty_text: Literal substrings that mark a text as unusable. Defaults
            to the markers used in this notebook.

    Returns:
        The remaining rows with a fresh 0..n-1 index.
    """
    if dirty_text is None:
        dirty_text = ['Documentação', 'Motivação', ': java.lang.']
    # The markers are literal text: escape them so that e.g. the dots in
    # ': java.lang.' do not act as regex wildcards.
    pattern = '|'.join(re.escape(marker) for marker in dirty_text)
    # na=False keeps the mask strictly boolean when a Text value is missing.
    is_text_dirty = _df['Text'].str.contains(pattern, regex=True, na=False)
    is_long_enough = (_df['TextLen'] >= min_len)
    return _df[~is_text_dirty & is_long_enough].reset_index(drop=True)

In [None]:
# Drop length outliers with the IQR rule (use_z_score=False), then remove short and dirty texts.
clean_df = remove_short_and_dirty_text(remove_outliers(textlen_df, False))

In [None]:
# Print the cleaned texts that are longer than 500 characters.
inspect_long_text(clean_df, above=500)

In [None]:
# Box plot of the text lengths after cleaning.
textlen_distribution(clean_df, 'Distribution of PR text length (outliers removed)')

In [None]:
# Summary statistics of the cleaned frame.
clean_df.describe()

In [None]:
# Write as JSON file
def write_json(_df: pd.DataFrame):
    """Write ``_df`` to ``out_f_name`` as a JSON list of records.

    Before writing, the five columns listed below are renamed; ``_df`` itself
    is left unchanged.
    """
    json_column_names = {
        'Merged?' : 'IsMerged',
        'Senti4SDAvg-General' : 'Senti4SDAvgGeneral',
        'Senti4SDAvg-SourceCode' : 'Senti4SDAvgSourceCode',
        'SentiSSEAvg-General' : 'SentiSSEAvgGeneral',
        'SentiSSEAvg-SourceCode' : 'SentiSSEAvgSourceCode',
    }
    renamed = _df.rename(columns=json_column_names)
    renamed.to_json(out_f_name, orient='records')
    
# Export the cleaned PRs to the JSON file named by out_f_name.
write_json(clean_df)

In [None]:
# Show column dtypes and non-null counts of the cleaned frame.
clean_df.info()