# N-grams

Can we run the ngram code on the newly scraped projects, and what kind of results do we get. 

In [23]:
%load_ext autoreload
%autoreload 2

from pymongo import MongoClient

mongo_client = MongoClient()

database = mongo_client["graduation"]

pull_requests_collection = database["pull_requests"]

projects_collection = database["projects"]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import analysis_utilities

projects = list(projects_collection.find({'succeeded' : True, 'status_travis_date' : {"$exists" : True}}))

print("Found {} projects with a travis build status date".format(len(projects)))

Found 75 projects with a travis build status date


In [25]:
import ngramizer
from collections import Counter

def is_bot_comment(comment):
    BOT_NAMES = ["coveralls", "codecov-io", "slnode", "pep8speaks", "rh-atomic-bot", "cesium-concierge", "azurecla", "greenkeeperio-bot", "msftclas"]
    
    return comment["user"]["login"] in BOT_NAMES


In [26]:
ngram_length= 2

for project in projects[:15]:    
    prs = list(pull_requests_collection.find({'project_name': project["full_name"].split("/")[1],'project_owner': project["full_name"].split("/")[0]}))
    
    prs_before, prs_after = analysis_utilities.split_prs_on_build_date(project, prs, True)
    
    if len(prs_before) < 100 or len(prs_after) < 100:
        continue
        
    usernames = []
    
    for pr in prs_before:
        
        usernames.extend(ngramizer.given_text_extract_usernames(pr["body"]))
        
        usernames.append(pr["user"]["login"])
        
        for comment in pr["raw_comments"]:
            usernames.extend(ngramizer.given_text_extract_usernames(comment["body"]))
            usernames.append(comment["user"]["login"])
            
    for pr in prs_after:
        
        usernames.append(pr["user"]["login"])
        
        usernames.extend(ngramizer.given_text_extract_usernames(pr["body"]))
        
        for comment in pr["raw_comments"]:
            usernames.append(comment["user"]["login"])
            usernames.extend(ngramizer.given_text_extract_usernames(comment["body"]))
            
    usernames = list(set(usernames))
            
    project["usernames"] = usernames
        
    counter_pr_before = Counter()
    counter_pr_after = Counter()
    
    linkback_pr_before = {}
    linkback_pr_after = {}
    
    for pr in prs_before:
        if not ngramizer.is_bot_comment(pr["user"]["login"]):
            ngramizer.add_text_ngrams_to_counter(pr["body"], pr.get("html_url", ""), ngram_length, counter_pr_before, linkback_pr_before, project["usernames"])
        
        for comment in pr["raw_comments"]:
            if not ngramizer.is_bot_comment(comment["user"]["login"]):
                ngramizer.add_text_ngrams_to_counter(comment["body"], comment.get("html_url", ""), ngram_length, counter_pr_before, linkback_pr_before, project["usernames"])
            
    for pr in prs_after:
        if not ngramizer.is_bot_comment(pr["user"]["login"]):
            ngramizer.add_text_ngrams_to_counter(pr["body"], pr.get("html_url", ""), ngram_length, counter_pr_after, linkback_pr_after, project["usernames"])
        
        for comment in pr["raw_comments"]:
            if not ngramizer.is_bot_comment(comment["user"]["login"]):
                ngramizer.add_text_ngrams_to_counter(comment["body"], comment.get("html_url", ""), ngram_length, counter_pr_after, linkback_pr_after, project["usernames"])
                
    project["counter_pr_before"] = counter_pr_before
    project["counter_pr_after"] = counter_pr_after
    
    project["linkback_pr_before"] = linkback_pr_before
    project["linkback_pr_after"] = linkback_pr_after

In [18]:
for project in projects[:15]:
    
    if "counter_pr_before" not in project:
        continue
        
    print("{}".format(project["full_name"]))
    print("\tBefore:")
    
    total_before = len(project["counter_pr_before"].values())
    total_after = len(project["counter_pr_after"].values())
    
    for item in project["counter_pr_before"].most_common(10):
        print("{} - {:.0f}/{:.3f}".format(item[0], item[1], item[1] / total_before * 100))
        
    print("\tAfter:")
    for item in project["counter_pr_after"].most_common(10):
        print("{} - {:.0f}/{:.3f}".format(item[0], item[1], item[1] / total_after * 100))

Leaflet/Leaflet
	Before:
('M_USERNAME', 'M_USERNAME') - 147/1.088
('M_ICODE', 'M_ICODE') - 46/0.341
('use', 'case') - 25/0.185
('call', 'M_ICODE') - 19/0.141
('zoom', 'level') - 18/0.133
('think', 'us') - 17/0.126
('look', 'like') - 16/0.118
('someth', 'like') - 15/0.111
('seem', 'like') - 15/0.111
('layer', 'control') - 13/0.096
	After:
('M_USERNAME', 'M_USERNAME') - 345/3.305
('M_ICODE', 'M_ICODE') - 81/0.776
('use', 'M_ICODE') - 30/0.287
('x', 'M_ICODE') - 24/0.230
('look', 'like') - 23/0.220
('M_ICODE', 'method') - 16/0.153
('make', 'sure') - 16/0.153
('M_ICODE', 'event') - 16/0.153
('M_USERNAME', 'import') - 15/0.144
('call', 'M_ICODE') - 14/0.134
RIOT-OS/RIOT
	Before:
('M_USERNAME', 'M_MENTION') - 578/2.783
('M_USERNAME', 'M_USERNAME') - 302/1.454
('M_ICODE', 'M_ICODE') - 203/0.977
('M_USERNAME', 'M_ICODE') - 104/0.501
('M_ICODE', 'M_USERNAME') - 57/0.274
('M_URL', 'M_USERNAME') - 53/0.255
('use', 'M_ICODE') - 53/0.255
('M_MENTION', 'M_USERNAME') - 45/0.217
('M_MENTION', 'pleas')

In [19]:
for project in projects[:15]:
    
    if "counter_pr_before" not in project:
        continue
    
    counter_pr_before = project["counter_pr_before"]
    counter_pr_after = project["counter_pr_after"]
    
    total_before = len(counter_pr_before.values())
    total_after = len(counter_pr_after.values())
    
    keys_before = counter_pr_before.keys()
    keys_after =  counter_pr_after.keys()
    
    for key in keys_before:
        continue
        counter_pr_before[key] = counter_pr_before[key] / total_before * 100
        
    for key in keys_after:
        continue
        counter_pr_after[key] = counter_pr_after[key] / total_after * 100
    
    delta_b_to_a = keys_before - keys_after
    delta_a_to_b = keys_after - keys_before
    
    
    b_to_a_counter = Counter()
    a_to_b_counter = Counter()
    
    for elm in delta_b_to_a:
        b_to_a_counter[elm] = counter_pr_before[elm]
        
    for elm in delta_a_to_b:
        a_to_b_counter[elm] = counter_pr_after[elm]
        
    project["counter_pr_b_to_a"] = b_to_a_counter
    project["counter_pr_a_to_b"] = a_to_b_counter

In [20]:
for project in projects[:15]:
    
    if "counter_pr_before" not in project:
        continue
    
    print("{}".format(project["name"]))
    print("\tDissapeared:")
    
    total_before = len(project["counter_pr_before"].values())
    total_after = len(project["counter_pr_after"].values())
    
    for item in project["counter_pr_b_to_a"].most_common(10):
        print("{} - {:.0f}/{:.3f}".format(item[0], item[1], item[1] / total_before * 100))
        
    print("\tAppeared:")
    for item in project["counter_pr_a_to_b"].most_common(10):
        print("{} - {:.0f}/{:.3f}".format(item[0], item[1], item[1] / total_after * 100))

Leaflet
	Dissapeared:
('icon', 'size') - 10/0.074
('pleas', 'remov') - 8/0.059
('googl', 'map') - 7/0.052
('scale', 'control') - 6/0.044
('one', 'space-separ') - 6/0.044
('M_VERSION_NUMBER', 'px') - 6/0.044
('M_MENTION', "'ve") - 6/0.044
('us', 'nice') - 5/0.037
('think', 'may') - 5/0.037
('us', 'problem') - 5/0.037
	Appeared:
('x', 'M_ICODE') - 24/0.230
('M_USERNAME', 'import') - 15/0.144
('make', 'M_ICODE') - 9/0.086
('—', 'extend') - 9/0.086
('fraction', 'zoom') - 9/0.086
('\\n', 'M_USERNAME') - 7/0.067
('M_ICODE', 'test') - 6/0.057
('1', 'M_USERNAME') - 6/0.057
('reli', 'M_ICODE') - 6/0.057
('l.latlng', '0') - 6/0.057
RIOT
	Dissapeared:
('iter', 'per') - 18/0.087
('per', 'second') - 16/0.077
('success', 'build') - 13/0.063
('rout', 'protocol') - 9/0.043
('doxygen', 'comment') - 7/0.034
('nativ', 'msba2') - 7/0.034
('use', 'static') - 7/0.034
('default', 'project') - 7/0.034
('dev', 'meet') - 7/0.034
('static', 'array') - 6/0.029
	Appeared:
('descript', 'M_USERNAME') - 111/0.280
('c