# GitHub Comments Pull_Request

In [1]:
##Startng by importing the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
import spacy
from time import time
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans

In [2]:
##Let's Load the dataset
df = pd.read_csv('C:/Users/Grand/Downloads/github_comments.tsv', sep='\t', 
                                     parse_dates = ['comment_date',
                                                    'merged_at'
                                                   ],
                                     encoding="utf-8-sig")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,comment,comment_date,is_merged,merged_at,request_changes
0,0,@chef/maintainers,2017-05-29 10:54:23,0.0,NaT,0
1,1,omnibus builds are already in pain so merging ...,2017-05-25 20:34:45,1.0,2017-05-25 20:34:27,0
2,2,@chef/client-core reviewage (lets not think to...,2017-05-25 17:21:56,1.0,2017-05-25 19:05:10,0
3,3,This blows up 14.04 because allow-downgrades i...,2017-05-25 19:15:14,0.0,NaT,1
4,4,I know @lamont-granquist and @thommay are goi...,2017-05-23 22:58:13,1.0,2017-05-24 04:40:42,0


In [4]:
##Filling the NAN values with 0 
df.fillna(0, inplace=True)

In [5]:
##To distinguish if the change has been made after a request or not; we can know for sure if it blocking
##The Pull Request(PR) or not.
change_request_comments= df[df['request_changes'].isin([True])]
no_change_request_comments = df[df['request_changes'].isin([False])]
blocking_PR = change_request_comments[change_request_comments['is_merged'].isin([False])]
not_blocking_PR = no_change_request_comments[no_change_request_comments['is_merged'].isin([True])]

# What are the most common problems that appear in these comments?

In [6]:
#We need to find the most commun topic

In [7]:
#Removing @Tag
df.comment = df['comment'].apply(lambda x:re.sub('@[\w.$/-]+','',x))

In [8]:
#lower-case
df['comment'] = df['comment'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['comment'].head()

0                                                     
1    omnibus builds are already in pain so merging ...
2    reviewage (lets not think too hard about the b...
3    this blows up 14.04 because allow-downgrades i...
4    i know and are going to be unhappy about this ...
Name: comment, dtype: object

In [9]:
#Removing Punctuation-modified
df['comment'] = df['comment'].str.replace('[^\w\s.]','')
df['comment'].head()

0                                                     
1    omnibus builds are already in pain so merging ...
2    reviewage lets not think too hard about the br...
3    this blows up 14.04 because allowdowngrades is...
4    i know and are going to be unhappy about this ...
Name: comment, dtype: object

In [10]:
df.comment[10]

'the  hack might be too much would be cool if everyone else wants to just do the fancy  operator and skip the case magic.'

In [11]:
nlp = spacy.load('en')
def findingTopics(row): 
    issue = nlp(row)
    return [chunk.root.lemma_ for chunk in issue.noun_chunks 
             if (lambda c: c.root.pos_ not in ['PRON'] and  
                 c.root.dep_ not in ['ROOT'] and
                 c.root.ent_type_ not in ['DATE','TIME','WORK_OF_ART','QUANTITY'] and
                 re.search('[^a-zA-Z\d\s:]',c.root.lemma_) is None)(chunk)]

In [28]:
blocking_PR['comment'].apply(findingTopics)

3              [downgrade, version, ubuntu, debian, flag]
20      [problem, stub, line, argument, case, box, fip...
29      [what, problem, problem, change, doc, doc, mem...
30                            [thing, user, depth, error]
32                                    [chef, deprecation]
37                                               [server]
48      [type, compatibility, type, xml, xml, applicat...
50      [check, part, xml, xml, type, thing, image, ap...
52                                   [pr, change, detail]
60      [pr, constructor, constructor, thing, constrai...
61      [apology, issue, exceptionininitializererror, ...
62      [sense, missingresourceexception, description,...
67                        [file, update, test, operation]
68           [something, update, dungeon, commit, ending]
72                                                     []
73      [macro, docs, macro, tradition, name, asciidoc...
75                                                [fetch]
76            

In [14]:
##Sentence vectorization
vectorizer = TfidfVectorizer( max_features=100,
                              analyzer = findingTopics,
                              max_df = 5,
                              stop_words='english',
                              vocabulary = None,
                              use_idf=True)
blocking_PR_List = blocking_PR['comment'].tolist()
X = vectorizer.fit_transform(blocking_PR_List)
#LSA :(Latent semantic analysis)
svd = TruncatedSVD(70)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
terms = vectorizer.get_feature_names()
weights =  X.sum(axis=0)
weightIndices = weights.argsort()[::-1]
[terms[i] for i in weightIndices[:20]] 

['clidocumentation',
 'aspect',
 'client',
 'area',
 'byte',
 'backend',
 'artifact',
 'behaviour',
 'entry',
 'block',
 'cassandra',
 'course',
 'enum',
 'break',
 'expiringsession',
 'answer',
 'group',
 'intent',
 'call',
 'improvement']

# Can we cluster the problems by topic/problem type?

In [15]:
##Clustering

In [16]:
## Let's use the kmeans minibatch
km = MiniBatchKMeans(n_clusters=50, init='k-means++', n_init=1,
                     init_size=100, batch_size=100, verbose=False)
km.fit(X)
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
clusterLabels = [ [terms [i] for i in indexes] for indexes in [ cIndex[:5] for cIndex in order_centroids ]]

In [25]:
#Example
issueIdx = 4
print("the cluster for blocking_PR: '{0}' is '{1}".format(blocking_PR_List[issueIdx],clusterLabels[km.labels_[issueIdx]]))

the cluster for blocking_PR: 'This makes me nervous as a structural thing. If a user requests a depth, we should probably honor that or error clearly.' is '['cassandra', 'improvement', 'entry', 'setter', 'separator']


# How long is the resolution time after a change was requested?

In [27]:
## We will compute the mean time differences between PR merged after a comment and comment date 

merged_LastComment = not_blocking_PR[not_blocking_PR.apply(lambda row: row['merged_at'] >= row['comment_date'],  axis=1)]
changes = merged_LastComment.apply(lambda row:row['merged_at'] - row['comment_date'], axis=1)
print("The resolution time after a change was requested is {0}".format(changes.mean()))

The resolution time after a change was requested is 3 days 05:11:29.036334
