# Understanding commit messages
### Using NLP Classification and Sentement analysis
- The goal is to gain insight to commit messages. More detail is explained in the specific sections below.


In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from scipy import stats
from scipy.stats import norm
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')

# nltk.download('stopwords') # Download stopwords
plt.rcParams["figure.figsize"] = [13,6]

#### Defining files paths and headers of release and revision csv files.

In [2]:
release_header = ['id', 'target', 'date', 'date_offset', 'name', 'comment', 'author']
revision_header = ['id', 'date', 'date_offset', 'committer_date', 'committer_date_offset', 'type', 'directory', 'message', 'author', 'committer']
# Path to release.csv, and whether to select uncompressed or ocmpressed version
revision_path = 'D:/data/open_source/revision.csv.gz'
release_path = 'D:/data/open_source/release.csv.gz'

In [3]:
def hex_str(str):
    return bytes.fromhex((str).replace('\\x','')).decode('utf-8')

def msg_str(msg):
    return bytes.fromhex(msg[2:]).decode(encoding='ISO-8859-1')

def date_str(str):
    return pd.to_datetime(str, format='%Y-%m-%dT%H:%M:%S.000Z')


In [4]:
def clean_commit_msg(msg):
    return re.sub('[^A-Za-z0-9 ]+', '', msg)
def clean_commit_msgs(msgs):
    return msgs.map(clean_commit_msg)

## Release Commit messages

In [5]:
release_df = pd.read_csv(release_path, names=release_header, converters={'comment':hex_str})
release_df = clean_commit_msgs( release_df['comment'] ).to_frame()
release_df.rename(columns={'comment':'message'}, inplace=True)
release_df['label'] = 'rel'

## Revision Commit messages

In [6]:
# read in revision csv
revision_df = pd.read_csv(revision_path, names=revision_header, converters={'message':msg_str})
# obtain just the message
revision_df = revision_df[['message']]
# clean messages
revision_df.loc[:,['message']] = clean_commit_msgs(revision_df['message']).to_frame()
# add 'revision' labels
revision_df['label'] = 'rev'
    

###### Checking that the import worked

In [7]:
revision_df.tail()

Unnamed: 0,message,label
5188989,Fixes for unary and indexing operations,rev
5188990,Revert Update CONTRIBUTINGmdThis reverts commi...,rev
5188991,Improve SEO tools CSS across themesbzr revid d...,rev
5188992,Update CONTRIBUTINGmdFixing broken issues link,rev
5188993,Only run CoalesceExtSubRegs when we can expect...,rev


In [8]:
release_df.tail()

Unnamed: 0,message,label
10957,Release 1424,rel
10958,Release 340b1,rel
10959,Tagging 11583 release,rel
10960,Tagging 11614 release,rel
10961,AppScale 300,rel


# Classification
- Difference between release and revision commit messages.
- Train NLP Classifier to distinguish between a revision and release commit message.
- Sentement analysis between revision and release commit messages.

#### Merge release and revision to a single dataframe

In [9]:
msgs_df = release_df.append(revision_df)

In [10]:
msgs_df

Unnamed: 0,message,label
0,Release for February 4th 2015,rel
1,tag version 0291,rel
2,Whats new in Tornado 30Mar 29 2013Highlights T...,rel
3,Windows build SickBeardwin32alphabuild469,rel
4,Fabric 116,rel
...,...,...
5188989,Fixes for unary and indexing operations,rev
5188990,Revert Update CONTRIBUTINGmdThis reverts commi...,rev
5188991,Improve SEO tools CSS across themesbzr revid d...,rev
5188992,Update CONTRIBUTINGmdFixing broken issues link,rev


#### Fix indexing issue

In [11]:
msgs_df = msgs_df.reset_index().drop('index', 1)

In [12]:
msgs_df

Unnamed: 0,message,label
0,Release for February 4th 2015,rel
1,tag version 0291,rel
2,Whats new in Tornado 30Mar 29 2013Highlights T...,rel
3,Windows build SickBeardwin32alphabuild469,rel
4,Fabric 116,rel
...,...,...
5199951,Fixes for unary and indexing operations,rev
5199952,Revert Update CONTRIBUTINGmdThis reverts commi...,rev
5199953,Improve SEO tools CSS across themesbzr revid d...,rev
5199954,Update CONTRIBUTINGmdFixing broken issues link,rev


In [13]:
 msgs_df

Unnamed: 0,message,label
0,Release for February 4th 2015,rel
1,tag version 0291,rel
2,Whats new in Tornado 30Mar 29 2013Highlights T...,rel
3,Windows build SickBeardwin32alphabuild469,rel
4,Fabric 116,rel
...,...,...
5199951,Fixes for unary and indexing operations,rev
5199952,Revert Update CONTRIBUTINGmdThis reverts commi...,rev
5199953,Improve SEO tools CSS across themesbzr revid d...,rev
5199954,Update CONTRIBUTINGmdFixing broken issues link,rev


### Create vectorizer and vectorize the messages

In [19]:
sample_size = 100000
release_size = release_df.shape[0]
revision_size = revision_df.shape[0]

#### Free up space for next processes

In [20]:
del revision_df
del release_df

In [21]:
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
msgs_vect = vectorizer.fit_transform( msgs_df[:sample_size]['message'] ).toarray()

In [22]:
msgs_vect

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Split the vectorized commit messages into training and testing datas.

In [23]:
x_train, x_test, y_train, y_test = train_test_split(msgs_vect, msgs_df[:sample_size]['label'], test_size=0.2, random_state=0)

### Using the Random Forest Calssifier, train the model.

In [24]:
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

### Using the trained model, predict using the testing data

In [25]:
predictions = text_classifier.predict(x_test)

In [26]:
text_classifier.predict(x_test)

array(['rel', 'rev', 'rev', ..., 'rev', 'rev', 'rev'], dtype=object)

### Print the Confusion Matrix, Classification Report, and the Accuracy Score

In [27]:
print("Confusion Matrix \n\n release revision \n", confusion_matrix(y_test,predictions), "\n\n")
print("Classification Report\n\n", classification_report(y_test,predictions),"\n\n")
print("Accuracy Score \n\n", accuracy_score(y_test, predictions), "\n\n")

Confusion Matrix 

 release revision 
 [[ 1791   396]
 [   84 17729]] 


Classification Report

               precision    recall  f1-score   support

         rel       0.96      0.82      0.88      2187
         rev       0.98      1.00      0.99     17813

    accuracy                           0.98     20000
   macro avg       0.97      0.91      0.93     20000
weighted avg       0.98      0.98      0.98     20000
 


Accuracy Score 

 0.976 




### Doing random sample to obseve

In [28]:
import random
s_i = random.randint(0,sample_size)
sample_x =  msgs_df['message'][s_i]
sample_y = msgs_df['label'][s_i]
sample_vect = msgs_vect[s_i]
sample_pred = text_classifier.predict([sample_vect])
sample_correct = "CORRECT" if sample_y == sample_pred  else "FALSE"

print("Message: '" + sample_x + "' --> Predicted: " + sample_pred + ", Expected: " + sample_y + ", " + sample_correct )

["Message: 'Adds support for versioned schema validation for microversions apiAdds the ability to specify minimum and maximum API microversionversions on jsonschema validation decorators Validation willonly occur throught the decorator if the incoming request versionmatches the version range specified If no range is specified thenvalidation will always occurPartially implements blueprint apimicroversionsChangeId Ia71963161966af3ca0e6e30e2245f12120f8f8d1' --> Predicted: rev, Expected: rev, CORRECT"]


### Sum all of the release vectors and revision vectors
##### Column wise, so to create a single vector that represents all of the rel / rev

In [29]:
rel_vect = msgs_vect[:10961].sum(axis=0)
rev_vect = msgs_vect[10961:-1].sum(axis=0)

### Creation a dictionary array. Holds each of the words used in the tokenizer and the index at which it is located.

In [30]:
# vectorizer.vocabulary_
dictionary = sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])
dictionary[-10:-1]

[('xml', 2490),
 ('yaml', 2491),
 ('year', 2492),
 ('yet', 2493),
 ('yield', 2494),
 ('yum', 2495),
 ('zero', 2496),
 ('zip', 2497),
 ('zone', 2498)]

### Create a list that holds the token and the value (~count)

In [31]:
rel_list = sorted([(dictionary[i][0],int(rel_vect[i])) for i, x in enumerate(rel_vect)], key=lambda x: -x[1])
rev_list = sorted([(dictionary[i][0],int(rev_vect[i])) for i, x in enumerate(rev_vect)], key=lambda x: -x[1])

### The top 15 most important tokens for release and revision

In [32]:
rel_list[:15]

[('release', 2812),
 ('pgp', 1783),
 ('version', 1499),
 ('tagging', 972),
 ('signature', 884),
 ('signatureversion', 603),
 ('gnupg', 591),
 ('tag', 344),
 ('android', 237),
 ('new', 199),
 ('build', 107),
 ('hotfix', 104),
 ('bump', 101),
 ('released', 97),
 ('hudson', 73)]

In [33]:
rev_list[:15]

[('update', 4763),
 ('merge', 3644),
 ('add', 2977),
 ('fix', 2947),
 ('link', 2260),
 ('commit', 2257),
 ('issues', 2213),
 ('broken', 2203),
 ('contributingmdfixing', 2153),
 ('revert', 2071),
 ('reverts', 1996),
 ('contributingmdthis', 1956),
 ('f4c7059a0b32e075aced9c578728f5f5ba819d5a', 1956),
 ('request', 1758),
 ('pull', 1737)]