# Understanding commit messages
### Using NLP Classification and Sentiment analysis
- The goal is to gain insight into commit messages. More detail is given in the specific sections below.


In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from scipy import stats
from scipy.stats import norm
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')

# nltk.download('stopwords') # Download stopwords
plt.rcParams["figure.figsize"] = [13,6]

#### Defining file paths and headers of the release and revision csv files.

In [2]:
# Column names applied to the release / revision CSV files when they are read.
release_header = [
    'id', 'target', 'date', 'date_offset', 'name', 'comment', 'author',
]
revision_header = [
    'id', 'date', 'date_offset', 'committer_date', 'committer_date_offset',
    'type', 'directory', 'message', 'author', 'committer',
]
# Paths to revision.csv / release.csv; the file names below select the
# gzip-compressed (.csv.gz) versions.
data_dir = 'D:/data/open_source'
revision_path = data_dir + '/revision.csv.gz'
release_path = data_dir + '/release.csv.gz'

In [3]:
def hex_str(str):
    """Decode a hex-encoded field into UTF-8 text.

    Every occurrence of the two-character marker (a backslash followed by
    ``x``) is removed from the input; the remaining hex digits are turned
    into bytes and decoded as UTF-8.
    """
    hex_digits = str.replace('\\x', '')
    raw_bytes = bytes.fromhex(hex_digits)
    return raw_bytes.decode('utf-8')

def msg_str(msg):
    """Decode a hex-encoded commit message as ISO-8859-1 text.

    The first two characters of ``msg`` are skipped and the rest is
    interpreted as hex digits.
    """
    payload = msg[2:]
    return bytes.fromhex(payload).decode('ISO-8859-1')

def date_str(str):
    """Parse a timestamp laid out as ``YYYY-MM-DDTHH:MM:SS.000Z`` with pandas."""
    timestamp_format = '%Y-%m-%dT%H:%M:%S.000Z'
    return pd.to_datetime(str, format=timestamp_format)

## Release Commit messages

In [4]:
# Read the release CSV (decoding the hex 'comment' field on the way in) and
# keep only the 'comment' column, so release_df ends up as a Series.
release_df = pd.read_csv(
    release_path,
    names=release_header,
    converters={'comment': hex_str},
)['comment']

## Revision Commit messages
##### Uses Chunking to import data.

In [5]:
# Read the revision CSV in chunks and keep only the decoded 'message' column.
# The per-chunk frames are collected in a list and concatenated once:
# DataFrame.append inside the loop re-copied the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
chunksize = 10 ** 4
message_chunks = [
    chunk[['message']]
    for chunk in pd.read_csv(
        revision_path,
        chunksize=chunksize,
        names=revision_header,
        usecols=['message'],  # the other columns are never used, so skip parsing them
        converters={'message': msg_str},
    )
]
# pd.concat raises on an empty list, so fall back to an empty frame as before.
revision_df = pd.concat(message_chunks) if message_chunks else pd.DataFrame()

###### Checking that the import worked

In [6]:
revision_df.tail()

Unnamed: 0,message
5188989,Fixes for unary and indexing operations.\n
5188990,"Revert ""Update CONTRIBUTING.md""\n\nThis revert..."
5188991,Improve SEO tools CSS across themes\n\nbzr rev...
5188992,Update CONTRIBUTING.md\n\nFixing broken issues...
5188993,Only run CoalesceExtSubRegs when we can expect...


# Classification
- Difference between release and revision commit messages.
- Train NLP Classifier to distinguish between a revision and release commit message.
- Sentiment analysis between revision and release commit messages.

#### Functions to remove special characters from commit messages

In [7]:
def clean_commit_msg(msg):
    """Reduce a commit message to ASCII letters, digits and spaces.

    Runs of whitespace (including newlines and tabs) are first collapsed to a
    single space so that words on adjacent lines stay separate tokens; they
    used to be deleted outright, gluing e.g. "file.md\\n\\nFixing" into
    "filemdFixing". Every remaining character outside ``[A-Za-z0-9 ]`` is
    then removed.

    Parameters
    ----------
    msg : str
        Raw commit message.

    Returns
    -------
    str
        The cleaned message.
    """
    single_spaced = re.sub(r'\s+', ' ', msg)
    return re.sub('[^A-Za-z0-9 ]+', '', single_spaced)
def clean_commit_msgs(msgs):
    """Return ``msgs`` with ``clean_commit_msg`` applied to each entry via ``.map``."""
    cleaned = msgs.map(clean_commit_msg)
    return cleaned

In [8]:
# Print the first rows of the release data, then of the revision data.
for preview_source in (release_df, revision_df):
    print(preview_source.head())

                                           id                date  \
0  \xae671a0067dbeabbc3cb546705edc1f81f71a193 2015-02-03 21:58:44   
1  \x33e2c27d1ec224a54ed7ca4a6e09c352e92a227d 2013-07-05 04:58:14   
2  \x1f9bcd823807f41afaab6b74b34473531ca7eb30 2013-03-29 13:02:20   
3  \x3206fb28a040494bac6973310e7f21f031989da6 2010-12-09 04:14:19   
4  \x652fc27cf9fe9262d2c941d6385043efa41016da 2011-11-08 01:22:48   

                                     name  \
0  \x72656c656173652d323031352d30322d3034   
1                        \x76302e32392e31   
2                          \x76332e302e30   
3                    \x6275696c642d343639   
4                            \x312e312e36   

                                             comment  author  
0                   Release for February 4th, 2015\n   91949  
1                               tag version 0.29.1\n  140982  
3     Windows build SickBeard-win32-alpha-build469\n  875380  
4                                     Fabric 1.1.6\n   63770  

#### Getting just the commit messages, and clean them (remove special chars)

In [9]:
# release_df was already reduced to the 'comment' Series when it was loaded,
# so two-dimensional positional indexing (.iloc[:, 3]) raises on it.
# Use the Series directly.
rel_msgs = release_df

In [11]:
rel_msgs = clean_commit_msgs(rel_msgs)
# Keep revision_df a DataFrame with a 'message' column. Rebinding it to the
# cleaned Series made this cell fail when re-run and broke the later cells
# that add a 'label' column and read revision_df['message'].
revision_df = revision_df.assign(message=clean_commit_msgs(revision_df['message']))

MemoryError: Unable to allocate array with shape (5188994,) and data type float64

#### Labels are *all* release. This should be changed when revisions are being properly used.

In [None]:
# Name the text column 'message' so it lines up with the revision frame's
# 'message' column when the two are combined; otherwise the combined frame
# gets two half-empty text columns.
rel_df = rel_msgs.to_frame(name='message')
rel_df['label'] = 'rel'

rel_df.head()

In [None]:
# If the cleaning step left revision_df as a Series, item assignment would
# append a row labelled 'label' rather than add a column. Make sure it is a
# DataFrame with a 'message' column first.
if isinstance(revision_df, pd.Series):
    revision_df = revision_df.to_frame(name='message')
revision_df['label'] = 'rev'

revision_df.head()

In [None]:
# Stack revision and release rows into one frame. DataFrame.append was
# removed in pandas 2.0; pd.concat is the supported equivalent and keeps the
# original row indices.
msgs_df = pd.concat([revision_df, rel_df])

#### Create vectorizer and vectorize the messages

In [None]:
# TF-IDF features over the combined messages: at most 2500 terms, ignoring
# terms in fewer than 7 documents or in more than 80% of them, and English
# stop words.
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
# Read from msgs_df (the combined frame); `msgs` is not defined at this point
# in the notebook. The result is kept as the sparse matrix fit_transform
# returns: calling .toarray() on millions of rows x 2500 float64 columns needs
# on the order of 100 GB, and train_test_split / RandomForestClassifier both
# accept sparse input.
msg_vect = vectorizer.fit_transform(msgs_df['message'])

In [None]:
msg_vect

#### Split the vectorized commit messages into training and testing data.

In [None]:
# 80/20 train/test split with a fixed seed. The labels come from msgs_df, the
# combined frame the features were built from; `msg_df` is a different,
# later-defined word-count frame with no 'label' column.
x_train, x_test, y_train, y_test = train_test_split(msg_vect, msgs_df['label'], test_size=0.2, random_state=0)

#### Using the Random Forest Classifier, train the model.

In [None]:
# Fit a 200-tree random forest with a fixed seed on the training split.
text_classifier = RandomForestClassifier(random_state=0, n_estimators=200)
text_classifier.fit(X=x_train, y=y_train)

### Using the trained model, predict using the testing data
#### This is not useful for now, since *only* releases are being used. Only 1 label, so no difference 

In [None]:
# Predict a label for every held-out message vector in x_test.
predictions = text_classifier.predict(x_test)

In [None]:
# Print the confusion matrix, the per-class report and the overall accuracy,
# in that order, for the held-out predictions.
for metric_fn in (confusion_matrix, classification_report, accuracy_score):
    print(metric_fn(y_test, predictions))

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 



# Commit Message Analytics
- Most common words used in commit messages.
- Average length in a commit message.
- ??? ML not used much in this section, so may not be of priority.

In [None]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download('punkt')

In [None]:
# Analyse the release messages prepared earlier in the notebook (rel_msgs).
msgs = rel_msgs

### Tokenizing the commit messages. Stemming and trimming long words.
#### Should ensure the cleaning function gets rid of meaningless numbers / strings. This is a primitive way to tokenize, so the method used in the 'Classification' section above is likely preferable.

In [None]:
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
word_max_length = 20
# Clean and tokenize each message, drop stop words and over-long tokens, and
# stem what is left. The stop-word test compares the lower-cased token: the
# NLTK list is lower-case, so capitalised words such as "The" used to slip
# through and were then lower-cased by the stemmer into "the".
msgs_token = [
    ps.stem(w)
    for msg in msgs
    for w in word_tokenize(clean_commit_msg(msg))
    if w.lower() not in stop_words and len(w) <= word_max_length
]

#### Number of tokens

In [None]:
len(msgs_token)

### Attempting to gain some analytics of the data.
- Most common words ?

In [None]:
# One row per token occurrence, in a single-column frame.
msg_df = pd.Series(msgs_token).to_frame()

In [None]:
# Placeholder column of zeros; it gives the later groupby(...).count() a
# column whose non-null entries are counted per word.
msg_df['count'] = 0

In [None]:
msg_df.head()

In [None]:
# Give the token column and the placeholder column readable names.
msg_df.columns = ['word', 'count']

In [None]:
# Number of rows per distinct word, i.e. how often each token occurs.
msg_counts = msg_df.groupby('word').count()

In [None]:
msg_counts

In [None]:
# Index label (the word) with the largest count.
msg_counts['count'].idxmax()

In [None]:
# Call the method: a bare `msg_counts.count` only displays the bound method
# object. The call reports the number of non-null entries per column, which
# here is the number of distinct words.
msg_counts.count()