# Understanding commit messages
### Using NLP Classification and Sentiment analysis
- The goal is to gain insight into commit messages. More detail is given in the specific sections below.


In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from scipy import stats
from scipy.stats import norm
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# One-time setup: uncomment to fetch the NLTK stop-word corpus.
# nltk.download('stopwords') # Download stopwords
# Default figure size (width, height in inches) for all matplotlib plots.
plt.rcParams["figure.figsize"] = [13,6]

#### Defining file paths and headers of the release and revision CSV files.

In [2]:
# Column names passed as `names=` when reading the release CSV.
release_header = [
    'id',
    'target',
    'date',
    'date_offset',
    'name',
    'comment',
    'author',
]
# Column names passed as `names=` when reading the revision CSV.
revision_header = [
    'id',
    'date',
    'date_offset',
    'committer_date',
    'committer_date_offset',
    'type',
    'directory',
    'message',
    'author',
    'committer',
]
# Locations of the gzip-compressed release and revision CSV files.
release_path = 'D:/data/open_source/release.csv.gz'
revision_path = 'D:/data/open_source/revision.csv.gz'

In [3]:
def hex_str(str):
    r"""Decode a hex-encoded string into UTF-8 text.

    Every literal ``\x`` marker is removed, the remaining hex digits are
    turned into bytes, and those bytes are decoded as UTF-8.

    Raises ValueError if the remaining text is not valid hex, and
    UnicodeDecodeError if the bytes are not valid UTF-8.
    """
    hex_digits = str.replace('\\x', '')
    raw_bytes = bytes.fromhex(hex_digits)
    return raw_bytes.decode('utf-8')

def msg_str(msg):
    r"""Decode a hex-encoded commit message as ISO-8859-1 text.

    The first two characters of *msg* are skipped (presumably a ``\x``
    prefix -- confirm against the data); the rest is parsed as hex digits.

    Raises ValueError if the remainder is not valid hex.
    """
    hex_digits = msg[2:]
    raw_bytes = bytes.fromhex(hex_digits)
    return raw_bytes.decode(encoding='ISO-8859-1')

def date_str(str):
    """Parse a timestamp written like ``2020-01-02T03:04:05.000Z``.

    The value is handed to ``pd.to_datetime`` with an explicit format, so
    input that does not match the layout is rejected by pandas.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.000Z'
    return pd.to_datetime(str, format=timestamp_format)


In [4]:
def clean_commit_msg(msg):
    """Return *msg* with everything except ASCII letters, digits and spaces removed."""
    disallowed_run = '[^A-Za-z0-9 ]+'
    return re.sub(disallowed_run, '', msg)
def clean_commit_msgs(msgs):
    """Run ``clean_commit_msg`` over every element of *msgs* via its ``map`` method.

    In this notebook *msgs* is a pandas Series of message strings, and the
    result is whatever ``msgs.map`` returns.
    """
    cleaned = msgs.map(clean_commit_msg)
    return cleaned

## Release Commit messages

In [5]:
# Load the release table, decoding the hex-encoded 'comment' column while reading.
release_df = pd.read_csv(release_path, names=release_header, converters={'comment':hex_str})
# Keep only the cleaned comment text, as a one-column frame.
release_df = clean_commit_msgs( release_df['comment'] ).to_frame()
# rename() returns a new frame and maps index labels unless `columns=` is given,
# so the call must target columns and its result must be assigned back.
release_df = release_df.rename(columns={'comment': 'message'})
# Class label used to tell release messages apart from revision messages.
release_df['label'] = 'rel'

## Revision Commit messages
##### Uses Chunking to import data.

In [7]:
# Stream the revision CSV in chunks, clean each chunk, and build the frame once.
# Growing a DataFrame with append() inside the loop re-copies every accumulated
# row on each iteration (and DataFrame.append was removed in pandas 2.0), so the
# cleaned chunks are collected in a list and concatenated a single time.
# NOTE(review): if memory is still tight, consider passing usecols=['message'] to
# read_csv -- confirm first that no other column is needed downstream.
chunksize = 10 ** 3
revision_chunks = []
for chunk in pd.read_csv(revision_path, chunksize=chunksize, names=revision_header, converters={'message':msg_str}):
    chunk['message'] = clean_commit_msgs(chunk['message'])
    # Class label used to tell revision messages apart from release messages.
    chunk['label'] = 'rev'
    revision_chunks.append(chunk)

# pd.concat raises on an empty list, so fall back to an empty frame.
revision_df = pd.concat(revision_chunks) if revision_chunks else pd.DataFrame()
# Drop the per-chunk references so only the combined frame stays alive.
del revision_chunks
    

MemoryError: 

###### Checking that the import worked

In [None]:
# Display the last rows of the revision frame as a sanity check on the import.
revision_df.tail()

In [None]:
# Display the last rows of the release frame as a sanity check on the import.
release_df.tail()

# Classification
- Difference between release and revision commit messages.
- Train NLP Classifier to distinguish between a revision and release commit message.
- Sentiment analysis between revision and release commit messages.

#### Functions to remove special characters from commit messages

In [None]:
# Print the first rows of each frame (release first, then revision).
print( release_df.head() )
print( revision_df.head() )

#### Getting just the commit messages, and clean them (remove special chars)

In [None]:
# Label-based slice up to and including index label 14, then show the first
# rows of that slice.
revision_df.loc[:14].head()

In [None]:
# Stack the revision rows and the release rows into a single frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original index labels.
msgs_df = pd.concat([revision_df, release_df])

#### Create vectorizer and vectorize the messages

In [None]:
# TF-IDF features: at most 2500 terms, each present in at least 7 messages and in
# no more than 80% of them, with English stop words removed.
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
# The combined frame is `msgs_df`; `msgs` is not defined at this point in the notebook.
# toarray() turns the sparse result into a dense matrix.
msg_vect = vectorizer.fit_transform( msgs_df['message'] ).toarray()

In [None]:
# Display the dense TF-IDF matrix (one row per message, one column per term).
msg_vect

#### Split the vectorized commit messages into training and testing data.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# Labels come from `msgs_df`, the frame the vectors were built from; `msg_df` is a
# different object (the token table created later in the notebook).
x_train, x_test, y_train, y_test = train_test_split(msg_vect, msgs_df['label'], test_size=0.2, random_state=0)

#### Using the Random Forest Classifier, train the model.

In [None]:
# Random forest with 200 trees; the fixed seed makes training reproducible.
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
# Fit on the training split of the TF-IDF vectors and their labels.
text_classifier.fit(x_train, y_train)

### Using the trained model, predict using the testing data

In [None]:
# Predict a label for every row of the held-out test vectors.
predictions = text_classifier.predict(x_test)

In [None]:
# Print, in order: the confusion matrix, the classification report, and the
# accuracy score, each comparing the true test labels with the predictions.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 



# Commit Message Analytics
- Most common words used in commit messages.
- Average length in a commit message.
- ??? ML not used much in this section, so may not be of priority.

In [None]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download('punkt')

In [None]:
# Analyse the cleaned release messages. `rel_msgs` is not defined in this
# notebook, so the message text is taken from release_df instead: it is the
# first column, selected by position so the cell works whatever it is named.
msgs = release_df.iloc[:, 0]

### Tokenizing the commit messages. Stemming and trimming long words.
#### Should ensure the cleaning function gets rid of meaningless numbers / strings. This is a primitive way to tokenize, so the method used in the 'Classification' section above is likely preferable.

In [None]:
# Build one flat list of stemmed tokens from all messages.
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
# Tokens longer than this are skipped.
word_max_length = 20
msgs_token = []
for msg in msgs:
    msg = clean_commit_msg(msg)
    for w in word_tokenize(msg):
        # The NLTK stop-word list is lower-case, so compare the lower-cased
        # token; otherwise capitalised stop words such as "The" pass the filter.
        if w.lower() not in stop_words and len(w) <= word_max_length:
            w_stem = ps.stem(w)
            msgs_token.append(w_stem)

#### Number of tokens

In [None]:
# Total number of stemmed tokens collected (duplicates included).
len(msgs_token)

### Attempting to gain some analytics of the data.
- Most common words ?

In [None]:
# One row per token, held in a single-column DataFrame.
msg_df = pd.Series(msgs_token).to_frame()

In [None]:
# Placeholder column; its values are never read, it only gives groupby().count()
# below a column to count.
msg_df['count'] = 0

In [None]:
# Display the first rows of the token frame.
msg_df.head()

In [None]:
# Name the token column 'word' so it can be used as the grouping key.
msg_df.columns = ['word', 'count']

In [None]:
# Rows per distinct word: the result is indexed by word and its 'count' column
# holds how many times that word occurs in msg_df.
msg_counts = msg_df.groupby('word').count()

In [None]:
# Display the per-word occurrence table.
msg_counts

In [None]:
# Index label (i.e. the word) with the highest occurrence count.
msg_counts['count'].idxmax()

In [None]:
# Number of rows per column of msg_counts, i.e. the number of distinct words.
# count must be called; the bare attribute only displays the bound method.
msg_counts.count()