# Understanding commit messages
- The goal is to gain insight to commit messages. What are the most common phrases, words, or sayings?

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from scipy import stats
from scipy.stats import norm
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')

# nltk.download('stopwords') # Download stopwords
plt.rcParams["figure.figsize"] = [13,6]

In [2]:
release_header = ['id', 'target', 'date', 'date_offset', 'name', 'comment', 'author']
revision_header = ['id', 'date', 'date_offset', 'committer_date', 'committer_date_offset', 'type', 'directory', 'message', 'author', 'committer']
# Path to release.csv, and whether to select uncompressed or ocmpressed version
revision_path = 'D:/data/open_source/revision.csv.gz'
release_path = 'D:/data/open_source/release.csv.gz'

In [3]:
def hex_str(str):
    return bytes.fromhex((str).replace('\\x','')).decode('utf-8')

def date_str(str):
    return pd.to_datetime(str, format='%Y-%m-%dT%H:%M:%S.000Z')

In [4]:
release_df = pd.read_csv(release_path, names=release_header, converters={'date':date_str,'comment':hex_str})
release_df = release_df[['id','date','name','comment','author']]

Revision is giving me trouble during importing.

The kernal keeps dying. The file may be too large?

In [5]:
# rev_df = pd.read_csv(revision_path, header=None)
# rev_df.head()

# chunksize = 10 ** 7
# for chunk in pd.read_csv(revision_path, chunksize=chunksize):
#     print(chunk.head())
    
    
# rev_df.columns = ['id', 'date', 'date_offset', 'committer_date', 'committer_date_offset', 'type', 'directory', 'message', 'author', 'committer']
# rev_df = rev_df[['id', 'date', 'message']]
# rev_d

# rev_df['id'] = rev_df['id'].apply(lambda x: x[2:])
# rev_df['date'] = rev_df['date'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%dT%H:%M:%S.000Z'))
# rev_df['message'] = rev_df['message'].apply(lambda x: bytes.fromhex(x[2:]).decode(encoding='ISO-8859-1'))
# rev_df

# Classification

In [6]:
def clean_commit_msg(msg):
    return re.sub('[^A-Za-z0-9 ]+', '', msg)
def clean_commit_msgs(msgs):
    return msgs.map(clean_commit_msg)

In [7]:
# release_df.head()

Get just the commit messages, and clean them (remove special chars)

In [8]:
msgs = release_df.iloc[:,3]

In [9]:
msgs = clean_commit_msgs(msgs)

In [10]:
msgs_size = msgs.shape[0]
labels = ["release" for x in range(msgs_size)]

In [11]:
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
msg_vect = vectorizer.fit_transform(msgs).toarray()

In [12]:
msg_vect

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.02793413],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [13]:
x_train, x_test, y_train, y_test = train_test_split(msg_vect, labels, test_size=0.2, random_state=0)

In [14]:
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
predictions = text_classifier.predict(x_test)

In [16]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[2193]]
              precision    recall  f1-score   support

     release       1.00      1.00      1.00      2193

    accuracy                           1.00      2193
   macro avg       1.00      1.00      1.00      2193
weighted avg       1.00      1.00      1.00      2193

1.0


- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 

- 
- 



# Analytics

In [17]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download('punkt')

In [18]:
# msgs = release_df.iloc[:,3]
# msgs = clean_commit_msgs(msgs)

### Tokenizing the messages

In [19]:
stop_words=set(stopwords.words("english"))
ps = PorterStemmer()
word_max_length = 20
# tokenize, stemmize, and remove stop words
msgs_token = []
for msg in msgs:
    msg = clean_commit_msg(msg)
    for w in word_tokenize(msg):
        if w not in stop_words and len(w) <= word_max_length:
            w_stem = ps.stem(w)
            msgs_token.append(w_stem)

In [20]:
len(msgs_token)

93283

In [40]:
msg_df = pd.Series(msgs_token).to_frame()

In [83]:
msg_df['count'] = 0

In [84]:
msg_df.head()

Unnamed: 0,word,count
0,releas,0
1,februari,0
2,4th,0
3,2015,0
4,tag,0


In [92]:
msg_df.columns = ['word', 'count']

In [86]:
msg_counts = msg_df.groupby('word').count()

In [89]:
msg_counts

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
0,8
00,2
000108,1
000begin,1
000dev20151006begin,1
...,...
zu,1
zuckschwerdt,2
zuhao,1
zum,1


In [91]:
msg_counts['count'].idxmax()

'pgp'

In [88]:
msg_counts.count

<bound method DataFrame.count of                      count
word                      
0                        8
00                       2
000108                   1
000begin                 1
000dev20151006begin      1
...                    ...
zu                       1
zuckschwerdt             2
zuhao                    1
zum                      1
zuo                      2

[14855 rows x 1 columns]>