In [None]:
!pip install PyGithub
!pip install markdown
!pip install beautifulsoup4
!pip install contractions
!pip install gensim
!pip install nltk

import re, string, unicodedata

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

from github import Github
from github import UnknownObjectException

import contractions
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from bs4 import BeautifulSoup
from markdown import markdown
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

## Utility functions


### Noise Removal

In [2]:
def remove_urls(text):
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
    return(text)

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")

    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    return text

def replace_contractions(text):
    return contractions.fix(text)

### Normalization

In [3]:
def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def contains_number(value):
    if True in [char.isdigit() for char in value]:
        return True
    return False

def delete_numbers(words):
    new_words = []
    for word in words:
        if contains_number(word) == False:
            new_words.append(word)
    return new_words

def contains_underscore(word):
    for c in word:
      if c == '_':
        return True
    return False

def remove_stopwords(words):
    new_words = []
    STOPWORDS = stopwords.words('english') + ['http', 'https']
    for word in words:
        if word not in STOPWORDS and contains_underscore(word) == False:
            new_words.append(word)
    return new_words

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_words(words):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        lemmas.append(lemma)
    return lemmas

def remove_short_and_long_words(words):
    new_words = []
    for word in words:
        if len(word) > 1 and len(word) < 28:
            new_words.append(word)
    return new_words

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = delete_numbers(words)
    words = remove_stopwords(words)
    words = remove_short_and_long_words(words)
    return words

In [4]:
def get_words_from_readme(text):
    is_html = bool(BeautifulSoup(text, "html.parser").find())
    text = re.sub(r"```[^\S\r\n]*[a-z]*\n.*?\n```", '', text, 0, re.DOTALL)
    text = strip_html(text)
    html = text
    if is_html == False:
      html = markdown(text)

    # Noise Removal
    readme_text = denoise_text(html)
    readme_text = replace_contractions(readme_text)

    # Tokenization
    words = nltk.word_tokenize(readme_text)
    
    # Normalization 
    words = normalize(words)
    words = lemmatize_words(words)
    return words

def print_synonyms(word_vectors, most_common_words):
    for word in most_common_words:
        print(word + " : ", end="")
        [print(synonym, end=' | ') for synonym, freq in word_vectors.most_similar(word, topn=5)]
        print()

## Load data

In [5]:
org_name = 'apache'
TOKEN = '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'

g = Github(login_or_token=TOKEN)

readme_files_content_md = []
for repo in g.get_organization(org_name).get_repos():
  if repo.fork == False:
      try:
          readme_file = repo.get_readme()
          content_md = readme_file.decoded_content.decode('utf_8')
          readme_files_content_md.append(content_md)
      except UnknownObjectException:
          continue


## Data Preparation

In [6]:
data = []
for content in readme_files_content_md:
    data.append(get_words_from_readme(content))

## Initialize model

In [7]:
model = Word2Vec(size=300, min_count=5)
model.build_vocab(data)

In [None]:
pretrained = api.load("glove-wiki-gigaword-300")
tmp_file = get_tmpfile("pretrained_vectors.txt")
pretrained.save_word2vec_format(tmp_file)
init_vocab = [list(pretrained.vocab.keys())]

In [9]:
min_count = model.vocabulary.min_count
model.vocabulary.min_count = 1
model.build_vocab(init_vocab, update=True)
model.intersect_word2vec_format(tmp_file, binary=False, lockf=1.0)
model.vocabulary.min_count = min_count

## Synonyms before retraining

In [10]:
most_common_words = ['apache', 'function', 'docker', 'sling', 
                     'application', 'library', 'log',
                     'framework', 'build', 'maven', 'request', 'branch', 
                     'script', 'jira']
d = FreqDist()
for docs in data:
    d.update(FreqDist(docs))

["Word = '{}' Frequency = {}".format(word, d[word]) for word in most_common_words]

["Word = 'apache' Frequency = 5789",
 "Word = 'function' Frequency = 933",
 "Word = 'docker' Frequency = 759",
 "Word = 'sling' Frequency = 1045",
 "Word = 'application' Frequency = 1204",
 "Word = 'library' Frequency = 1183",
 "Word = 'log' Frequency = 573",
 "Word = 'framework' Frequency = 481",
 "Word = 'build' Frequency = 3091",
 "Word = 'maven' Frequency = 1357",
 "Word = 'request' Frequency = 1205",
 "Word = 'branch' Frequency = 1054",
 "Word = 'script' Frequency = 806",
 "Word = 'jira' Frequency = 553"]

In [12]:
print_synonyms(pretrained, most_common_words)

apache : apaches | ah-64 | kiowa | comanche | helicopter | 
function : functions | i.e. | functional | hence | defined | 
docker : oyapock | heatherington | dacre | guiot | puleston | 
sling : blade | swivels | slings | swivel | strap | 
application : applications | applied | apply | applying | user | 
library : libraries | archives | archive | museum | collections | 
log : logs | cabins | cabin | wooden | one-room | 
framework : implementation | frameworks | principles | implement | implementing | 
build : construct | develop | built | create | building | 
maven : mavens | doyenne | etiquette | homemaking | lingo | 
request : requested | requests | requesting | asked | asking | 
branch : branches | railway | established | line | offices | 
script : scripts | screenplay | written | writing | screenwriter | 
jira : producer-director | supernature | flabellina | 5l | selles | 


## Retraining

In [None]:
model.train(data, total_examples=len(data), epochs=50)

## Synonyms after retraining

In [14]:
print_synonyms(model.wv, most_common_words)

apache : use | allow | us | component | build | 
function : functions | method | return | refaddeventlistener | define | 
docker : dockercompose | container | dockerfile | dockerhub | image | 
sling : branchtrunk | launchpad | uimaas | orgapacheslingsuperimposing | jcr | 
application : framework | use | app | project | apps | 
library : libraries | build | include | define | addition | 
log : logs | info | trace | logger | statistic | 
framework : application | different | business | solution | together | 
build : built | run | building | use | test | 
maven : gradle | mvn | pullplugin | artifact | version | 
request : requests | response | could | respond | rejected | 
branch : asfsite | merge | graysvg | asfstaging | the | 
script : scripts | screenplay | file | process | build | 
jira : ticket | componentsgit | expiretime | submit | pluginid | 
