# How to Rank Text Content by Semantic Similarity
Based on [this article](https://towardsdatascience.com/how-to-rank-text-content-by-semantic-similarity-4d2419a84c32), applied here to actual GitHub repository data pulled for `machine learning`.

In [1]:
# dependencies

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
import plotly.express as px

## Stop word settings in English

In [2]:
# NLTK corpora used by this notebook. The downloads stay commented out
# because they are already done in the Dockerfile:
#   nltk.download('stopwords')
#   nltk.download('wordnet')

# English stop words, collected into a set
stop_words = {*stopwords.words('english')}

# adapter so that an sklearn vectorizer can use NLTK tokenization + lemmatization
class LemmaTokenizer:
    """Callable that turns a text into a list of lemmatized tokens.

    The text is split with ``word_tokenize``; tokens listed in
    ``ignore_tokens`` (punctuation and quote-mark tokens) are dropped and
    every remaining token is passed through the WordNet lemmatizer.
    """

    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` in their original order."""
        lemmas = []
        for token in word_tokenize(doc):
            if token in self.ignore_tokens:
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

# Run the stop words through the same tokenizer as the documents, so the
# stop-word list is made of lemmatized tokens as well.
tokenizer = LemmaTokenizer()
token_stop = tokenizer(' '.join(stop_words))

# TF-IDF model that tokenizes with the lemma tokenizer and drops the stop words
vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words=token_stop)

## Data Preparations

In [3]:
# read the GitHub repository table and show its (rows, columns) size
df = pd.read_csv(filepath_or_buffer='./data/github_ml.csv')
df.shape

(2361, 35)

In [4]:
# preview the first 10 rows of the raw table
df.head(n=10)

Unnamed: 0,Node ID,full_name,About_+_Topics,search_query,url,html_url,created_date,created_year,owner_name_raw,owner name_revised,...,dependent_count,Educational,Stars Norm,Centrality Norm,Degree Norm,Fork Norm,Watcher Norm,Dependent Norm,# of Repos Norm,Quick Rank
0,0,borisbanushev/stockpredictionai,In this noteboook I will create a compl...,neural-network stars:>500,https://api.github.com/repos/borisbanushev/sto...,https://github.com/borisbanushev/stockpredicti...,1/9/19,1/1/19,borisbanushev,,...,0.0,User,0.497527,0.309644,0.653847,0.601694,0.634295,0.477496,0.381682,0.460542
1,1,edvardHua/PoseEstimationForMobile,:dancer: Real-time single person pose estimat...,deep-neural-networks stars:>500,https://api.github.com/repos/edvardHua/PoseEst...,https://github.com/edvardHua/PoseEstimationFor...,4/12/18,1/1/18,edvardHua,,...,0.0,User,0.351079,0.772767,0.653847,0.412062,0.365602,0.477496,0.381682,0.446377
2,2,jivoi/awesome-ml-for-cybersecurity,:octocat: Machine Learning for Cyber Security...,machine-learning stars:>500,https://api.github.com/repos/jivoi/awesome-ml-...,https://github.com/jivoi/awesome-ml-for-cybers...,12/20/16,1/1/16,jivoi,,...,0.0,User,0.602809,0.833017,0.487403,0.59076,0.79913,0.477496,0.381682,0.522153
3,3,yhangf/ML-NOTE,:orange_book:ÊÖ¢ÊÖ¢Êï¥ÁêÜÊâÄÂ≠¶ÁöÑÊú∫Âô®Â≠¶‰π...,ml stars:>500,https://api.github.com/repos/yhangf/ML-NOTE,https://github.com/yhangf/ML-NOTE,5/12/18,1/1/18,yhangf,,...,0.0,User,0.33324,0.791447,0.989792,0.391961,0.348361,0.477496,0.381682,0.476321
4,4,airsonic/airsonic,":satellite: :cloud: :notes:Airsonic, a Free a...",ai stars:>500,https://api.github.com/repos/airsonic/airsonic,https://github.com/airsonic/airsonic,7/4/17,1/1/17,airsonic,Airsonic,...,0.0,No,0.415697,0.282288,0.850061,0.407608,0.38186,0.477496,0.415614,0.441558
5,5,google-research/recsim,A Configurable Recommender Systems Simulation...,artificial-intelligence stars:>500,https://api.github.com/repos/google-research/r...,https://github.com/google-research/recsim,9/25/19,1/1/19,google-research,Alphabet,...,18.0,No,0.331146,0.946009,0.572237,0.385843,0.345923,0.478024,1.0,0.758116
6,6,AI4Finance-LLC/FinRL,A Deep Reinforcement Learning Library for Aut...,deep-learning stars:>500,https://api.github.com/repos/AI4Finance-LLC/FinRL,https://github.com/AI4Finance-LLC/FinRL,7/26/20,1/1/20,AI4Finance-LLC,AI4Finance,...,5.0,No,0.44231,0.251879,0.094505,0.463626,0.446104,0.477643,0.524587,0.432136
7,7,scikit-learn-contrib/imbalanced-learn,A Python Package to Tackle the Curse of Imbal...,machine-learning stars:>500,https://api.github.com/repos/scikit-learn-cont...,https://github.com/scikit-learn-contrib/imbala...,8/16/14,1/1/14,scikit-learn-contrib,Scikit-learn,...,5764.0,Yes,0.654737,0.31311,0.928403,0.550287,0.490726,0.643693,0.729487,0.65847
8,8,google/aiyprojects-raspbian,"API libraries, samples, and system images for...",ai stars:>500,https://api.github.com/repos/google/aiyproject...,https://github.com/google/aiyprojects-raspbian,4/20/17,1/1/17,google,Alphabet,...,0.0,No,0.394367,0.335148,0.403136,0.479419,0.517064,0.477496,1.0,0.712913
9,9,PacktPublishing/Advanced-Deep-Learning-with-Keras,"Advanced Deep Learning with Keras, published ...",deep-learning stars:>500,https://api.github.com/repos/PacktPublishing/A...,https://github.com/PacktPublishing/Advanced-De...,3/15/18,1/1/18,PacktPublishing,PacktPublishing,...,0.0,Yes,0.360846,0.263596,0.794757,0.466878,0.364361,0.477496,0.63174,0.540914


In [5]:
# distinct owner types present in the data
df['owner_type'].unique()

array(['User', 'Organization'], dtype=object)

In [6]:
# Split 'full_name' ("owner/repository") into 'Owner' and 'Repository'.
# 'Owner' is not used afterwards; 'owner name_revised' holds the adjusted owner name.
# n=1 splits on the first '/' only, so the result never has more than two
# columns; an unbounded split would produce a third column (and make this
# two-column assignment fail) if any name contained an extra '/'.
df[['Owner','Repository']] = df['full_name'].str.split('/', n = 1, expand = True)

In [7]:
# source column -> output column; the key order is the column order of df_select
selected_columns = {
    'Node ID': 'NodeID',
    'Repository': 'Repository',
    'About_+_Topics': 'About_Topics',
    'owner name_revised': 'OwnerName',
    'owner_type': 'OwnerType',
    'Stars Norm': 'StarsNorm',
    'Fork Norm': 'ForkNorm',
    'Watcher Norm': 'WatcherNorm',
}

# keep only repositories owned by an organization, restricted to the needed
# columns, then apply the new column names
is_organization = df['owner_type'] == 'Organization'
df_select = df.loc[is_organization, list(selected_columns)].rename(columns = selected_columns)

print(df_select.shape)
print(df_select.head())

(989, 8)
   NodeID            Repository  \
4       4              airsonic   
5       5                recsim   
6       6                 FinRL   
7       7      imbalanced-learn   
8       8  aiyprojects-raspbian   

                                        About_Topics     OwnerName  \
4   :satellite: :cloud: :notes:Airsonic, a Free a...      Airsonic   
5   A Configurable Recommender Systems Simulation...      Alphabet   
6   A Deep Reinforcement Learning Library for Aut...    AI4Finance   
7   A Python Package to Tackle the Curse of Imbal...  Scikit-learn   
8   API libraries, samples, and system images for...      Alphabet   

      OwnerType  StarsNorm  ForkNorm  WatcherNorm  
4  Organization   0.415697  0.407608     0.381860  
5  Organization   0.331146  0.385843     0.345923  
6  Organization   0.442310  0.463626     0.446104  
7  Organization   0.654737  0.550287     0.490726  
8  Organization   0.394367  0.479419     0.517064  


In [8]:
# number of rows per OwnerName, largest first, limited to the top 20
(
    df_select
    .groupby(['OwnerName'])
    .size()
    .reset_index(name='Counts')
    .sort_values(by='Counts', ascending=False)
    .head(20)
)

Unnamed: 0,OwnerName,Counts
31,Alphabet,48
244,Microsoft,43
132,Facebook,33
131,FOSSASIA,31
377,TensorFlow,28
260,NVIDIA,19
183,Intel,18
33,Amazon,13
376,Tencent,11
431,aio-libs,9


## Document Scores

In [9]:
# corpus: the 'About_Topics' text of every selected row, as a numpy array
documents = df_select['About_Topics'].to_numpy()

# Tokenize each document and fit the TF-IDF model on the corpus. The result
# is a sparse matrix of document vectors (one row per document) holding
# TF-IDF weights.
doc_vectors = vectorizer.fit_transform(documents)

# compare the shapes before and after vectorization
print(documents.shape, doc_vectors.shape, sep='\n')

(989,)
(989, 4643)


In [10]:
# Similarity score (0 to 1) of every document against every document.
# linear_kernel is a plain dot product; TfidfVectorizer L2-normalizes each row
# by default, so the dot product equals the cosine similarity.
cosine_similarities = linear_kernel(doc_vectors, doc_vectors)

# show the matrix shape, then the matrix itself
print('shape : ', cosine_similarities.shape)
print('matrix array : ', cosine_similarities)

shape :  (989, 989)
matrix array :  [[1.00000000e+00 1.46465101e-03 8.69416967e-04 ... 8.66638971e-02
  8.87361142e-04 9.66560509e-04]
 [1.46465101e-03 1.00000000e+00 1.61912974e-03 ... 2.15610552e-02
  2.00144441e-02 7.57195177e-03]
 [8.69416967e-04 1.61912974e-03 1.00000000e+00 ... 2.75240327e-02
  9.80952330e-04 5.57714229e-03]
 ...
 [8.66638971e-02 2.15610552e-02 2.75240327e-02 ... 1.00000000e+00
  3.22889164e-02 7.64963380e-03]
 [8.87361142e-04 2.00144441e-02 9.80952330e-04 ... 3.22889164e-02
  1.00000000e+00 7.64180510e-03]
 [9.66560509e-04 7.57195177e-03 5.57714229e-03 ... 7.64963380e-03
  7.64180510e-03 1.00000000e+00]]


## Find a relationship and store in a tabular format
The table holds the two node IDs of each pair and their similarity score (to be used as the weight of the relationship).

In [11]:
# histogram to see scoring distribution
# fig = px.histogram(cosine_similarities.flatten(), template = 'plotly_dark')
# fig.show()

In [12]:
# minimum similarity score for a pair to be kept as a relationship
score_limit = 0.25

# label both axes of the score matrix with the Node IDs
node_ids = df_select['NodeID']
df_similarity = pd.DataFrame(cosine_similarities, index = node_ids, columns = node_ids)
df_similarity.head()

NodeID,4,5,6,7,8,9,11,12,13,17,...,2349,2350,2351,2353,2355,2356,2357,2359,2360,2361
NodeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.0,0.001465,0.000869,0.001591,0.029294,0.001525,0.001781,0.001592,0.002234,0.001481,...,0.001142,0.001059,0.001638,0.001401,0.001347,0.001001,0.00117,0.086664,0.000887,0.000967
5,0.001465,1.0,0.001619,0.012466,0.050238,0.049859,0.003317,0.012475,0.00416,0.011604,...,0.018557,0.017217,0.026614,0.00261,0.010549,0.007843,0.026384,0.021561,0.020014,0.007572
6,0.000869,0.001619,1.0,0.009182,0.014295,0.021937,0.001969,0.009188,0.082826,0.0144,...,0.001262,0.001171,0.00181,0.008085,0.001489,0.001107,0.001293,0.027524,0.000981,0.005577
7,0.001591,0.012466,0.009182,1.0,0.032519,0.016104,0.003604,0.05015,0.023592,0.012607,...,0.030067,0.099362,0.003313,0.014798,0.044194,0.057201,0.002367,0.012594,0.023366,0.053936
8,0.029294,0.050238,0.014295,0.032519,1.0,0.002394,0.002797,0.0025,0.003508,0.002326,...,0.012564,0.001664,0.024615,0.021063,0.014817,0.024491,0.001837,0.095141,0.021709,0.043861


In [13]:
# number of rows (index labels) and number of columns of the similarity table
row_len, col_len = df_similarity.shape

# positions of all rows
row_range = range(row_len)

print('N of indexes : ', row_len)
print('Range of indexes : ', row_range)
print('N of colunns : ', col_len)

N of indexes :  989
Range of indexes :  range(0, 989)
N of colunns :  989


In [14]:
# Collect every pair of distinct nodes whose similarity reaches score_limit,
# i.e. the combinations to be created as relationships in the graph database.
# Node1 / Node2 hold the two Node IDs of a pair and Score the matching
# similarity; the three lists are aligned position by position.

# score matrix and its index / column labels as numpy arrays
np_similarity = df_similarity.to_numpy()
np_similarity_index = df_similarity.index.to_numpy()
np_similarity_columns = df_similarity.columns.to_numpy()

# Keep only the cells strictly above the diagonal (k=1): this removes the
# same ID x ID cells and cuts the symmetric 2D matrix in half, so each pair
# appears once. Vectorized replacement of the nested row/column loop.
pair_mask = np.triu(np_similarity >= score_limit, k=1)

# np.nonzero lists positions in row-major order (rows ascending, then columns
# ascending), which is the order the nested loop visited them.
row_pos, col_pos = np.nonzero(pair_mask)

Node1 = list(np_similarity_index[row_pos])
Node2 = list(np_similarity_columns[col_pos])
Score = list(np_similarity[row_pos, col_pos])

In [15]:
# Build the relation table column by column so that each column keeps its own
# dtype. Stacking the three lists as rows and transposing would up-cast the
# integer IDs to float together with Score before casting them back.
df_relation = pd.DataFrame({'Node1_ID': Node1, 'Node2_ID': Node2, 'Score': Score})

# Make the dtypes explicit: integer IDs and float scores. This also covers
# an empty result, where pandas has no values to infer a dtype from.
df_relation = df_relation.astype({'Node1_ID': int, 'Node2_ID': int, 'Score': float})

print('data type : \n', df_relation.dtypes)
print('\ndata shape : ', df_relation.shape)
print('\nfirst 5 rows in the table : \n', df_relation.head())

data type : 
 Node1_ID      int64
Node2_ID      int64
Score       float64
dtype: object

data shape :  (1208, 3)

first 5 rows in the table : 
    Node1_ID  Node2_ID     Score
0         5       645  0.253476
1         5      1716  0.314549
2         7      2126  0.288205
3         7      2164  0.527469
4         8       869  0.327332


In [16]:
# Write the node table and the relation table to CSV.
# Both files include the DataFrame index as their first column.
df_select.to_csv(path_or_buf='./data/node.csv')
df_relation.to_csv(path_or_buf='./data/relation.csv')