### This is a model that recommends GitHub repositories to diversify a user's experience (currently in the experimentation phase)

In [2]:
import pandas as pd
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import sys

sys.path.append('../../')
from codecompasslib.models.embeddings import load_word2vec_model, vectorize_text

In [3]:
# Load the cleaned repository table, keeping only the columns needed to analyse a user's repositories
repo_columns = ['owner_user', 'name', 'description', 'language']
df = pd.read_csv('../Data/clean/allReposCleaned.csv', usecols=repo_columns)
df.head()

Unnamed: 0,name,owner_user,description,language
0,beforerender,spejman,Add hook like afterfilter runs action rendered,Ruby
1,bliptv,spejman,Ruby library Bliptv API,Ruby
2,bn4r,spejman,Bayesian networks Ruby,Ruby
3,cachemoney,spejman,WriteThrough Cacheing Library ActiveRecord,Ruby
4,contacts,spejman,universal interface import email contacts prov...,Ruby


In [4]:
# Build the list of one-hot column names ('_' + language) for every language in the data.
# Missing languages (NaN) are skipped: '_' + NaN raises TypeError, and get_dummies
# (dummy_na=False by default) creates no column for them anyway.
languages = ['_' + language for language in df['language'].dropna().unique()]

# One-hot encode the language column. prefix='' combined with the default
# prefix_sep='_' produces column names such as '_Ruby', matching `languages`.
df = pd.get_dummies(df, columns=['language'], prefix='')

In [5]:
# Collapse the per-repository rows into one row per owner_user:
#   * repository names and descriptions are collected into lists
#   * each language flag becomes the max over the owner's repos,
#     i.e. it is set if at least one repo uses that language
aggregation_dict = {'name': list, 'description': list}
aggregation_dict.update(dict.fromkeys(languages, 'max'))

# One row per owner; reset_index turns owner_user back into a regular column
user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()

# Preview the aggregated frame
user_df.head()

Unnamed: 0,owner_user,name,description,_Ruby,_Elixir,_Go,_Shell,_Objective-C,_Dockerfile,_Python,...,_Logos,_JetBrains MPS,_LabVIEW,_Opa,_LOLCODE,_PigLatin,_M,_QML,_Macaulay2,_DTrace
0,0voice,[interviewinternalreference],[2023],False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,0x00b1,"[anelle, mira6vsb, naparifeatures, permission,...","[cycleaccurate Nintendo Game Boy emulator, mir...",False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,18F,"[10xduxapp, 10xduxvulseval, 10xMeL, 10xMLaaS, ...",[10x Dependency Upgrades eXample App backgroun...,False,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,22388o,[21lessonsbook],[21 Lessons book bitcoin journey written bitco...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,233boy,[v2ray],[V2Ray],False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# First we turn each list of names / descriptions into a single space-separated string
def _join_repo_text(values):
    """Join one owner's list of repo names/descriptions into a single string.

    Missing entries (NaN/None) are skipped so the literal word "nan" never ends up
    in the text that is embedded later. A value that is already a string is returned
    unchanged, which keeps this cell safe to re-run; anything else becomes ''.
    """
    if isinstance(values, str):
        return values
    if isinstance(values, list):
        return ' '.join(str(item) for item in values if pd.notna(item))
    return ''

for text_column in ('name', 'description'):
    user_df[text_column] = user_df[text_column].apply(_join_repo_text)
user_df.head()

Unnamed: 0,owner_user,name,description,_Ruby,_Elixir,_Go,_Shell,_Objective-C,_Dockerfile,_Python,...,_Logos,_JetBrains MPS,_LabVIEW,_Opa,_LOLCODE,_PigLatin,_M,_QML,_Macaulay2,_DTrace
0,0voice,interviewinternalreference,2023,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,0x00b1,anelle mira6vsb naparifeatures permission pyto...,cycleaccurate Nintendo Game Boy emulator mira6...,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,18F,10xduxapp 10xduxvulseval 10xMeL 10xMLaaS 10xst...,10x Dependency Upgrades eXample App background...,False,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,22388o,21lessonsbook,21 Lessons book bitcoin journey written bitcoiner,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,233boy,v2ray,V2Ray,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Embedding name and description

In [7]:
# NOTE(review): `load_word2vec_model` is bound here without being called. Later cells read
# `word_vect.vector_size`, `word_vect.key_to_index` and `word_vect[word]`, so it is presumably
# an already-loaded vectors object rather than a loader function - TODO confirm against
# codecompasslib.models.embeddings.
word_vect = load_word2vec_model

In [8]:
# Text preprocessing: make sure both text columns contain strings, then embed them
embedded_user_df = user_df.assign(
    name=user_df['name'].fillna(''),
    description=user_df['description'].fillna(''),
)

# One vector per user for the joined names and one for the joined descriptions
embedded_user_df = embedded_user_df.assign(
    name_vector=embedded_user_df['name'].apply(vectorize_text),
    description_vector=embedded_user_df['description'].apply(vectorize_text),
)
embedded_user_df

Unnamed: 0,owner_user,name,description,_Ruby,_Elixir,_Go,_Shell,_Objective-C,_Dockerfile,_Python,...,_LabVIEW,_Opa,_LOLCODE,_PigLatin,_M,_QML,_Macaulay2,_DTrace,name_vector,description_vector
0,0voice,interviewinternalreference,2023,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.9859402179718018, -0.6110885143280029, -0.5..."
1,0x00b1,anelle mira6vsb naparifeatures permission pyto...,cycleaccurate Nintendo Game Boy emulator mira6...,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,"[0.9788694083690643, 2.955499991774559, -0.065...","[-0.6191123082087591, 0.7245606172543305, -0.3..."
2,18F,10xduxapp 10xduxvulseval 10xMeL 10xMLaaS 10xst...,10x Dependency Upgrades eXample App background...,False,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.008359984445728753, 0.5594063227947214, -0..."
3,22388o,21lessonsbook,21 Lessons book bitcoin journey written bitcoiner,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.5443261727690696, -0.56742924451828, -0.929..."
4,233boy,v2ray,V2Ray,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3250,zwily,async awssdkruby bip bitbot BubbleWrap canvasl...,Async utilities node browser official AWS SDK ...,True,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,"[0.027874462015461177, 0.35765249270480126, 0....","[-0.24884995517010491, 1.0730904425028711, -0...."
3251,zxing,zxing,ZXing Zebra Crossing barcode scanning library ...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[-0.11646043509244919, -1.7790699005126953, 2....","[-0.2563125689824422, -1.1216022074222565, 0.2..."
3252,zyedidia,micro,modern intuitive terminalbased text editor,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.9230323433876038, 0.14018754661083221, 0.78...","[-0.7055508717894554, 0.4106738865375519, -0.1..."
3253,zygmuntz,adversarialvalidation AlpacaGPT classifiercali...,Creating better validation set test examples d...,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,"[-0.0071144066751003265, -0.021610467694699764...","[0.4278504633082029, 0.05336678724210614, -0.6..."


In [19]:
# Look up the index label of the row whose owner_user is AntiTyping
is_target_owner = embedded_user_df['owner_user'].eq('AntiTyping')
user_index = embedded_user_df.loc[is_target_owner].index[0]
user_index

21

In [10]:
# Build the feature matrix for KNN: one flat list per user made of the name
# embedding followed by the description embedding.
repo_df = embedded_user_df * 1  # booleans in the frame become 0 or 1
vector_columns = ['name_vector', 'description_vector']

vectors = []
for row_label in repo_df.index:
    features = []
    for column in vector_columns:
        cell = repo_df.at[row_label, column]
        if type(cell) is np.ndarray:
            # flatten the embedding into the feature list element by element
            features.extend(cell)
        else:
            # anything that is not an ndarray is stored as a single entry
            features.append(cell)
    vectors.append(features)

In [11]:
# Fit a nearest-neighbours index (Euclidean distance) over the per-user feature vectors
k = 5  # how many neighbours each query returns
nn_model = NearestNeighbors(metric='euclidean', n_neighbors=k)
nn_model.fit(vectors)

In [12]:
# Row position of the target user, taken from the lookup cell above instead of a
# hard-coded 21 so it cannot go stale when the data changes. (user_df was built with
# reset_index(), so index labels and positions in `vectors` coincide.)
target_user = user_index
# Neighbours excluding the target user. The target is filtered out by value rather than
# by dropping position 0: users with identical vectors (e.g. all-zero embeddings) tie at
# distance 0, so the target is not guaranteed to be returned first.
nearest = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0]
neighbors = nearest[nearest != target_user][:len(nearest) - 1]
neighbors

array([ 682,  534, 2465, 2212])

In [13]:
# Sanity check: show the target user's row followed by the rows of the similar users
neighborsAndTarget = [target_user, *neighbors]
user_df.iloc[neighborsAndTarget]

Unnamed: 0,owner_user,name,description,_Ruby,_Elixir,_Go,_Shell,_Objective-C,_Dockerfile,_Python,...,_Logos,_JetBrains MPS,_LabVIEW,_Opa,_LOLCODE,_PigLatin,_M,_QML,_Macaulay2,_DTrace
21,AntiTyping,emacs24d activeadmin Adv360ProZMK AlgorithmsNY...,fromscratch rebuild Emacs configuration admini...,True,False,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
682,brynary,activeadmin activemerchant actsastaggableon ar...,administration framework Ruby Rails applicatio...,True,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
534,awt,activeshipping actsastaggableon bips bitaddres...,Shipping API extension Active Merchant tagging...,True,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2465,raybaxter,activeadmin annotatemodels attrencrypted backb...,administration framework Ruby Rails applicatio...,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2212,noahd1,alsovalidates authlogic bootstrapsass brakeman...,Validate associated models aggregate errors pr...,True,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False


# Find a new repo to recommend that has low similarity to the target user's own repos

### All the DataFrames and objects we have made so far
- `df`: a dataframe with all the repositories and their languages
- `user_df`: a dataframe with the owner_user as the unique identifier and the name, description, and languages of the user's repositories
- `embedded_user_df`: a dataframe with the owner_user as the unique identifier and the name, description, and languages of the user's repositories, as well as the vectorized name and description
- `repo_df`: a copy of `embedded_user_df` multiplied by 1 (boolean language flags become 0/1); its `name_vector` and `description_vector` columns are what `vectors` is built from
- `vectors`: a list of all the vectors in repo_df
- `nn_model`: a nearest neighbors model trained on the vectors in repo_df
- `target_user`: the index of the target user in user_df
- `neighbors`: the indices of the users in user_df that are similar to the target user
- `neighborsAndTarget`: the indices of the users in user_df that are similar to the target user and the target user itself


In [14]:
# Using the positions returned by the KNN model, pick the matching users and gather
# every repository they own (target user first, then each neighbour) into one frame.
neighborsAndTargetRepos = user_df.iloc[neighborsAndTarget]

# One slice of df per selected owner, in the same order as the selected users
dfs = [
    df[df['owner_user'] == owner]
    for owner in neighborsAndTargetRepos['owner_user']
]

# ignore_index=False keeps df's original index labels on the combined rows
neighborsRepos = pd.concat(dfs, ignore_index=False)


In [15]:
# Show the combined repositories of the target user and the selected neighbours
neighborsRepos

Unnamed: 0,name,owner_user,description,_AGS Script,_AMPL,_ANTLR,_API Blueprint,_ASP,_ATS,_ActionScript,...,_Vue,_WebAssembly,_XML,_XQuery,_XSLT,_YAML,_Yacc,_ZIL,_Zig,_mcfunction
2954,emacs24d,AntiTyping,fromscratch rebuild Emacs configuration,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2955,activeadmin,AntiTyping,administration framework Ruby Rails applications,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2956,Adv360ProZMK,AntiTyping,Production repository allnew Advantage360 Prof...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2957,AlgorithmsNYC,AntiTyping,New York Algorithms Data Structures Meetup rep...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2958,alphalens,AntiTyping,Performance analysis predictive alpha stock fa...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37047,markdownblog,noahd1,ruby rails blog engine,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
37048,memcacheclient,noahd1,Ruby library accessing memcached,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
37049,nodefileparser,noahd1,simple powerful module parse file,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
37050,nodeandruby,noahd1,Docker image aimed specific legacy versions Ru...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
# vectorise the name and description of the repos in neighborsRepos
def vectorize_text(text, word_vectors=None):
    """Embed a piece of text as the average of its in-vocabulary word vectors.

    Parameters
    ----------
    text : str
        Text to embed; it is split on whitespace.
    word_vectors : optional
        Object exposing ``vector_size``, ``key_to_index`` and ``__getitem__``.
        Defaults to the notebook-level ``word_vect`` so existing one-argument
        calls such as ``Series.apply(vectorize_text)`` behave as before.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the words found in the vocabulary, or an all-zero
        vector of length ``vector_size`` when no word is found.
    """
    if word_vectors is None:
        word_vectors = word_vect  # resolved at call time from the notebook namespace
    vector_sum = np.zeros(word_vectors.vector_size)  # running sum of matched word vectors
    count = 0  # number of words found in the vocabulary
    for word in text.split():
        if word in word_vectors.key_to_index:
            vector_sum += word_vectors[word]
            count += 1
    # With no matches vector_sum is still all zeros, so it is returned unchanged
    return vector_sum / count if count else vector_sum
    
# Replace the raw text columns of every repository with their embeddings
neighborsRepos = (
    neighborsRepos
    # missing text becomes an empty string so it can be split into words
    .assign(
        name=lambda frame: frame['name'].fillna(''),
        description=lambda frame: frame['description'].fillna(''),
    )
    .assign(
        name_vector=lambda frame: frame['name'].apply(vectorize_text),
        description_vector=lambda frame: frame['description'].apply(vectorize_text),
    )
    # the text itself is no longer needed once it has been embedded
    .drop(columns=['name', 'description'])
)
neighborsRepos


Unnamed: 0,owner_user,_AGS Script,_AMPL,_ANTLR,_API Blueprint,_ASP,_ATS,_ActionScript,_Ada,_Agda,...,_XML,_XQuery,_XSLT,_YAML,_Yacc,_ZIL,_Zig,_mcfunction,name_vector,description_vector
2954,AntiTyping,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.6085255742073059, 3.800405263900757, -1.04..."
2955,AntiTyping,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[1.2716891765594482, 1.590262532234192, 0.0429...","[0.9091504812240601, 1.573918918768565, -0.164..."
2956,AntiTyping,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.10801148414611816, 1.3554449677467346, -0...."
2957,AntiTyping,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.004930019378662, 1.764566421508789, -0.408..."
2958,AntiTyping,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02261081784963608, -0.718126630783081, -1.2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37047,noahd1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.059797465801239, 2.090854585170746, -0.1869..."
37048,noahd1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[-0.010539587587118149, -0.006124962121248245,...","[-0.1877035895983378, 1.85484712322553, -0.033..."
37049,noahd1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.06036600088700652, 1.7852990746498107, -0.6..."
37050,noahd1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.6112046152353287, 1.8550066351890564, -0.1..."


In [17]:
# Rebinds target_user: the KNN cells above use it as an integer row position,
# from here on it holds the owner_user name that neighborsRepos is filtered by.
target_user = "AntiTyping"

def calculate_cosine_dissimilarity(vec1, vec2):
    """Return the cosine distance between two vectors.

    If either vector consists only of zeros the cosine distance is undefined,
    so a fixed value of 1.0 is returned instead.
    """
    has_zero_vector = any(np.all(vec == 0) for vec in (vec1, vec2))
    if has_zero_vector:
        return 1.0
    return cosine(vec1, vec2)

# Split the DataFrame into the target user's repos and everyone else's
target_repos = neighborsRepos[neighborsRepos['owner_user'] == target_user]
other_repos = neighborsRepos[neighborsRepos['owner_user'] != target_user]

# The target user's vectors are the same for every candidate, so convert them once
# here instead of rebuilding them inside the loop below.
target_vectors = [
    (np.array(name_vec), np.array(desc_vec))
    for name_vec, desc_vec in zip(target_repos['name_vector'], target_repos['description_vector'])
]

max_dissimilarity_score = 0
most_dissimilar_repo_info = None

# Iterate over each non-target repository (index label, owner, name vector, description vector)
candidates = zip(
    other_repos.index,
    other_repos['owner_user'],
    other_repos['name_vector'],
    other_repos['description_vector'],
)
for index, owner, name_vec, desc_vec in candidates:
    other_name_vec = np.array(name_vec)
    other_desc_vec = np.array(desc_vec)

    # Compare the candidate with every repository of the target user
    for target_name_vec, target_desc_vec in target_vectors:
        name_dissimilarity = calculate_cosine_dissimilarity(other_name_vec, target_name_vec)
        desc_dissimilarity = calculate_cosine_dissimilarity(other_desc_vec, target_desc_vec)

        # Average or perhaps max here, depending on the exact goal
        average_dissimilarity = (name_dissimilarity + desc_dissimilarity) / 2

        # Strict '>' keeps the first pair seen when several pairs tie on the score
        if average_dissimilarity > max_dissimilarity_score:
            max_dissimilarity_score = average_dissimilarity
            most_dissimilar_repo_info = (index, owner, max_dissimilarity_score)

# Print the most dissimilar repository information (nothing is printed when no candidate scored above 0)
if most_dissimilar_repo_info:
    print(f"Most dissimilar repository index, owner, and dissimilarity score to '{target_user}':")
    print(most_dissimilar_repo_info)


Most dissimilar repository index, owner, and dissimilarity score to 'AntiTyping':
(17447, 'raybaxter', 1.1270448151921002)


In [18]:
# most_dissimilar_repo_info is (index label, owner_user, dissimilarity score). The label
# comes from neighborsRepos, which kept df's own index (concat with ignore_index=False),
# so the row is fetched by label with .loc rather than by position with .iloc.
most_dissimilar_repo = df.loc[most_dissimilar_repo_info[0]]

# Drop the entries equal to 0/False (language columns the repo does not use) and any missing values
most_dissimilar_repo = most_dissimilar_repo[most_dissimilar_repo != 0].dropna()

# Display as a single-row DataFrame
most_dissimilar_repo = most_dissimilar_repo.to_frame().T
most_dissimilar_repo

Unnamed: 0,name,owner_user,description,_CSS
17447,gmapsradius,raybaxter,Google Map allowing radius added,True
