### Model for recommending GitHub repositories that differentiate a user's experience (experimentation phase)

In [28]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from sklearn.neighbors import NearestNeighbors

In [29]:
# Load the cleaned repository table, reading only the columns needed to
# analyze a user's repositories
df = pd.read_csv(
    '../Data/clean/allReposCleaned.csv',
    usecols=['owner_user', 'name', 'description', 'language'],
)
df.head()

Unnamed: 0,name,owner_user,description,language
0,beforerender,spejman,Add hook like afterfilter runs action rendered,Ruby
1,bliptv,spejman,Ruby library Bliptv API,Ruby
2,bn4r,spejman,Bayesian networks Ruby,Ruby
3,cachemoney,spejman,WriteThrough Cacheing Library ActiveRecord,Ruby
4,contacts,spejman,universal interface import email contacts prov...,Ruby


In [30]:
# Build the list of one-hot column names: every unique language with a '_' prefix.
# Missing languages (NaN) are skipped: '_' + NaN would raise a TypeError, and
# get_dummies creates no column for NaN by default, so such a name would not
# exist in the encoded frame anyway.
languages = ['_' + language for language in df['language'].dropna().unique()]

# One-hot encode the languages. An empty prefix combined with the default '_'
# separator yields exactly the '_<language>' column names collected above.
df = pd.get_dummies(df, columns=['language'], prefix='')

In [31]:
# Collapse the per-repository rows into one row per 'owner_user':
#   * 'name' and 'description' are gathered into lists (one entry per repository)
#   * every one-hot language column keeps its maximum, i.e. it is set when at
#     least one of the owner's repositories uses that language
aggregation_dict = {
    'name': lambda values: list(values),
    'description': lambda values: list(values),
    **{language_column: 'max' for language_column in languages},
}

# Group by owner, aggregate, and turn the group key back into a regular column
user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()

# Preview the aggregated frame
user_df.head()

Unnamed: 0,owner_user,name,description,_Ruby,_Elixir,_Go,_Shell,_Objective-C,_Dockerfile,_Python,...,_Logos,_JetBrains MPS,_LabVIEW,_Opa,_LOLCODE,_PigLatin,_M,_QML,_Macaulay2,_DTrace
0,0voice,[interviewinternalreference],[2023],0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0x00b1,"[anelle, mira6vsb, naparifeatures, permission,...","[cycleaccurate Nintendo Game Boy emulator, mir...",0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,18F,"[10xduxapp, 10xduxvulseval, 10xMeL, 10xMLaaS, ...",[10x Dependency Upgrades eXample App backgroun...,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,22388o,[21lessonsbook],[21 Lessons book bitcoin journey written bitco...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,233boy,[v2ray],[V2Ray],0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# first we turn list of names and descriptions into a single string
def _join_text_items(items):
    """Join a list of text items into one space-separated string.

    Missing entries (None / NaN) are skipped so that they do not end up in the
    text as the literal token 'nan'. A value that is already a string is returned
    unchanged, which keeps this cell safe to re-run. Anything else becomes ''.
    """
    if isinstance(items, str):
        return items
    if isinstance(items, list):
        return ' '.join(str(item) for item in items if pd.notna(item))
    return ''


user_df['name'] = user_df['name'].apply(_join_text_items)
user_df['description'] = user_df['description'].apply(_join_text_items)
user_df.head()

Unnamed: 0,owner_user,name,description,_Ruby,_Elixir,_Go,_Shell,_Objective-C,_Dockerfile,_Python,...,_Logos,_JetBrains MPS,_LabVIEW,_Opa,_LOLCODE,_PigLatin,_M,_QML,_Macaulay2,_DTrace
0,0voice,interviewinternalreference,2023,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0x00b1,anelle mira6vsb naparifeatures permission pyto...,cycleaccurate Nintendo Game Boy emulator mira6...,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,18F,10xduxapp 10xduxvulseval 10xMeL 10xMLaaS 10xst...,10x Dependency Upgrades eXample App background...,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,22388o,21lessonsbook,21 Lessons book bitcoin journey written bitcoiner,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,233boy,v2ray,V2Ray,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Embedding name and description

In [33]:
# Load pre-trained Word2Vec model. Word Embeddings for the Software Engineering Domain, pre-trained on 15GB of Stack Overflow posts
# Citation: Efstathiou Vasiliki, Chatzilenas Christos, & Spinellis Diomidis. (2018). Word Embeddings for the Software Engineering Domain [Data set]. Zenodo. https://doi.org/10.5281/zenodo.1199620
# The file is read in the binary word2vec format (binary=True).
# NOTE(review): the path below is absolute and specific to one machine, so this cell
# will fail elsewhere — consider a path relative to the repository or a configurable
# model directory (the correct relative location cannot be determined from this notebook).
word_vect = KeyedVectors.load_word2vec_format("/Users/mirandadrummond/VSCode/Github-Recommendation-System/monkelib/PretrainedModels/SO_vectors_200.bin", binary=True)

In [34]:
# Text preprocessing: work on a separate frame so user_df keeps its readable
# text columns, and replace any missing name/description with an empty string
embedded_user_df = user_df.assign(
    name=user_df['name'].fillna(''),
    description=user_df['description'].fillna(''),
)

# Vectorizing name and description
def vectorize_text(text, model=None):
    """Embed a piece of text as the average of its in-vocabulary word vectors.

    The text is split on whitespace and each token is looked up as-is (no
    normalization is applied here); tokens missing from the vocabulary are skipped.

    Parameters
    ----------
    text : str
        Whitespace-separated text. Anything that is not a string (for example
        ``None`` or a NaN left over from missing data) is treated as empty text.
    model : optional
        Object exposing ``vector_size``, ``key_to_index`` and ``model[word]``
        lookup. Defaults to the notebook-level ``word_vect``.

    Returns
    -------
    numpy.ndarray
        Array of length ``model.vector_size`` holding the mean of the vectors of
        the tokens found in the vocabulary, or all zeros when none are found.
    """
    if model is None:
        model = word_vect  # looked up at call time so the notebook global is used

    vector_sum = np.zeros(model.vector_size)  # running sum of the matched word vectors
    if not isinstance(text, str):
        return vector_sum  # nothing to embed

    count = 0  # number of tokens found in the vocabulary
    for word in text.split():
        if word in model.key_to_index:
            vector_sum += model[word]
            count += 1

    if count:
        vector_sum /= count  # turn the sum into an average
    return vector_sum

# Embed both text columns, storing each result in its own '<column>_vector' column
vector_columns = {'name': 'name_vector', 'description': 'description_vector'}
for text_column, vector_column in vector_columns.items():
    embedded_user_df[vector_column] = embedded_user_df[text_column].apply(vectorize_text)

# The raw text and the owner identifier are not features, so remove them
embedded_user_df.drop(columns=['name', 'description', 'owner_user'], inplace=True)

In [35]:
# Build the feature matrix for the nearest-neighbour model: one flat list per
# user, made of the name embedding followed by the description embedding.
# NOTE(review): only the two embedding columns are read here, so the one-hot
# language columns never reach the feature vector — confirm this is intended.
repo_df = embedded_user_df * 1  # convert all boolean values in repo_df to 0 or 1

vectors = []
for row in repo_df.index:
    vector = []
    for column in ('name_vector', 'description_vector'):
        cell = repo_df.at[row, column]
        if type(cell) == np.ndarray:
            vector.extend(cell)  # flatten the embedding element by element
        else:
            vector.append(cell)  # scalar cell: keep it as a single feature
    vectors.append(vector)

In [36]:
# Train Nearest Neighbors Model
# The model is fitted on the per-user feature vectors and compares users by
# Euclidean distance.
# NOTE(review): when the query point is itself one of the fitted rows it normally
# appears among the k hits, leaving k - 1 other users — confirm 5 is the intended value.
k = 5  # Number of neighbors to find
nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')
nn_model.fit(vectors)

In [37]:
target_user = 21
# Query the nearest neighbours of the target user's feature vector.
candidate_idx = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0]
# neighbors excluding the target user. The target is removed by index rather than
# by assuming it is the first hit: users with identical feature vectors (e.g. all-zero
# embeddings) tie at distance 0 and can be returned ahead of the target itself.
# The slice keeps at most n_neighbors - 1 results, as before.
neighbors = candidate_idx[candidate_idx != target_user][:nn_model.n_neighbors - 1]
neighbors

array([ 682,  534, 2465, 2212])

In [38]:
# Sanity check: show the target user's row first, followed by the rows of the
# users found to be most similar
neighborsAndTarget = [target_user, *neighbors]
user_df.iloc[neighborsAndTarget]

Unnamed: 0,owner_user,name,description,_Ruby,_Elixir,_Go,_Shell,_Objective-C,_Dockerfile,_Python,...,_Logos,_JetBrains MPS,_LabVIEW,_Opa,_LOLCODE,_PigLatin,_M,_QML,_Macaulay2,_DTrace
21,AntiTyping,emacs24d activeadmin Adv360ProZMK AlgorithmsNY...,fromscratch rebuild Emacs configuration admini...,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
682,brynary,activeadmin activemerchant actsastaggableon ar...,administration framework Ruby Rails applicatio...,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
534,awt,activeshipping actsastaggableon bips bitaddres...,Shipping API extension Active Merchant tagging...,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2465,raybaxter,activeadmin annotatemodels attrencrypted backb...,administration framework Ruby Rails applicatio...,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2212,noahd1,alsovalidates authlogic bootstrapsass brakeman...,Validate associated models aggregate errors pr...,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Inspect the target user's raw feature vector (name embedding values first,
# then the description embedding values).
# NOTE(review): this prints the whole vector; a slice would keep the output short.
vectors[target_user]

[0.36239884498839575,
 1.7800600888828437,
 0.11040757906933625,
 -0.18277396354824305,
 -0.38476044913598645,
 0.05976299180959662,
 0.3841868806630373,
 -1.3769227220133569,
 -0.7326740771532059,
 1.65326497827967,
 -0.6250529189904531,
 0.16083154920488596,
 0.4630897023404638,
 -0.11271768560012181,
 -0.09815884366010626,
 0.05095272045582533,
 0.28760645538568497,
 1.266640555113554,
 -0.6410647180552284,
 0.23682562541216612,
 0.528375040118893,
 0.6166720371693373,
 0.4663150170041869,
 -1.3802896259973447,
 0.45014819254477817,
 2.135907023989906,
 0.6111818455780546,
 0.2663284770678729,
 0.05846409127116203,
 1.2113076249758403,
 -0.4399104528129101,
 0.5933582515766224,
 -1.6607853521903355,
 0.03151530275742213,
 -0.7997798432285587,
 0.4605408962816,
 -0.46937740097443265,
 0.5349265206605196,
 0.15905654958138862,
 -1.1740470211952925,
 0.2831584687034289,
 0.2716315630823374,
 0.5975454918419322,
 0.6873048953711987,
 -0.5413679064561924,
 0.947327700133125,
 -0.24364349

# Find new repositories to recommend that have low similarity to the target user's own repositories