In [2]:
import os
import sys

import pandas as pd
from typing import Tuple
from pandas import DataFrame
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

sys.path.append('../../../')
from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive
from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text

### Word2Vec
Word2Vec is a method that converts words into numerical vectors, capturing information about their meaning based on the context in which they appear.

Here’s how it works:

Initialize a vector for each word randomly.
For each word in the corpus:
Predict the context words (words nearby) given the target word (skip-gram).
Adjust the word vectors to minimize the prediction error.
The learned vectors represent the words’ meanings. These vectors can be used for tasks like document similarity, text classification, and information retrieval.

In our project we use pre-trained word2vec model, specifically trained on software engineering domain.

### K-Nearest Neighbors (KNN)
KNN is a supervised learning algorithm primarily used for classification based on the similarity of data points. It assumes that similar things tend to be close to each other in the feature space.

Distance Metric: To measure similarity, we compute the distance between data points. Commonly used metrics include Euclidean distance, Manhattan distance, or cosine similarity.
Prediction: Given a new data point, find its K nearest neighbors based on the chosen distance metric.
In our project we use KNN to find users most similar to our target user.

In [23]:
full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'

df_non_embedded = download_csv_as_pd_dataframe(creds=get_creds_drive(), file_id=full_data_folder_id)


Download 11%.

Download 23%.

Download 35%.

Download 47%.

Download 59%.

Download 71%.

Download 83%.

Download 95%.

Download 100%.


  return read_csv(fh)


In [51]:
usecols=['owner_user', 'name', 'description', 'language']
# drop every column except for these
df = df_non_embedded.copy()
df = df[usecols]

In [52]:
df.dropna()

Unnamed: 0,owner_user,name,description,language
4,Rameshwar0852,Automation_Project,Automated Bash Script to automate log Backup g...,Shell
18,Rameshwar0852,IKON,CBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION...,Python
19,Rameshwar0852,javamavensonarrgohelmk8,No description,HTML
23,Rameshwar0852,node001,files_repo,JavaScript
24,Rameshwar0852,nodeandjs,node java script application,JavaScript
...,...,...,...,...
2583820,pinax,pinax-blog,a blog app for Django,Python
2583821,montylounge,django-mingus,a Django blog engine leveraging reusable apps ...,JavaScript
2583822,WuXianglong,GeekBlog,A full blog system based on Django,JavaScript
2583823,NARKOZ,hacker-scripts,Based on a true story,JavaScript


In [53]:
# convert language column to string type
df['language'] = df['language'].astype(str)

In [54]:
# Create list of unique languages with _ prefix
languages = ['_' + language for language in df['language'].unique()]

# one hot encode the languages and don't include the language prefix
df = pd.get_dummies(df, columns=['language'], prefix='')

# Turn df into a repo specific df with owner_user as a unique identifier, appending description and keeping 1 if any of the languages are present in at least one repo

# Create a dictionary for aggregation
aggregation_dict = {
    'name': lambda x: list(x),
    'description': lambda x: list(x)
}

# Add columns for languages
for lang in languages:
    aggregation_dict[lang] = 'max'

# Group by 'owner_user' and aggregate
user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()

# Display the first few rows of the resulting DataFrame
user_df.head()

# first we turn list of names and descriptions into a single string
user_df['name'] = user_df['name'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')
user_df['description'] = user_df['description'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')
user_df.head()

word_vect = load_word2vec_model

# Text preprocessing
embedded_user_df = user_df.copy()
embedded_user_df['name'] = user_df['name'].fillna('')  
embedded_user_df['description'] = user_df['description'].fillna('')


word2vec_model = load_word2vec_model()
embedded_user_df['name_vector'] = embedded_user_df['name'].apply(lambda x: vectorize_text(x, word2vec_model))
embedded_user_df['description_vector'] = embedded_user_df['description'].apply(lambda x: vectorize_text(x, word2vec_model))
embedded_user_df
# embedded_user_df.drop(['name', 'description', 'owner_user'], axis=1, inplace=True)

File not found.


AttributeError: 'NoneType' object has no attribute 'vector_size'

In [None]:
# Transform df into something that KNN can use. To be more specific, into a feature matrix
# Create a list of all the vectors
vectors = []
repo_df = embedded_user_df * 1 # convert all boolean values in repo_df to 0 or 1

for row in repo_df.index: 
    vector = []
    for columns in ['name_vector', 'description_vector']:
        if type(repo_df.at[row, columns]) == np.ndarray:
            for element in repo_df.at[row, columns]:
                vector.append(element)
        else: vector.append(repo_df.at[row, columns])
    vectors.append(vector)

    # Train Nearest Neighbors Model
k = 5  # Number of neighbors to find
nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')
nn_model.fit(vectors)

In [None]:
# Example Usage

target_user = 21
# neighbors excluding the target user
neighbors = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0][1:]
neighbors