In [4]:
import os
import sys

import pandas as pd
from typing import Tuple
from pandas import DataFrame
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

sys.path.append('../../../')
from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive
from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text

In [5]:
def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:
    """
    Load the non-embedded and embedded datasets from Google Drive.

    Despite the ``*_folder_id`` parameter names, each value is passed to
    ``download_csv_as_pd_dataframe`` as its ``file_id`` argument.

    :param full_data_folder_id: Drive ID used to download the non-embedded dataset
    :param full_data_embedded_folder_id: Drive ID used to download the embedded dataset
    :return: The non-embedded and embedded datasets, in that order
    """
    # One credentials object is obtained and reused for both downloads.
    creds = get_creds_drive()
    df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
    df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)

    # Having data locally works much faster than retrieving from drive. Uncomment the following lines to use local data
    # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')
    # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')

    print("Data loaded")
    return df_non_embedded, df_embedded

In [6]:
# Google Drive IDs of the two datasets; load_data forwards each one as the
# `file_id` of a download call.
full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'
full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'

# Download both tables: the raw one first, the embedded one second.
df_non_embedded, df_embedded = load_data(
    full_data_folder_id=full_data_folder_id,
    full_data_embedded_folder_id=full_data_embedded_folder_id,
)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'

In [None]:
def load_and_clean_data(df_non_embedded):
    """
    Clean the raw (non-embedded) repository DataFrame.

    The input frame is never modified: every step produces a new frame, so the
    function can be called repeatedly on the same input (for example when a
    notebook cell is re-run) and always returns the same result.

    Steps, in order:
        1. Drop every row that has a missing value in any column.
        2. Drop the flag/metadata columns that are not needed.
        3. Replace missing values in the text columns with ''.
        4. Keep only the first row for each value of 'name'.

    Args:
        df_non_embedded (pandas.DataFrame): The raw dataset. It must contain
            the columns listed in ``columns_to_drop`` as well as
            'description', 'name' and 'language'.

    Returns:
        pandas.DataFrame: A new, cleaned DataFrame. Surviving rows keep their
        original index labels.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    # Delete columns that are not needed
    columns_to_drop = [
        'is_archived', 'is_disabled', 'is_template', 'has_projects',
        'owner_type', 'has_pages', 'has_wiki',
        'has_issues', 'has_downloads', 'is_fork'
    ]
    text_columns = ['description', 'name', 'language']

    # Rows are filtered on ALL columns, including the ones removed right after,
    # so a missing value in a soon-to-be-dropped column still discards the row.
    cleaned = df_non_embedded.dropna().drop(columns=columns_to_drop)

    # No missing values remain after dropna(); this is kept as a safeguard and
    # to require that the text columns exist. assign() returns a new frame, so
    # there is no chained in-place assignment.
    cleaned = cleaned.assign(**{column: cleaned[column].fillna('') for column in text_columns})

    # Drop duplicates with name
    return cleaned.drop_duplicates(subset='name', keep='first')

In [None]:
# Clean the raw repository table.
df = load_and_clean_data(df_non_embedded)

# pd.get_dummies(..., prefix='') with the default '_' separator names every
# indicator column '_<language>', so the expected column names are built the
# same way here (before the 'language' column is replaced by its indicators).
languages = ['_' + language for language in df['language'].unique()]

# One-hot encode the languages.
df = pd.get_dummies(df, columns=['language'], prefix='')

# Collapse to one row per 'owner_user': collect the names and descriptions of
# the owner's rows into lists, and keep a language flag set if it is set on at
# least one of those rows ('max' over the indicator column).
aggregation_dict = {
    'name': list,
    'description': list,
    **{language_column: 'max' for language_column in languages},
}
user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()

# Flatten each list of names / descriptions into one space-separated string.
for text_column in ('name', 'description'):
    user_df[text_column] = user_df[text_column].apply(
        lambda items: ' '.join(str(item) for item in items)
    )

# NOTE(review): this binds the loader function itself - it is not called here,
# and `word_vect` is not used by any code visible in this notebook. If
# `vectorize_text` needs a loaded model, call `load_word2vec_model()` and pass
# the result in - confirm against the helper signatures in
# codecompasslib.models.model_diff_repos.
word_vect = load_word2vec_model

# Add one embedding column per text column. The text columns are plain strings
# at this point (built by the join above), so no missing-value handling is needed.
embedded_user_df = user_df.copy()
embedded_user_df['name_vector'] = embedded_user_df['name'].apply(vectorize_text)
embedded_user_df['description_vector'] = embedded_user_df['description'].apply(vectorize_text)

# Preview only the first rows instead of rendering the whole frame.
embedded_user_df.head()

In [None]:
# Transform the embedded frame into a feature matrix that the nearest-neighbour
# model can be fitted on.
repo_df = embedded_user_df * 1  # convert all boolean values in repo_df to 0 or 1

# NOTE(review): only these two embedding columns feed the model; the 0/1
# language columns of repo_df are not part of the feature vectors - confirm
# that this is intended.
vector_columns = ['name_vector', 'description_vector']

# One flat feature vector per row: the name embedding followed by the
# description embedding. np.ravel accepts ndarrays (and subclasses), lists and
# scalars alike, so every cell contributes a flat 1-D piece instead of being
# appended as a single nested element when it is not exactly an ndarray.
vectors = [
    np.concatenate([np.ravel(repo_df.at[row, column]) for column in vector_columns])
    for row in repo_df.index
]

# Train Nearest Neighbors Model
k = 5  # Number of neighbors to find
nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')
nn_model.fit(vectors)

In [None]:
# Example Usage

# Position of the target row in `vectors` (a row position, not a user identifier).
target_user = 21

# Query the fitted model with the target's own feature vector.
candidates = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0]

# neighbors excluding the target user
# The target is removed by index rather than by assuming it comes back first:
# rows with an identical feature vector tie with it at distance 0 and may be
# returned ahead of it. Truncating keeps the result one shorter than the
# number of candidates returned, whether or not the target was among them.
neighbors = candidates[candidates != target_user][:len(candidates) - 1]
neighbors