In [2]:
import os
import sys

import pandas as pd
from typing import Tuple
from pandas import DataFrame
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sys.path.append('../../../')
from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive

### Cosine Similarity Model

This model utilizes the cosine similarity between the query and the documents to rank the documents. The cosine similarity is calculated as follows:

- Using TF-IDF, each repository's name, description, and language are tokenized and vectorized.
- The cosine similarity between the vectorized query and every repository vector is calculated.
- The repos are ranked by that cosine similarity.

In [None]:
def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:
    """
    Load the non-embedded and embedded datasets from Google Drive.

    :param full_data_folder_id: Drive identifier of the non-embedded CSV. Despite
        the name, it is forwarded as ``file_id`` to ``download_csv_as_pd_dataframe``.
    :param full_data_embedded_folder_id: Drive identifier of the embedded CSV,
        forwarded the same way.
    :return: The non-embedded and embedded datasets, in that order
    """
    # One set of credentials is reused for both downloads.
    creds = get_creds_drive()
    df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
    df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)

    # Having data locally works much faster than retrieving from drive. Uncomment the following lines to use local data
    # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')
    # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')

    print("Data loaded")
    return df_non_embedded, df_embedded

In [None]:
# Google Drive identifiers of the two CSV exports; load_data forwards each one
# as `file_id` to the download helper.
full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'
full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'

# Download both datasets (non-embedded first, embedded second).
df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)

In [None]:
def load_and_clean_data(df_non_embedded):
    """
    Clean the raw (non-embedded) repository DataFrame.

    Rows containing any missing value are removed, the repository flag columns
    listed below are dropped, and repositories are de-duplicated by ``name``
    (the first occurrence is kept).

    The input frame is not modified: every step produces a new object, so the
    notebook cell that calls this function can be re-run without failing on
    columns that were already dropped.

    Args:
        df_non_embedded (pandas.DataFrame): The raw repository data. It must
            contain the columns listed in ``columns_to_drop`` as well as
            ``name``, ``description`` and ``language``.

    Returns:
        pandas.DataFrame: A new, cleaned DataFrame. The original row index is
        kept (it is not reset).

    Raises:
        KeyError: If a column to drop or one of the text columns is missing.
    """
    # Columns that are not needed
    columns_to_drop = [
        'is_archived', 'is_disabled', 'is_template', 'has_projects',
        'owner_type', 'has_pages', 'has_wiki',
        'has_issues', 'has_downloads', 'is_fork'
    ]
    text_columns = ['description', 'name', 'language']

    # Drop incomplete rows first (missing values in ANY column, including the
    # ones removed right after), then drop the unneeded columns. Both calls
    # return new frames, leaving the caller's data intact.
    df = df_non_embedded.dropna().drop(columns=columns_to_drop)

    # Defensive fill for the text columns. After the dropna() above there is
    # nothing left to fill, but assigning the result back (instead of calling
    # fillna(inplace=True) on a column selection) keeps this correct if the
    # row filter is ever relaxed.
    df[text_columns] = df[text_columns].fillna('')

    # Drop duplicates with name
    df = df.drop_duplicates(subset='name', keep='first')

    return df

In [None]:
def calculate_cosine_similarity_scores(df):
    """
    Calculate cosine similarity scores for the dataset.

    NOTE: a later cell defines a function with this same name; when the
    notebook is run top to bottom, that later definition replaces this one.

    Args:
        df (pandas.DataFrame): The DataFrame containing repository data. It
            must provide the ``name``, ``description`` and ``language`` columns.
            A ``cosine_similarity_score`` column is added to it in place.

    Returns:
        tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.
    """
    # Concatenating the text columns for vectorization
    text_data = df['name'] + " " + df['description'] + " " + df['language']

    # Vectorizing the text data using TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

    # Calculating cosine similarity between every pair of repositories
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Average the cosine similarities for each repo
    similarity_scores = np.mean(cosine_sim, axis=1)

    # Adding the new column to the dataset
    df['cosine_similarity_score'] = similarity_scores

    # Return what the docstring promises; without this the function yielded
    # None and callers unpacking `df, vectorizer` failed.
    return df, tfidf_vectorizer

In [None]:
def calculate_cosine_similarity_scores(df):
    """
    Score each repository by its mean TF-IDF cosine similarity to all repositories.

    A TF-IDF vectorizer is fitted on one text document per row (name,
    description and language joined by spaces). The pairwise cosine similarity
    matrix of those documents is averaged row by row, and the result is stored
    in a new ``cosine_similarity_score`` column of ``df`` (written in place).

    Args:
        df (pandas.DataFrame): Repository data with ``name``, ``description``
            and ``language`` columns.

    Returns:
        tuple: ``(df, vectorizer)`` - the same DataFrame with the score column
        added, and the fitted TF-IDF vectorizer.
    """
    # Build the per-repository document in two steps: "<name> <description>",
    # then append " <language>".
    name_and_description = df['name'] + " " + df['description']
    documents = name_and_description + " " + df['language']

    # Fit TF-IDF on the documents and compare every document with every other.
    # NOTE: the result is a full pairwise matrix, one row and one column per
    # repository.
    vectorizer = TfidfVectorizer()
    pairwise_similarity = cosine_similarity(vectorizer.fit_transform(documents))

    # Row-wise mean: how similar a repository is, on average, to the corpus.
    df['cosine_similarity_score'] = pairwise_similarity.mean(axis=1)

    return df, vectorizer

In [None]:
def recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10):
    """
    Recommend repositories based on user preferences.

    The user preference and every repository text (name, description and
    language joined by spaces) are transformed with ``tfidf_vectorizer``; the
    repositories are then ranked by cosine similarity to the preference.

    Args:
        user_preference (str): The user's preferred keywords or phrases.
        df (pandas.DataFrame): The DataFrame containing repository data. It
            must provide the ``name``, ``description``, ``language`` and
            ``cosine_similarity_score`` columns.
        tfidf_vectorizer (TfidfVectorizer): The already fitted TF-IDF vectorizer used for transforming text data.
        top_n (int, optional): Number of top recommendations to return. Defaults to 10.
            Values of 0 or below yield an empty result.

    Returns:
        pandas.DataFrame: DataFrame containing at most top_n recommended
        repositories, best match first, with a fresh 0..k-1 index. The
        ``cosine_similarity_score`` column is the value already stored in
        ``df``; the similarity to ``user_preference`` computed here is used for
        ranking only and is not part of the output.
    """
    result_columns = ['name', 'description', 'language', 'cosine_similarity_score']

    # `scores[-0:]` would select EVERY row, and a negative top_n an arbitrary
    # slice, so a non-positive request means "no recommendations". An empty
    # frame is answered directly as there is nothing to rank.
    if top_n <= 0 or len(df) == 0:
        return df.iloc[:0].reset_index(drop=True)[result_columns]

    # Vectorize the user preference
    user_pref_vector = tfidf_vectorizer.transform([user_preference])

    # Vectorize the repositories using the name/description/language
    # concatenation that calculate_cosine_similarity_scores also builds
    repo_text = df['name'] + " " + df['description'] + " " + df['language']
    repo_vectors = tfidf_vectorizer.transform(repo_text)

    # Calculate cosine similarity with all repositories
    cosine_scores = cosine_similarity(user_pref_vector, repo_vectors).flatten()

    # Positions of the highest scores: argsort is ascending, so take the tail
    # and reverse it to get best-first order
    top_indices = np.argsort(cosine_scores)[-top_n:][::-1]

    # Select the top n recommended repositories
    recommended_repos = df.iloc[top_indices].reset_index(drop=True)

    return recommended_repos[result_columns]

In [None]:
def main(df, user_preference="python", top_n=10):
    """
    Score the repositories and print the recommendations for a user preference.

    Args:
        df (pandas.DataFrame): Repository data passed to
            ``calculate_cosine_similarity_scores`` and ``recommend_repos``.
        user_preference (str, optional): Keywords describing what the user is
            looking for. Defaults to "python".
        top_n (int, optional): Number of recommendations to print. Defaults to 10.
    """
    # Relies on calculate_cosine_similarity_scores returning (df, vectorizer).
    df, tfidf_vectorizer = calculate_cosine_similarity_scores(df)
    recommended_repos = recommend_repos(user_preference, df, tfidf_vectorizer, top_n=top_n)
    print(recommended_repos)