In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the cleaned repository dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='../Data/clean/mostFamousUsersReposCleaned.csv')

In [3]:
# Keep only the rows in which every column holds a value
data = data.dropna(axis=0, how='any')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Replace missing text with empty strings. The filled columns are assigned back
# to the frame instead of calling Series.fillna(..., inplace=True) on data[col]:
# the chained in-place form triggers pandas' chained-assignment warning and does
# not modify `data` when Copy-on-Write is enabled.
text_columns = ['name', 'description', 'language']
data[text_columns] = data[text_columns].fillna('')

# Concatenating the text columns for vectorization
text_data = data['name'] + " " + data['description'] + " " + data['language']

# Vectorizing the text data (one TF-IDF row per repository)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

# Calculating cosine similarity between every pair of repositories.
# NOTE: this is a dense (n_repos x n_repos) array, so memory grows quadratically
# with the number of rows in `data`.
cosine_sim = cosine_similarity(tfidf_matrix)

# Since the goal is to give a score to each repo, we average the cosine similarities for each repo.
# This gives a single score representing how similar each repo's text data is to all repos
# (the average includes the repo's similarity with itself).
similarity_scores = np.mean(cosine_sim, axis=1)

# Adding the new column to the dataset
data['cosine_similarity_score'] = similarity_scores

# Displaying the updated dataset with the new column
data[['name', 'description', 'language', 'cosine_similarity_score']].head(100)

Unnamed: 0,name,description,language,cosine_similarity_score
0,behaving,BehaviorDriverDevelopment multiuser webemailsm...,Python,0.003912
1,circlesofhell,Circles Hell VCV Rack 2 Module,C++,0.001264
2,firstgame,C ncurses game,C++,0.001225
3,GlobalPlatformPro,Manage applets keys JavaCards like pro command...,Java,0.005944
4,krot,Simulator karaoke rotation modeling,JavaScript,0.004031
...,...,...,...,...
95,chefcookbook,Development repository Chef cookbook RabbitMQ,Ruby,0.008680
96,chefhandlerdatadog,Chef stats events directly Datadog,Ruby,0.006047
97,chefletsencryptboulderserver,Chef cookbook Boulder ACMEbased server Lets En...,Ruby,0.006338
98,chefpostgresql,Chef cookbook PostgreSQL components,Ruby,0.007748


In [5]:
def recommend_repos(user_preference, data, tfidf_vectorizer, top_n=10):
    """
    Recommend repositories whose text is most similar to a user preference.

    The query and each repository's "name description language" text are
    transformed with ``tfidf_vectorizer`` and the repositories are ranked by
    cosine similarity to the query, highest first.

    Parameters:
    user_preference (str): The user's preference in text form.
    data (DataFrame): The dataset containing repository information. Must
        contain the columns 'name', 'description', 'language' and
        'cosine_similarity_score'.
    tfidf_vectorizer (TfidfVectorizer): A fitted TF-IDF vectorizer.
    top_n (int): Maximum number of recommendations to return. Zero or a
        negative value yields an empty result.

    Returns:
    DataFrame: Up to top_n rows of ``data`` with a fresh 0..k-1 index,
    restricted to 'name', 'description', 'language' and
    'cosine_similarity_score'. Note that 'cosine_similarity_score' is copied
    unchanged from ``data``; it is NOT the similarity to ``user_preference``
    that was used for the ranking.

    Raises:
    KeyError: If ``data`` lacks one of the required columns.
    """
    output_columns = ['name', 'description', 'language', 'cosine_similarity_score']

    # Nothing to rank. Handling this up front also avoids the `[-0:]` slice,
    # which would select every row instead of none when top_n == 0.
    if top_n <= 0 or len(data) == 0:
        return data.iloc[0:0].reset_index(drop=True)[output_columns]

    # Build one text document per repository; a missing field contributes an
    # empty string so a single NaN does not turn the whole document into NaN.
    repo_text = (
        data['name'].fillna('') + " "
        + data['description'].fillna('') + " "
        + data['language'].fillna('')
    )

    # Vectorize the user preference and the repositories in the same TF-IDF space
    user_pref_vector = tfidf_vectorizer.transform([user_preference])
    repo_vectors = tfidf_vectorizer.transform(repo_text)

    # Calculate cosine similarity between the query and every repository
    cosine_scores = cosine_similarity(user_pref_vector, repo_vectors).flatten()

    # Positions of the highest scores, best first. Reversing before truncating
    # gives the same order as `[-top_n:][::-1]` for every top_n >= 1.
    top_indices = np.argsort(cosine_scores)[::-1][:top_n]

    # Select the recommended repositories by position and reset the index
    recommended_repos = data.iloc[top_indices].reset_index(drop=True)

    # Only expose the columns that are useful to read
    return recommended_repos[output_columns]


In [6]:
# Example usage: request the ten repositories that best match a one-word query
user_preference = "python"
recommended_repos = recommend_repos(
    user_preference=user_preference,
    data=data,
    tfidf_vectorizer=tfidf_vectorizer,
    top_n=10,
)
print(recommended_repos)

                    name                                        description  \
0             Battleship                                 Simple Python Game   
1                cpython                        Python programming language   
2                redispy                                Redis Python Client   
3  pythonprojecttemplate                            Python Project Template   
4           CanIStreamIt                                  Stream python api   
5            pythonutils                                       python utils   
6               pystache                                    Mustache Python   
7                 ginkgo                      Python service microframework   
8           alipaypython                                      alipay python   
9              HTTPretty  HTTP client mocking tool Python like rubys Fak...   

  language  cosine_similarity_score  
0   Python                 0.013152  
1   Python                 0.010621  
2   Python      