In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the cleaned repository dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='../Data/clean/allReposCleaned.csv')

In [3]:
# Discard every row that has a missing value in any column
data = data.dropna(axis=0, how='any')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Replace missing text with empty strings so the concatenation below never
# yields NaN. fillna is applied to the frame and the result re-bound, because
# calling fillna(inplace=True) on a selected column is chained assignment:
# recent pandas emits a FutureWarning for it, and it does not update the
# frame when Copy-on-Write is enabled.
data = data.fillna({'description': '', 'name': '', 'language': ''})

# Build one text document per repository from its name, description and language
text_data = data['name'] + " " + data['description'] + " " + data['language']

# Fit a TF-IDF vocabulary on the documents and vectorize them
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

# Pairwise cosine similarity between the rows of the TF-IDF matrix
cosine_sim = cosine_similarity(tfidf_matrix)

# Collapse each row to a single score: the mean similarity of that repo's text
# to every row of the matrix. The mean runs over the full row, so it also
# includes the repo's similarity to itself on the diagonal.
similarity_scores = np.mean(cosine_sim, axis=1)

# Store the score as a new column of the dataset
data['cosine_similarity_score'] = similarity_scores

# Show the text columns alongside the new score for the first 100 rows
data[['name', 'description', 'language', 'cosine_similarity_score']].head(100)

Unnamed: 0,name,description,language,cosine_similarity_score
0,publicapis,collective list free APIs,Python,0.004864
1,systemdesignprimer,Learn design largescale systems Prep system de...,Python,0.004025
2,awesomepython,curated list awesome Python frameworks librari...,Python,0.008501
3,Python,Algorithms implemented Python,Python,0.013527
4,Python100Days,Python 100,Python,0.008451
...,...,...,...,...
95,zxing,ZXing Zebra Crossing barcode scanning library ...,Java,0.004575
96,springbootdemo,Spring Boot,Java,0.002977
97,selenium,browser automation framework ecosystem,Java,0.005545
98,termuxapp,Termux terminal emulator application Android O...,Java,0.004424


In [5]:
def recommend_repos(user_preference, data, tfidf_vectorizer, top_n=10):
    """
    Recommend repositories based on user preferences.

    The query is vectorized with the given TF-IDF vectorizer and compared, by
    cosine similarity, against one text document per repository built from its
    name, description and language. The rows with the highest similarity are
    returned, best match first.

    Parameters:
    user_preference (str): The user's preference in text form.
    data (DataFrame): The dataset containing repository information. Must have
        the columns 'name', 'description', 'language' and
        'cosine_similarity_score'.
    tfidf_vectorizer (TfidfVectorizer): A fitted TF-IDF vectorizer.
    top_n (int): Number of top recommendations to return. Zero or a negative
        value returns an empty result.

    Returns:
    DataFrame: Up to top_n recommended repositories with a fresh 0-based index.
        The 'cosine_similarity_score' column is taken from data unchanged; it
        is not the similarity to user_preference that was used for ranking.
    """
    output_columns = ['name', 'description', 'language', 'cosine_similarity_score']

    # Nothing to rank. Without this guard top_n == 0 would slice `[-0:]`, which
    # is the whole array, and a negative top_n would drop the worst matches
    # instead of returning none.
    if top_n <= 0 or len(data) == 0:
        return data.iloc[:0].reset_index(drop=True)[output_columns]

    # One document per repository; missing text is treated as empty so a single
    # NaN cell does not turn the whole concatenated document into NaN.
    repo_text = (
        data['name'].fillna('')
        + " " + data['description'].fillna('')
        + " " + data['language'].fillna('')
    )

    # Vectorize the user preference and the repository documents
    user_pref_vector = tfidf_vectorizer.transform([user_preference])
    repo_vectors = tfidf_vectorizer.transform(repo_text)

    # Similarity of the query to every repository, as a flat 1-D array
    cosine_scores = cosine_similarity(user_pref_vector, repo_vectors).flatten()

    # Positions of the top_n highest scores, highest first
    top_indices = np.argsort(cosine_scores)[-top_n:][::-1]

    # Positions refer to row order, so select with iloc; then reset the index
    recommended_repos = data.iloc[top_indices].reset_index(drop=True)

    # Only expose the columns that are useful to read
    return recommended_repos[output_columns]


In [7]:
# Keep only the first row seen for each repository name
data = data.drop_duplicates(subset='name')

In [8]:
# Example usage: ask for the ten repositories that best match the query "python"
user_preference = "python"
recommended_repos = recommend_repos(
    user_preference=user_preference,
    data=data,
    tfidf_vectorizer=tfidf_vectorizer,
    top_n=10,
)
print(recommended_repos)

                    name                    description language  \
0                 Python  Algorithms implemented Python   Python   
1             Battleship             Simple Python Game   Python   
2                cpython    Python programming language   Python   
3                redispy            Redis Python Client   Python   
4  pythonprojecttemplate        Python Project Template   Python   
5          Python100Days                     Python 100   Python   
6           CanIStreamIt              Stream python api   Python   
7            pythonutils                   python utils   Python   
8               pystache                Mustache Python   Python   
9                 ginkgo  Python service microframework   Python   

   cosine_similarity_score  
0                 0.013527  
1                 0.012885  
2                 0.010951  
3                 0.011054  
4                 0.011023  
5                 0.008451  
6                 0.011091  
7                 0