# **Content-Based Filtering Recommendation System Using Neural Networks**

Import all the necessary libraries

In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

Import the dataset

In [None]:
# Load the open-source project listing from the CSV next to the notebook.
df = pd.read_csv('oss_data.csv')
# Forward-fill missing cells from the row above. `DataFrame.ffill()` is the
# supported spelling; `fillna(method='ffill')` is deprecated in recent pandas,
# and rebinding `df` avoids in-place mutation.
df = df.ffill()
df

Unnamed: 0,name,desc,site,tags,Unnamed: 4,upforgrabs__link,stats__issue-count,Unnamed: 7,Unnamed: 8
0,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,.net,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
1,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,c#,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
2,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,powershell,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
3,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,security,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
4,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,aws,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
...,...,...,...,...,...,...,...,...,...
6247,Academico,"A simple, Laravel-based school management plat...",https://academico.site,vuejs,,https://github.com/academico-sis/academico/lab...,10,,2023-02-07T13:20:04Z
6248,Accessibility Insights for Web,Accessibility Insights for Web is a Google Chr...,https://accessibilityinsights.io,chrome-extension,,https://github.com/microsoft/accessibility-ins...,7,,2023-02-07T13:20:04Z
6249,Accessibility Insights for Web,Accessibility Insights for Web is a Google Chr...,https://accessibilityinsights.io,reactjs,,https://github.com/microsoft/accessibility-ins...,7,,2023-02-07T13:20:04Z
6250,Accessibility Insights for Web,Accessibility Insights for Web is a Google Chr...,https://accessibilityinsights.io,typescript,,https://github.com/microsoft/accessibility-ins...,7,,2023-02-07T13:20:04Z


Combining the different attributes of the dataset into a single string

In [None]:
# Build one free-text field per row from the name, description, tag and link columns
name_text = df['name'].astype(str)
desc_text = df['desc'].astype(str)
link_text = df['upforgrabs__link'].astype(str)
# `tags` is concatenated as-is (no string cast), so a missing tag makes the
# whole combined value missing; those rows become the empty string.
combined_text = name_text + ' ' + desc_text + ' ' + df['tags'] + ' ' + link_text
df['content'] = combined_text.fillna('')
df['content']

0       ACMESharp .NET Library and PowerShell module f...
1       ACMESharp .NET Library and PowerShell module f...
2       ACMESharp .NET Library and PowerShell module f...
3       ACMESharp .NET Library and PowerShell module f...
4       ACMESharp .NET Library and PowerShell module f...
                              ...                        
6247    Academico A simple, Laravel-based school manag...
6248    Accessibility Insights for Web Accessibility I...
6249    Accessibility Insights for Web Accessibility I...
6250    Accessibility Insights for Web Accessibility I...
6251    Accessibility Insights for Web Accessibility I...
Name: content, Length: 6252, dtype: object

Tokenize content for Word2Vec

In [None]:
# Turn every combined string into a list of word tokens for Word2Vec
df['tokenized_content'] = df['content'].map(simple_preprocess)
df['tokenized_content']

0       [acmesharp, net, library, and, powershell, mod...
1       [acmesharp, net, library, and, powershell, mod...
2       [acmesharp, net, library, and, powershell, mod...
3       [acmesharp, net, library, and, powershell, mod...
4       [acmesharp, net, library, and, powershell, mod...
                              ...                        
6247    [academico, simple, laravel, based, school, ma...
6248    [accessibility, insights, for, web, accessibil...
6249    [accessibility, insights, for, web, accessibil...
6250    [accessibility, insights, for, web, accessibil...
6251    [accessibility, insights, for, web, accessibil...
Name: tokenized_content, Length: 6252, dtype: object

Training the Word2Vec model

In [None]:
# Fit a Word2Vec model on the tokenized rows: create it empty, build the
# vocabulary, then train for a fixed number of epochs.
model = Word2Vec(
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.build_vocab(corpus_iterable=df['tokenized_content'])
model.train(
    corpus_iterable=df['tokenized_content'],
    total_examples=model.corpus_count,
    epochs=10,
)


(1288520, 1763240)

Function to average word vectors for a text

In [None]:
# Mean embedding of a token list, using only tokens the trained model knows
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the average of the model's vectors for the known tokens in `words`.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single text.
    model : object exposing `model.wv[token]`
        Source of the per-token vectors.
    vocabulary : container of str
        Tokens that may be looked up in `model.wv`; all others are skipped.
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of length `num_features`; all zeros when no token of
        `words` is in `vocabulary`.
    """
    running_sum = np.zeros((num_features,), dtype="float64")
    matched = 0.

    for token in words:
        if token not in vocabulary:
            continue
        running_sum = running_sum + model.wv[token]
        matched += 1.

    # Without any matched token there is nothing to divide by; return the zeros.
    if not matched:
        return running_sum
    return running_sum / matched

Function to compute average word vectors for all repos

In [None]:
# Apply average_word_vectors to every tokenized text in a corpus
def averaged_word_vectorizer(corpus, model, num_features):
    """Return an array holding one averaged word vector per text in `corpus`.

    Parameters
    ----------
    corpus : iterable of token lists
        One token list per text.
    model : object exposing `model.wv`
        `model.wv.index_to_key` supplies the known vocabulary.
    num_features : int
        Length of each averaged vector.

    Returns
    -------
    numpy.ndarray
        The per-text vectors in corpus order.
    """
    # Build the vocabulary set once so each membership test is a cheap set lookup.
    known_words = set(model.wv.index_to_key)

    rows = []
    for tokens in corpus:
        rows.append(average_word_vectors(tokens, model, known_words, num_features))
    return np.array(rows)

Compute average word vectors for all repos

In [None]:
# Compute average word vectors for all repos (one row per dataframe row).
# The vector length is read from the trained model instead of repeating the
# literal used at training time, so the two cannot drift apart.
w2v_feature_array = averaged_word_vectorizer(
    corpus=df['tokenized_content'],
    model=model,
    num_features=model.vector_size,
)

Processing & Output

In [None]:
# How many of the highest-scoring rows to inspect before de-duplicating by
# repo name. The same repo can occupy several rows, so fewer than this many
# distinct repos may be printed.
TOP_CANDIDATES = 50

# Get the user input and keep two forms of it:
#   raw_query - trimmed text as typed, so repo names containing spaces can match
#   user_oss  - all spaces removed, so e.g. "web server" can match the tag "webserver"
raw_query = input("Enter a Tag or the name of the repo: ").strip()
user_oss = raw_query.replace(" ", "")

# Boolean masks as plain numpy arrays so the match is located by row POSITION,
# which is what indexing into w2v_feature_array requires.
tag_mask = (df['tags'] == user_oss).to_numpy()
name_mask = ((df['name'] == raw_query) | (df['name'] == user_oss)).to_numpy()

# A tag match takes priority over a name match; None means "nothing found".
oss_index = None
if tag_mask.any():
    oss_index = int(np.flatnonzero(tag_mask)[0])
elif name_mask.any():
    oss_index = int(np.flatnonzero(name_mask)[0])

if oss_index is not None:
    # Compute the cosine similarities between the matched row and every row
    user_oss_vector = w2v_feature_array[oss_index].reshape(1, -1)
    similarity_scores = cosine_similarity(user_oss_vector, w2v_feature_array)

    # Keep the TOP_CANDIDATES highest-scoring rows (the matched row itself is included)
    similar_repos = list(enumerate(similarity_scores[0]))
    sorted_similar_repos = sorted(similar_repos, key=lambda x: x[1], reverse=True)[:TOP_CANDIDATES]

    # Print each distinct repo once, in descending order of similarity
    printed_names = set()  # names already printed
    for i, score in sorted_similar_repos:
        name = df['name'].iloc[i]
        if name in printed_names:
            continue
        tags = df['tags'].iloc[i]
        link = df['upforgrabs__link'].iloc[i]
        print("{}: {}: {}: {}".format(i, tags, name, link))
        printed_names.add(name)
else:
    if not user_oss:
        print("Kindly enter the input without spaces or input shouldn't be empty.")
    else:
        print("No matching repository found.")



Enter a Tag or the name of the repo: web server
5563: webserver: StirFry: https://github.com/StirFry-js/stirfry/labels/help%20wanted
1569: video-editing: etro: https://github.com/etro-js/etro/labels/help%20wanted
1665: open-source: Freedomotic Vue Webapp: https://github.com/freedomotic/fd-vue-webapp/labels/help%20wanted
6020: user-interface: Vue.js: https://github.com/vuejs/vue/labels/contribution%20welcome
5753: node.js: The Lounge: https://github.com/thelounge/thelounge/labels/help%20wanted
1094: decentralized: CypherPoker.JS: https://github.com/monicanagent/cypherpoker.js/labels/help%20wanted
2362: node.js: Infisical: https://github.com/Infisical/infisical/labels/help%20wanted
3279: web: monument: https://github.com/ansble/monument/labels/first%20contribution
3244: javascript: mockingcase: https://github.com/strdr4605/mockingcase/labels/help%20wanted
45: node.js: AlaSQL: https://github.com/AlaSQL/alasql/labels/Help%20wanted
4460: lock: PatternLockJS: https://github.com/phenax/patter