# **Content Based Filtering Recommendation System Using Neural Networks**

Import all the necessary libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

Import the dataset

In [12]:
df = pd.read_csv('oss_data.csv')
df.fillna(method='ffill', inplace=True)
df

Unnamed: 0,name,desc,site,tags,Unnamed: 4,upforgrabs__link,stats__issue-count,Unnamed: 7,Unnamed: 8
0,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,.net,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
1,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,c#,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
2,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,powershell,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
3,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,security,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
4,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,aws,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
...,...,...,...,...,...,...,...,...,...
6247,Academico,"A simple, Laravel-based school management plat...",https://academico.site,vuejs,,https://github.com/academico-sis/academico/lab...,10,,2023-02-07T13:20:04Z
6248,Accessibility Insights for Web,Accessibility Insights for Web is a Google Chr...,https://accessibilityinsights.io,chrome-extension,,https://github.com/microsoft/accessibility-ins...,7,,2023-02-07T13:20:04Z
6249,Accessibility Insights for Web,Accessibility Insights for Web is a Google Chr...,https://accessibilityinsights.io,reactjs,,https://github.com/microsoft/accessibility-ins...,7,,2023-02-07T13:20:04Z
6250,Accessibility Insights for Web,Accessibility Insights for Web is a Google Chr...,https://accessibilityinsights.io,typescript,,https://github.com/microsoft/accessibility-ins...,7,,2023-02-07T13:20:04Z


Combining the different attributes of the dataset into a single string

In [3]:
# Combining the different attributes of the dataset into a single string
df['content'] = df['name'].astype(str) + ' ' + df['desc'].astype(str) + ' ' + df['tags'] + ' ' + df['upforgrabs__link'].astype(str)
df['content'] = df['content'].fillna('')
df['content']

0       ACMESharp .NET Library and PowerShell module f...
1                                          nan nan c# nan
2                                  nan nan powershell nan
3                                    nan nan security nan
4                                         nan nan aws nan
                              ...                        
6247                                    nan nan vuejs nan
6248    Accessibility Insights for Web Accessibility I...
6249                                  nan nan reactjs nan
6250                               nan nan typescript nan
6251                            nan nan accessibility nan
Name: content, Length: 6252, dtype: object

Tokenize content for Word2Vec

In [4]:
# Tokenize content for Word2Vec
df['tokenized_content'] = df['content'].apply(simple_preprocess)
df['tokenized_content']

0       [acmesharp, net, library, and, powershell, mod...
1                                         [nan, nan, nan]
2                             [nan, nan, powershell, nan]
3                               [nan, nan, security, nan]
4                                    [nan, nan, aws, nan]
                              ...                        
6247                               [nan, nan, vuejs, nan]
6248    [accessibility, insights, for, web, accessibil...
6249                             [nan, nan, reactjs, nan]
6250                          [nan, nan, typescript, nan]
6251                       [nan, nan, accessibility, nan]
Name: tokenized_content, Length: 6252, dtype: object

Training the Word2Vec model

In [5]:
#Training the Word2Vec model
model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
model.build_vocab(df['tokenized_content'])
model.train(df['tokenized_content'], total_examples=model.corpus_count, epochs=10)


(282223, 494250)

Function to average word vectors for a text

In [6]:
# Function to average word vectors for a text - computes avg word vectors for a given text using trained Word2Vec model
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

Function to compute average word vectors for all repos

In [7]:
# Function to compute average word vectors for all repos - applies avg word vectors function to a collection of text
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in corpus]
    return np.array(features)

Compute average word vectors for all repos

In [8]:
# Compute average word vectors for all repos
w2v_feature_array = averaged_word_vectorizer(corpus=df['tokenized_content'], model=model, num_features=100)

Processing & Output

In [31]:
# Get the user input
user_oss = input("Enter a Tag or the name of the repo: ")

oss_index1 = np.nan
oss_index2 = np.nan

if ((df['tags'] == user_oss).any()): 
    oss_index1 = df.loc[df['tags'] == user_oss].index[0]
else:
    oss_index2 = df.loc[df['name'] == user_oss].index[0]
    
if not np.isnan(oss_index1) and not np.isnan(oss_index2):
     oss_index = oss_index1 + oss_index2

# Compute the cosine similarities between the user repo and all other repo
user_oss_vector = w2v_feature_array[oss_index].reshape(1, -1)
similarity_scores = cosine_similarity(user_oss_vector, w2v_feature_array)

# Get the top 10 most similar repos
similar_repos = list(enumerate(similarity_scores[0]))
sorted_similar_repos = sorted(similar_repos, key=lambda x: x[1], reverse=True)[1:20]

# Print the top 10 similar repos
for i, score in sorted_similar_repos:
    print("{}: {}: {}: {}".format(i, df.loc[i, 'tags'], df.loc[i, 'name'], df.loc[i, 'upforgrabs__link']))

Enter a Tag or the name of the repo: activist
1419: aws: DynamoDb.SQL: https://github.com/fsprojects/DynamoDb.SQL/labels/up-for-grabs
2516: aws: gojwtcognito: https://github.com/bruno-chavez/gojwtcognito/labels/good%20first%20issue
2946: aws: LogAgent: https://github.com/sagar-arora/LogAgent/labels/good%20first%20issue
5179: aws: s3backup: https://github.com/dnnrly/s3backup/labels/help%20wanted
5183: aws: S3Provider: https://github.com/fsprojects/S3Provider/labels/up-for-grabs
2833: group: libnss_aad: https://github.com/aad-for-linux/libnss-aad/labels/help%20wanted
5502: private: SparkleShare: https://github.com/hbons/SparkleShare/labels/up-for-grabs
2844: languages: LibreLingo: https://github.com/LibreLingo/LibreLingo/labels/good%20first%20issue
2079: extensions: gun extensions: https://github.com/gundb/feature-requests/labels/up-for-grabs
4899: extensions: Ramda Adjunct: https://github.com/char0n/ramda-adjunct/labels/help%20wanted
3517: messaging: notify: https://github.com/nikoksr/n