# **Content Based Filtering Recommendation System Using Neural Networks**

Import all the necessary libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

Import the dataset

In [2]:
df = pd.read_csv('oss_data.csv')
df

Unnamed: 0,# -----------------------------------------------------------,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,,,,,,,,,
1,"﻿""name""",desc,site,tags,,upforgrabs__link,stats__issue-count,,
2,ACMESharp,.NET Library and PowerShell module for the ACM...,https://github.com/ebekker/ACMESharp,.net,,https://github.com/ebekker/ACMESharp/labels/up...,0,,
3,,,,c#,,,,,
4,,,,powershell,,,,,
...,...,...,...,...,...,...,...,...,...
6249,,,,vuejs,,,,,
6250,Accessibility Insights for Web,Accessibility Insights for Web is a Google Chr...,https://accessibilityinsights.io,chrome-extension,,https://github.com/microsoft/accessibility-ins...,7,,
6251,,,,reactjs,,,,,
6252,,,,typescript,,,,,


Combining the different attributes of the dataset into a single string

In [3]:
# Combining the different attributes of the dataset into a single string
df['content'] = df['name'].astype(str) + ' ' + df['desc'].astype(str) + ' ' + df['tags'] + ' ' + df['upforgrabs__link'].astype(str)
df['content'] = df['content'].fillna('')
df['content']

KeyError: 'name'

Tokenize content for Word2Vec

In [4]:
# Tokenize content for Word2Vec
df['tokenized_content'] = df['content'].apply(simple_preprocess)
df['tokenized_content']

KeyError: 'content'

Training the Word2Vec model

In [5]:
#Training the Word2Vec model
model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
model.build_vocab(df['tokenized_content'])
model.train(df['tokenized_content'], total_examples=model.corpus_count, epochs=10)


KeyError: 'tokenized_content'

Function to average word vectors for a text

In [6]:
# Function to average word vectors for a text
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

Function to compute average word vectors for all repos

In [7]:
# Function to compute average word vectors for all repos
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in corpus]
    return np.array(features)

Compute average word vectors for all repos

In [8]:
# Compute average word vectors for all repos
w2v_feature_array = averaged_word_vectorizer(corpus=df['tokenized_content'], model=model, num_features=100)

KeyError: 'tokenized_content'

Processing & Output

In [10]:
# Get the user input
user_oss = input("Enter a repository: ")

# Find the index of the user movie
oss_index = df[df['name'] == user_oss].index[0]

# Compute the cosine similarities between the user movie and all other movies
user_oss_vector = w2v_feature_array[oss_index].reshape(1, -1)
similarity_scores = cosine_similarity(user_oss_vector, w2v_feature_array)

# Get the top 10 most similar movies
similar_repos = list(enumerate(similarity_scores[0]))
sorted_similar_repos = sorted(similar_repos, key=lambda x: x[1], reverse=True)[1:20]

# Print the top 10 similar repos
for i, score in sorted_similar_repos:
    print("{}: {}".format(i, df.loc[i, 'name']))

Enter a repository: activist
5708: Terasology
3084: Mautic
3850: OpenRCT2
3328: Mumble
2582: Keylime
5659: TcpDirectorySyncronizer
2794: leon
5692: Tenantee
4567: PMD
1856: FSharpRProvider
2924: LitmusChaos
5696: Tensorflow
532: Books API
119: Animate A Name
3638: OpenDota Core
5420: Sklean-genetic-opt
2061: Gryphon
4286: Opinionated Guides
2841: LibreLingo
