# Build a project/course recommender based on content similarity


## Install

In [46]:
# Import dependencies and download the NLTK resources used in this notebook
import contractions
import numpy as np
import re
import pandas as pd
import nltk
# 'punkt' and 'stopwords' are NLTK data packages; presumably they back
# nltk.word_tokenize and nltk.corpus.stopwords used further down
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/gildafernandez-
[nltk_data]     conchajahnsen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/gildafernandez-
[nltk_data]     conchajahnsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Data

In [47]:
# Load the project catalogue (CSV) from its Google Drive share link
URL = 'https://drive.google.com/file/d/1Q3S5Tnm4KmAMZ8YzgLcfZVTa4PgrktZq/view?usp=sharing'
# The Drive file id is the second-to-last segment of the share link; it is
# plugged into the direct-download endpoint that pandas reads from
path = f"https://drive.google.com/uc?export=download&id={URL.split('/')[-2]}"
df = pd.read_csv(path)

# Column summary first, then a preview of the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Project           30 non-null     object
 1   description       28 non-null     object
 2   more_information  25 non-null     object
dtypes: object(3)
memory usage: 848.0+ bytes


Unnamed: 0,Project,description,more_information
0,Editathon,An editathon or contribution marathon is an ev...,Wikipedia editathons take place in accredited ...
1,How to run an editathon,You will find many details on how to run an Ed...,An editathon can be: a scheduled time where pe...
2,Use the evaluation dashboard,The basic purpose of the dashboard is to provi...,
3,WikiVoyage,Wikivoyage is a free online world travel guide...,Wikivoyage's purpose is to create an updatable...
4,Wiki2map,It is a site which allows you to create mental...,Wiki2map is particularly useful for reading le...


In [48]:
# Merge the two text columns into a single full_description column
# Missing texts become empty strings so they can be concatenated
df.fillna("", inplace=True)
description_text = df['description'].map(str)
extra_text = df['more_information'].map(str)
df['full_description'] = description_text + " " + extra_text
# NOTE: every missing value was already filled above, so this drops no rows
df.dropna(inplace=True)
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Project           30 non-null     object
 1   description       30 non-null     object
 2   more_information  30 non-null     object
 3   full_description  30 non-null     object
dtypes: object(4)
memory usage: 1.1+ KB


Unnamed: 0,Project,description,more_information,full_description
0,Editathon,An editathon or contribution marathon is an ev...,Wikipedia editathons take place in accredited ...,An editathon or contribution marathon is an ev...
1,How to run an editathon,You will find many details on how to run an Ed...,An editathon can be: a scheduled time where pe...,You will find many details on how to run an Ed...
2,Use the evaluation dashboard,The basic purpose of the dashboard is to provi...,,The basic purpose of the dashboard is to provi...
3,WikiVoyage,Wikivoyage is a free online world travel guide...,Wikivoyage's purpose is to create an updatable...,Wikivoyage is a free online world travel guide...
4,Wiki2map,It is a site which allows you to create mental...,Wiki2map is particularly useful for reading le...,It is a site which allows you to create mental...


## Pre-process data

In [49]:
# NLTK (Natural Language Toolkit) ships a list of very common English words ("a", "an", "the", "of", "in", ...) called stop words.
# They say little about the topic of a text, so they are words you do not want to use to describe your content.
stop_words = nltk.corpus.stopwords.words("english")

In [50]:
# Define a function to normalize a form
def normalize_form(form):
    """Normalize one free-text description before TF-IDF vectorization.

    The text goes through these steps, in order: contraction expansion,
    removal of every character that is not an ASCII letter, digit or
    whitespace, lowercasing, trimming, NLTK word tokenization and removal
    of the tokens found in the module-level ``stop_words`` list.

    Args:
        form: The raw text to normalize.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        no token is left).
    """
    # Expand contractions first, while the apostrophes are still present:
    # once punctuation is stripped, "it's" reads as the ordinary word "its"
    # and can no longer be recognised as a contraction.
    form = contractions.fix(form)
    # Remove special characters
    form = re.sub(r'[^a-zA-Z0-9\s]', '', form, flags=re.I | re.A)
    # Lowercase and trim surrounding whitespace
    form = form.lower().strip()
    # Tokenize the text into a list of word tokens
    tokens = nltk.word_tokenize(form)
    # Drop stop-word tokens
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Re-assemble the kept tokens into one space-separated string
    return " ".join(filtered_tokens)

In [51]:
# Wrap normalize_form with np.vectorize so it can be called on a whole
# sequence of texts (applied element by element) and return a NumPy array
normalize_form_vector = np.vectorize(normalize_form)

In [52]:
# Normalize the full_description of every course; the result is a NumPy
# array holding one cleaned-up string per row of df
full_description_texts = df["full_description"].tolist()
normalize_full_descriptions = normalize_form_vector(full_description_texts)
len(normalize_full_descriptions)

normalize_full_descriptions

array(['editathon contribution marathon event contributors create edit improve articles certain theme subject specific type content order contribute projects wikipedia openstreetmap new contributors usually receive basic training collaborate projects wikipedia editathons take place accredited educational institutions scientific research institutions cultural institutions museums archives join editathon helps build encyclopedia provides access topic experts offline source materials builds relationships community encourages editors learn entices people become new wikipedians helps new wikipedians contribute fun source httpsenwikipediaorgwikiwikipediahowtorunaneditathon wikipedia editathon takes place accredited educational institutions scientific research institutions cultural institutions museums archives educational professional opportunities editathon helps improve european key competence 1 literacy competence 4 digital competence 5 personal social learning learn competence 8 cultural

## Engineer TF-IDF feature

###### TF-IDF (term frequency-inverse document frequency) is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.  
###### TF-IDF = (how many times a word appears in a document) × (the inverse document frequency of the word across a set of documents)


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
# Convert the normalized descriptions to a matrix of TF-IDF features
# (one row per document, one column per retained term)
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf.fit_transform(normalize_full_descriptions)
print(tfidf_matrix.shape)

# ngram_range = (1,2) means unigrams and bigrams can be extracted
# min_df = 2 means terms found in fewer than two documents are ignored

(30, 411)


## Compute cosine document similarity

In [55]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix:
# entry (i, j) is the similarity between document i and document j
from sklearn.metrics.pairwise import cosine_similarity
form_sim = cosine_similarity(tfidf_matrix)
# Wrap the matrix in a DataFrame so rows can be selected by position with .iloc
form_sim_df = pd.DataFrame(form_sim)
print(form_sim_df.head())


         0         1         2         3         4         5         6   \
0  1.000000  0.529567  0.023494  0.264008  0.158637  0.159473  0.373344   
1  0.529567  1.000000  0.022771  0.286354  0.141696  0.179899  0.188043   
2  0.023494  0.022771  1.000000  0.027500  0.000000  0.000000  0.012003   
3  0.264008  0.286354  0.027500  1.000000  0.279763  0.309864  0.328722   
4  0.158637  0.141696  0.000000  0.279763  1.000000  0.175940  0.389526   

         7         8         9   ...        20        21        22        23  \
0  0.171365  0.243195  0.286958  ...  0.068795  0.037962  0.060408  0.072983   
1  0.178891  0.183813  0.243049  ...  0.023741  0.018194  0.049607  0.050265   
2  0.000000  0.000000  0.000000  ...  0.000000  0.018117  0.006414  0.039397   
3  0.355136  0.354457  0.373701  ...  0.077943  0.056260  0.102791  0.107528   
4  0.330056  0.348568  0.272393  ...  0.020248  0.005773  0.030533  0.000000   

         24        25        26        27        28        29  
0  0

## Find top 5 similar courses

In [56]:
# Project titles as a NumPy array, in the same row order as df
courses_list = df['Project'].values
# Display the titles together with the shape of the array
courses_list, courses_list.shape


(array(['Editathon', 'How to run an editathon',
        'Use the evaluation dashboard', 'WikiVoyage', 'Wiki2map',
        'Wiki Science Competition', 'WMCH Map Service', 'Wikidata',
        'Wikibooks', 'Wiki Loves Monuments', 'Wikipedia', 'Commons',
        'UNESCO OER Recommendation 2019', 'Evaluation Dashboard',
        'A Guide to Wikiversity', 'Wikiversity', 'Wiktionary', 'Hackathon',
        'Catalan Sign Language and Wiktionary', 'Immunomedia',
        'WikiCamp Armenia', 'Translation apprentices and Wikipedia',
        'Wikipedia as a tool for university teaching',
        "University law students' editing",
        'Teacher training of teachers in Bolivia', 'Wikipedia in medicine',
        'Open Science Fellows Program', 'WikiDDHH',
        'Wikipedia in your university',
        'Reading Wikipedia in the Classroom'], dtype=object),
 (30,))

In [57]:
# Locate the query course and fetch its row of similarity scores

# np.where returns one index array per dimension; keep the first matching
# position of the title "Editathon"
matching_positions = np.where(courses_list == "Editathon")[0]
course_idx = matching_positions[0]
print(course_idx)

# Similarity of that course to every course, as a NumPy array
similarity_row = form_sim_df.iloc[course_idx]
course_similarities = similarity_row.values
print(course_similarities)


0
[1.         0.52956708 0.02349385 0.26400814 0.15863727 0.15947254
 0.3733439  0.17136452 0.24319545 0.28695811 0.29949306 0.206856
 0.         0.02289759 0.03150783 0.21058651 0.17795474 0.24991551
 0.         0.03085292 0.06879499 0.03796206 0.06040779 0.07298283
 0.06406586 0.06887256 0.17044121 0.11542374 0.12031022 0.04606885]


In [58]:
# Get top 5 similar course IDs

# Rank all courses by decreasing similarity (stable sort keeps ties in their
# original order), then remove the query course by its index instead of
# assuming it is ranked first: that assumption fails when scores are tied or
# when the query course has an all-zero TF-IDF row.
ranked_course_idxs = np.argsort(-course_similarities, kind="stable")
similar_course_idxs = ranked_course_idxs[ranked_course_idxs != course_idx][:5]
print(similar_course_idxs)

[ 1  6 10  9  3]


In [216]:
# Get top 5 similar course titles by indexing the title array with the
# selected positions
similar_courses = courses_list[similar_course_idxs]
print(similar_courses)


['How to run an editathon' 'WMCH Map Service' 'Wikipedia'
 'Wiki Loves Monuments' 'WikiVoyage']


## Build the course recommendation function

In [217]:
def course_recommender(course_title, courses=courses_list, doc_sims=form_sim_df, top_n=5):
    """Return the titles of the courses most similar to ``course_title``.

    Args:
        course_title: Title to look up in ``courses``; the first exact match
            is used.
        courses: NumPy array of course titles.
        doc_sims: DataFrame of pairwise similarity scores; its row and column
            positions are expected to line up with ``courses``.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        NumPy array with at most ``top_n`` titles, most similar first. The
        position of the query course itself is always excluded.

    Raises:
        IndexError: If ``course_title`` is not found in ``courses``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    # Find course ID
    matches = np.where(courses == course_title)[0]
    if matches.size == 0:
        raise IndexError(f"Unknown course title: {course_title!r}")
    course_idx = matches[0]
    # Get course similarities
    course_similarities = doc_sims.iloc[course_idx].values
    # Rank all courses by decreasing similarity; the stable sort keeps tied
    # scores in their original order so the result is deterministic
    ranked_idxs = np.argsort(-course_similarities, kind="stable")
    # Drop the query course by index rather than assuming it is ranked first
    # (not guaranteed with tied scores or an all-zero similarity row)
    similar_course_idxs = ranked_idxs[ranked_idxs != course_idx][:top_n]
    # Return the top courses
    return courses[similar_course_idxs]


In [120]:
# Print the five recommendations for every course in the catalogue
for course in courses_list:
    print("Course: ", course)
    recommendations = course_recommender(
        course_title=course, courses=courses_list, doc_sims=form_sim_df)
    print("Top 5 recommended courses: ", recommendations)
    print()


Course:  Editathon
Top 5 recommended courses:  ['How to run an editathon' 'WMCH Map Service' 'Wikipedia'
 'Wiki Loves Monuments' 'WikiVoyage']

Course:  How to run an editathon
Top 5 recommended courses:  ['Editathon' 'Wikipedia' 'WikiVoyage' 'Wiki Loves Monuments' 'Hackathon']

Course:  Use the evaluation dashboard
Top 5 recommended courses:  ['Evaluation Dashboard' 'Wikipedia in medicine'
 "University law students' editing" 'Commons' 'WikiVoyage']

Course:  WikiVoyage
Top 5 recommended courses:  ['Wikipedia' 'Wiki Loves Monuments' 'Wikidata' 'Wikibooks' 'Wiktionary']

Course:  Wiki2map
Top 5 recommended courses:  ['WMCH Map Service' 'Wikibooks' 'Wiktionary' 'Wikidata' 'Wikipedia']

Course:  Wiki Science Competition
Top 5 recommended courses:  ['Wikipedia' 'Wiki Loves Monuments' 'Wikibooks' 'Wikidata' 'Hackathon']

Course:  WMCH Map Service
Top 5 recommended courses:  ['Wikipedia' 'Wiki2map' 'Editathon' 'Wiki Loves Monuments' 'WikiVoyage']

Course:  Wikidata
Top 5 recommended courses: