# Duplicates suggestion tool for OpenEdu moderator

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import re
from sentence_transformers import SentenceTransformer, util

In [2]:
# Define cleaning function for text in description fields

def clean_text(text):
    '''Cleans text from brackets/symbols and non-ASCII characters'''
    
    text = text.encode("ascii", "ignore") # Remove non-ASCII characters
    text = text.decode()
    text = text.replace('\n', ' ') # Remove new row escape
    text = re.sub(r'<[^>]+>', '', text) # Eliminate text within < > characters (often contaning info on subs font/color)
    text = re.sub(r'\[[^]]+\]', '', text) # Eliminate text within squared brackets (often contaning audio description for hearing-impaired individuals)
    text = re.sub(r'\([^)]+\)', '', text) # Eliminate text within parentheses (often contaning audio description for hearing-impaired individuals)
    text = text.replace('&nbsp', '') # Remove HTML non-breaking space
    text = text.replace('&#39;', "'")# Decode HTML code for apostrophe
    text = text.replace('&quot;', '"')# Decode HTML code for quotation marks
    text = ' '.join(text.split()) # Reduce all double/triple whitespacing to single
    return text

In [3]:
# Load pretrained semantic model

def load_model(model_name):
    """Load pre-trained semantic search model."""
    model = SentenceTransformer(model_name)
    return model

In [4]:
# Define preprocessing function

def preprocess_text(df):
    '''Puts together text fields for each instance, cleans them, calculates embeddings,
    and makes them ready for the semantic similarity model. Also, it creates labels for each instance.'''
    # Define full body of text to analyse
    df['full_text'] = df['short_description_en'].astype(str) + df['more_information_en'].astype(str)
    
    # Create list of texts to feed SBERT model
    sentence_list = df['full_text'].tolist()
    
    # Clean text
    sentence_list_clean = [clean_text(element) for element in sentence_list]
    
    # Build label for each instance with title + author
    df['instance'] = df['title_en'].astype(str) + str(' by ') + df['by'].astype(str)
    
    # Export entries titles and links as series
    instances_series = pd.Series(df['instance']) 
    instances_links = pd.Series(df['link'])
    
    #Encode all texts in the database
    embeddings = model.encode(sentence_list_clean)
    
    return sentence_list_clean, instances_series, instances_links, embeddings

In [5]:
# Define similarity function to propose most similar suggestions by semantic similarity

def similarity_table(new_entry, instances_series, instances_links, embeddings):
    '''Computes text embeddings for new entry. Calculates the cosine similarity
    vector and shows the 3 most similar database entries to the new entry.'''
    
    # Encode text new entry
    new_embed = model.encode(new_entry)

    #Compute cosine similarity between new text & database
    cos_sim = util.cos_sim(new_embed, embeddings)

    # Put cos_sim in a Dataframe with labels and links
    similarity_vector_values = pd.DataFrame(cos_sim.numpy()).squeeze(axis=0)

    # Create table with cosine similarity, entries titles and links
    similarity_df = pd.concat([similarity_vector_values, instances_series, instances_links], axis=1).rename(columns={0 : 'similarity_vector_values'})

    # Sort by higher similarity score and show top 3
    result = similarity_df.sort_values('similarity_vector_values', ascending=False).head(3)
    
    return result

## Test

In [6]:
# Load OpenEdu data from SQL server (only columns that are needed)
import sqlalchemy as sa

database_link = sa.create_engine('postgresql://deploy_impact:AVNS_tEdPMnvmmI0knrjJe-R@deploy-impact-cg-chrisg-demo.aivencloud.com:24947/openedu')
df = pd.read_sql_query('SELECT title_en, by, link, short_description_en, more_information_en FROM sito_project', database_link)
df.head()

Unnamed: 0,title_en,by,link,short_description_en,more_information_en
0,Editathon,Wikimedia Community,https://it.wikipedia.org/wiki/Editathon,An editathon or contribution marathon is an ev...,<p>Wikipedia editathons take place in accredit...
1,Enhancing your knowledge in Wikipedia,Wikimedia France,,The goal is to guide PhD students or young res...,
2,How to run an editathon,Wikimedia Community,https://en.wikipedia.org/wiki/Wikipedia:How_to...,You will find many details on how to run an Ed...,<p>An editathon can be: a scheduled time where...
3,Use the evaluation dashboard,Wiki Edu,https://meta.wikimedia.org/wiki/Programs_%26_E...,The basic purpose of the dashboard is to provi...,
4,WikiVoyage,Wikimedia Community,https://www.wikivoyage.org,Wikivoyage is a free online world travel guide...,<p>Wikivoyage&#39;s purpose is to create an up...


In [7]:
# In case of data loading via CSV file
# df = pd.read_csv('OpenEdu_project_list.csv')
# df.info()
# display(df.head())

In [8]:
# Example new entry
new_entry = 'The free encyclopedia that anyone can edit. Wikipedia is created and maintained as an open collaboration project by a community of volunteer editors, using a wiki-based editing system. It is the largest and most popular general reference work on the World Wide Web, and is one of the 20 most popular websites in the world.'

# Load semantic search model to use
model = load_model('all-MiniLM-L6-v2')

# Preprocess data
sentence_list, title_list, link_list, data_embeddings = preprocess_text(df)

# Create similarity vector and top 3 most similar scores
similarity_table(new_entry, title_list, link_list, data_embeddings)

Unnamed: 0,similarity_vector_values,instance,link
11,0.745891,Wikipedia by Wikimedia Community,https://en.wikipedia.org
30,0.685963,Wikipedia in your university by Wikimedia Arge...,
0,0.625403,Editathon by Wikimedia Community,https://it.wikipedia.org/wiki/Editathon
