# Backend Task

In [1]:
# Import dataset from api.

import requests
res = requests.get('https://o136z8hk40.execute-api.us-east-1.amazonaws.com/dev/get-list-of-conferences')
if res.status_code != 200:
    raise Exception("ERROR : API request unsuccessful.")

main_data = res.json()


In [2]:
# Description of the given data.

print(main_data['display_paid'])
print(main_data['display_free'])
print(len(main_data['free']))
print(len(main_data['paid']))
print(len(main_data['free'][0]))
print(len(main_data['paid'][0]))


1
1
28
103
19
19


In [3]:
# free_paid_list contains all details of 131 conferences as dict.

free_paid_list = []

for test in main_data['free']:
    free_paid_list.append(test)

for test in main_data['paid']:
    free_paid_list.append(test)
    
print(len(free_paid_list))


131


### Formatting Conference Starting Date.

In [4]:
# Change confStart representation 06 Nov, 2019 ----> November 6th, 2019

from datetime import datetime
import dateparser

def suffix(d):
    return 'th' if 11<=d<=13 else {1:'st',2:'nd',3:'rd'}.get(d%10, 'th')

def custom_strftime(format, t):
    return t.strftime(format).replace('{S}', str(t.day) + suffix(t.day))


In [5]:
# Update confStart in same human readable format.

for test in free_paid_list:
    st = dateparser.parse(test['confStartDate'])
    test['confStartDate'] = custom_strftime('%B {S}, %Y', st)
    

### Formatting Venue of the Conference according to requirement.

In [6]:
import re

required_keys = ['city', 'state', 'country', 'venue', 'confName', 'confStartDate', 'entryType', 'confUrl']

for test in free_paid_list:
    test['city'] = test['city'].strip()
    test['state'] = test['state'].strip()
    test['country'] = test['country'].strip()
    test['venue'] = test['venue'].strip()
    
    if re.search(test['city'], test['venue']) == None:
        if test['state'] == "":
            test['venue'] = ', '.join([test['venue'], test['city'], test['country']])
        else:
            test['venue'] = ', '.join([test['venue'], test['city'], test['state'], test['country']])
    elif re.search(test['state'], test['venue']) == None:
        test['venue'] = ', '.join([test['venue'], test['state'], test['country']])
    elif re.search(test['country'], test['venue']) == None:
        test['venue'] = ', '.join([test['venue'], test['country']])
    else:
        test['venue'] = test['venue']


### TASK - 1 Printing the conference details in human readable format.

In [7]:
# For printing conference details in human readable format <confName, confStartDate, venue, entryType, confUrl>

for test in free_paid_list:
    print(f'"{test["confName"]}", {test["confStartDate"]}, {test["venue"]}, {test["entryType"]}.')
    print(test['confUrl'])
    print()


"AWS Innovate Online Conference", February 19th, 2020, Online, Free.
https://aws.amazon.com/events/aws-innovate/machine-learning/

"Redis Day Bangalore", January 21st, 2020, Taj Yeshwantpur, Bengaluru, 2275, Tumkur Road, Yeshwanthpur Industrial Area, Phase 1, Yeshwantpur, Bengaluru, Karnataka 560022, India, Bangalore, Karnataka, India, Free.
https://connect.redislabs.com/redisdaybangalore

"WineWorks", January 22nd, 2020, San Francisco, CA, USA, California, USA, Free.
https://www.papercall.io/osqueryatscale-cfp

"AWS Community Day", January 31st, 2020, Stockholm, Sweden, Swedan, Free.
https://awscommunitynordics.org/communityday/

"International Conference On Internet Of Things Big Data Analytics And Information Technology", February 1st, 2020, Chennai, Tamil Nadu, India, Free.
https://www.allconferencealert.com/event-detail.html?ev_id=357894&eventname=international-conference-on-internet-of-things-big-data-analytics-and-information-technology(icitbdit--2020)

"International Conference

### TASK - 2 Identify exact duplicates.

In [8]:
# Identify exact duplicates(if any).

dup_ls = [ele for i, ele in enumerate(free_paid_list) if ele in free_paid_list[i+1:]]

print("Number of Exact Duplicate : ", len(dup_ls))
print()
print("-----------Details of Duplicates-----------")

for ele in dup_ls:
    for key, val in ele.items():
        if val == '':
            print(key, ': **')   # Empty string is replaces by **
        else:
            print(key, ' : ', val)


Number of Exact Duplicate :  1

-----------Details of Duplicates-----------
emailId : **
city  :  Mumbai
twitter_handle : **
user_id  :  1579602074
country  :  India
imageURL  :  https://storage.googleapis.com/konfhub-bd9c9.appspot.com/80835.jpg?Expires=4733720417&GoogleAccessId=firebase-adminsdk-r3qh4%40konfhub-bd9c9.iam.gserviceaccount.com&Signature=G1FMqYp9fJsgE7dEDLp4S7jPQ6Ysx06C2j%2FlMcVqQD2xxULY0O83KSng2a5SWK69HSSCsJyAZGdmJOkZwqXfYCEzK9YxGAObDd%2F4FBXJK8hgW2%2FKnUcpxjNWWkF9SaqYO22cmk0A6e5A7W%2BIworwz5Ev4Ct4%2Bfy%2BizjvgaH6GtEu0rtRCnuLnvu1WQtMns%2BHZiDPTEpEk3DHw3B%2B5lP0JFuvJfEpFE6ZgVoG3Y%2FZweAsSYTCeW%2FT9bgz35yNhLbLS4cTeBvSmEynmz%2FYvSfWD6zBzHfX3v8Y10tB%2BMb3PMLrV56L2APFolF39zBWvp1BzuSbo8dTwDdasL24bxxV9A%3D%3D
venue  :  Bombay Exhibition Centre NESCO, Goregaon East, Mumbai, Maharashtra 400063, India
searchTerms  :  AWS Summit Mumbai, cloud computing,AWS,DevOps, April, Paid, 1580120419,  India
confName  :  AWS Summit Mumbai
state  :  Maharashtra
long  :  72.877426
confEndDate  : 

### TASK - 3 Identify semantic duplicates.
(i.e., the conferences are same but the details provided are slightly different, e.g., “React Conference 2019” in one entry and “ReactConf ‘19” in another entry but the other fields are same or similar). 

In [9]:
# Analyse semantic duplicates.

import pandas as pd
conf_name = []

for ele in free_paid_list:
    conf_name.append(ele['confName'])
    
for i, k in enumerate(conf_name):
    print(i+1,' : ', k)

1  :  AWS Innovate Online Conference
2  :  Redis Day Bangalore
3  :  WineWorks
4  :  AWS Community Day
5  :  International Conference On Internet Of Things Big Data Analytics And Information Technology
6  :  International Conference On Big Data IoT Cyber Security And Information Technology
7  :  ChiBrrCon 2020
8  :  DATACLOUD INDIA 2020
9  :  NIC 20 20 Vision Edition
10  :  PyCascades 2020
11  :  BSidesSF 2020
12  :  PyTennessee 2020
13  :  WeRockIT Developers Conference 2020
14  :   Connectaha 2020
15  :  DevOpsDays Vancouver 2020
16  :  Microsoft Ignite The Tour 2019 2020
17  :   NET Fwdays
18  :  403 Forbidden
19  :   PHP Fwdays
20  :  Devopscon
21  :  Amazon Alexa VOICE
22  :  Voice Meetup: what3words edition with Speechmatics | Meetup
23  :  Cambridge Alexa Developers Meetup
24  :  Amazon Alexa Paris
25  :  Women in Data Science Pune Conference
26  :  MATLAB EXPO 2020 India
27  :  MATLAB EXPO 2020 India
28  :  MATLAB EXPO 2020 India
29  :  Antarcticonf The Conference At The Edge O

In [10]:

'''
Implementation of Soft Cosine Similarity to find the Semantic duplicates 
using pretrained model 'fasttext-wiki-news-subwords-300'.
''' 

import gensim
from gensim.models import WordEmbeddingSimilarityIndex 
from gensim.similarities import SparseTermSimilarityMatrix
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
print(gensim.__version__)


3.8.3


In [11]:
# Download the FastText pretrained model.

fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')


In [12]:
# Prepare a dictionary and a corpus.

dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in conf_name])


In [13]:
# Prepare the similarity matrix

similarity_index = WordEmbeddingSimilarityIndex(fasttext_model300)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)


In [14]:
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Function for finding soft cosine similarity matrix.

def create_soft_cossim_matrix(sentences):
    len_array = np.arange(len(sentences))
    xx, yy = np.meshgrid(len_array, len_array)
    cossim_mat = pd.DataFrame([[round(similarity_matrix.inner_product(sentences[i],sentences[j], normalized=True) ,2) for i, j in zip(x,y)] for y, x in zip(xx, yy)])
    return cossim_mat


In [15]:
# Convert the sentences into bag-of-words vectors.
sentences = [dictionary.doc2bow(simple_preprocess(ele)) for ele in conf_name]

# Generate cosine similarity matrix to identify semantic similarity.
semantic_matrix = create_soft_cossim_matrix(sentences)


In [16]:
# Generated Similarity matrix is a Sparse Matrix.
semantic_matrix


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130
0,1.0,0.0,0.0,0.29,0.2,0.14,0.0,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.1,0.1,0.1,0.11,0.0,0.08,0.0,0.0,0.0,0.0,0.29,0.1,0.0,0.0,0.15,0.0,0.0,0.29,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.0,0.31,0.0,0.0,0.15,0.0,0.25,0.15,0.0,0.0,0.0,0.0,0.12,0.35,0.29,0.0,0.0,0.0,0.0,0.13,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.07,0.0,0.35,0.0,0.0,0.0,0.39,0.29,0.0,0.12,0.12,0.12,0.0,0.0,0.0,0.07,0.35,0.1,0.15,0.29,0.0,0.29,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.41,0.41,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0
1,0.0,1.0,0.0,0.33,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.14,0.14,0.14,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33,0.0,0.11,0.0,0.17,0.0,0.32,0.0,0.0,0.0,0.13,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.53,0.0,0.0,0.0,0.33,0.29
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.29,0.33,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.67,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.13,0.33,0.33,0.0,0.0,0.0,0.0,0.0,0.33,0.0,0.0,0.0,0.33,0.29
4,0.2,0.0,0.0,0.0,1.0,0.77,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.12,0.06,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.49,0.05,0.05,0.05,0.34,0.16,0.04,0.0,0.0,0.0,0.0,0.21,0.33,0.11,0.0,0.55,0.09,0.36,0.15,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.16,0.0,0.0,0.08,0.0,0.18,0.08,0.0,0.0,0.0,0.0,0.06,0.18,0.0,0.07,0.0,0.06,0.0,0.07,0.0,0.29,0.06,0.0,0.0,0.0,0.0,0.04,0.0,0.18,0.0,0.0,0.0,0.56,0.15,0.0,0.06,0.06,0.06,0.07,0.23,0.0,0.23,0.18,0.19,0.08,0.15,0.0,0.15,0.22,0.0,0.0,0.06,0.2,0.0,0.0,0.48,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.06,0.06,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0
5,0.14,0.0,0.0,0.0,0.77,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.46,0.05,0.05,0.05,0.22,0.06,0.05,0.0,0.2,0.0,0.0,0.24,0.31,0.05,0.0,0.52,0.17,0.34,0.17,0.0,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.18,0.0,0.0,0.08,0.0,0.2,0.08,0.0,0.0,0.0,0.0,0.07,0.2,0.0,0.08,0.0,0.07,0.0,0.08,0.0,0.21,0.24,0.0,0.0,0.0,0.0,0.04,0.0,0.2,0.0,0.0,0.0,0.24,0.17,0.0,0.07,0.07,0.07,0.08,0.17,0.0,0.09,0.2,0.21,0.08,0.17,0.0,0.17,0.1,0.0,0.0,0.07,0.05,0.0,0.0,0.41,0.0,0.0,0.23,0.0,0.0,0.0,0.0,0.0,0.07,0.07,0.0,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.14
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.41,0.41,0.41,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.19,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.15,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.11,0.0,0.5,0.0,0.15,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Identify semantic duplicates like “React Conference 2019” ---> “ReactConf ‘19”

similarity_accu = 0.7      # Parameter can be change as per requirement.

counter = 0
for i in range(len(semantic_matrix)):
    for j in range(i+1, len(semantic_matrix)):
        if semantic_matrix.iloc[i, j] >=similarity_accu:
            counter +=1
            print(i+1,'---', j+1)
            print(conf_name[i])
            print(conf_name[j])
            print()

print("Number of semantic duplicates : ", counter)


4 --- 68
AWS Community Day
AWS Community Day

5 --- 6
International Conference On Internet Of Things Big Data Analytics And Information Technology
International Conference On Big Data IoT Cyber Security And Information Technology

15 --- 62
DevOpsDays Vancouver 2020
DevOpsDays Toronto 2020

20 --- 45
Devopscon
DevOpsCon London 2020

25 --- 110
Women in Data Science Pune Conference
The Global Women in Data Science Hyderabad

26 --- 27
MATLAB EXPO 2020 India
MATLAB EXPO 2020 India

26 --- 28
MATLAB EXPO 2020 India
MATLAB EXPO 2020 India

27 --- 28
MATLAB EXPO 2020 India
MATLAB EXPO 2020 India

34 --- 95
Machine Learning Developers
Machine Learning Developers

39 --- 108
Alexa Community Day Bengaluru 2020
Alexa Community Day

48 --- 85
QCon London
QCon

55 --- 67
DevOps Talks Conference Melbourne
DevOps Conference

64 --- 67
 DEVOPS 2020
DevOps Conference

65 --- 130
React Day
React Day Norway

90 --- 91
DevOps Enterprise Summit
DevOps Enterprise Summit

119 --- 120
AWS Summit Mumbai
AWS 