In [108]:
import os
import json
import pandas as pd
import pymongo
import dns
import json
from nltk.sentiment import SentimentAnalyzer
import haversine as hs
from sklearn.metrics.pairwise import cosine_similarity
import random
import sys
import math
from random import randrange
import numpy as np
from sklearn import preprocessing
import math

### read the query-doc JSON file

In [56]:
# define the directory path
path_to_json = '/Users/emadarmiti/Desktop/cap-s5/places_ranking/es_qureries/'

# define the query-document json file 
query_doc = json.loads('{}')

# get all json files that exist in the directory
for file_path in os.listdir(path_to_json):
    
    if file_path.endswith('.json'):
        
        # open the json file 
        with open(os.path.join(path_to_json,file_path)) as json_file:
            
            # append the json file, to get all of them in on json variable
            query_doc.update(json.load(json_file))

In [57]:
len(query_doc)

23079

### connect to MongoDB

In [58]:
def connect_mongodb(mongoDB_url, database_name, collection_name):
    """connect to the mongoDB 

    Args:
        mongoDB_url : mongoDB endpoint url
        database_name : database name
        collection_name : collection name

    Returns:
        the database collection object
    """

    # create mongodb client
    mongoDB_client = pymongo.MongoClient(mongoDB_url)
    
    # get the database
    tweets_database = mongoDB_client[database_name]

    # get the collection
    tweets_collection = tweets_database[collection_name]
    
    # return the collection object
    return tweets_collection

In [59]:
# define the mongoDB endpoint
mongodb_url = "mongodb://rama:rama@cluster0-shard-00-00.xlj8q.mongodb.net:27017,cluster0-shard-00-01.xlj8q.mongodb.net:27017,cluster0-shard-00-02.xlj8q.mongodb.net:27017/tweets?ssl=true&replicaSet=atlas-yi054u-shard-0&authSource=admin&retryWrites=true&w=majority"

# connect to the database
tweets_collection = connect_mongodb(mongodb_url, "tweets", "tweetsFinal")

### find the tweets that near to the documents

In [60]:
def find_tweets_near_place(coordinates):
    """find the tweets within 100m radius from the passed coordinates

    Args:
       coordinates : lon,lat for the central point

    Returns:
        extracted info from the tweets
    """
    
    # define the body of the query
    myquery =  [
        
      { "$geoNear": {
          
             "near": { "type": "Point", "coordinates": coordinates },
             "distanceField": "place.coordinates",
             "maxDistance": 100}}
        
        
    ,{  "$group": { 
        
            "_id": None,
            "tweets_count": { "$sum": 1 },
            "tweets_average_length" : { "$avg" : {"$strLenCP" : "$tweet"}},
            "replies_count": { "$sum": "replies_count"},
            "retweets_count": { "$sum": "retweets_count"}, 
            "likes_count": { "$sum": "likes_count"},
            "popularity": { "$avg": "$popularity"},
            "hashtags": { "$sum": {"$size" : "$hashtags"}},
            "mentions": { "$sum": {"$size" : "$mentions"}}}}]

    # send the query and return the results
    return list(tweets_collection.aggregate(myquery))

### build the dataset

In [61]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [62]:
def get_random_coordinates(coordinates, radius):
                   
    radiusInDegrees=radius/111300            
    r = radiusInDegrees

    u = float(random.uniform(0.0,1.0))
    v = float(random.uniform(0.0,1.0))
    w = r * math.sqrt(u)
    t = 2 * math.pi * v
    x = w * math.cos(t) 
    y = w * math.sin(t)

    xLat  = x + coordinates[0]
    yLong = y + coordinates[1]

    return [xLat, yLong]

In [63]:
def build_dataset(documents, query):
    
    # get the coordinates for one document to get a random user lcation 
    temp_coordiates = list(documents[0]['_source']['location'].values())
    
    # get a random user location 
    user_location = get_random_coordinates(temp_coordiates, randrange(10))
    
    # define the list result of the documents data
    documents_data = []
    
    # go over each document and gather its info
    for doc in documents:
       
        # slice the location of the document
        coordinates = list(doc['_source']['location'].values())
        
        
        # get the tweets
        tweets = find_tweets_near_place(coordinates)
        
        # neglect the doc that don't have tweets
        if len(tweets):
            
            # get the document name
            document = doc['_source']['name']
            
            # find the jaccard similarity
            jaccard_entire = jaccard_similarity(query.split(" "), document.split(" "))
            
            # slice the first 3 characters and find the jaccard similaity
            sub_query = [word[:3] for word in query.split(" ")]
            sub_document = [word[:3] for word in document.split(" ")]
            
            sub_jaccard = jaccard_similarity(sub_query, sub_document)
            
            # check if the query and the document have the same prefix
            prefix_match = query[:3] == document[:3]
            
            # create two tuples of coordinates
            tweet_loc = (coordinates[0], coordinates[1])
            user_loc = (user_location[0], user_location[1])

            # get the distance between the user and the document(in meters)
            distance = hs.haversine(tweet_loc,user_loc)
            
            # define the dict for the document
            doc_data = {"query" : query,
                        "document" : document,
                        "query_length" : len(query),
                        "document_length" : len(doc['_source']['name']),
                        "jaccard_entire" : jaccard_entire,
                        "sub_jaccard" : sub_jaccard,
                        "prefix_match" : prefix_match,
                        "elasticsearch_score" : doc['_score'],
                        "distance" : distance}
            
            
            # add the tweets info
            doc_data.update(tweets[0])
            
            # append the doc dict to the list result
            documents_data.append(doc_data)
    
    # return the documents data
    return documents_data

In [188]:
# dataframem empty
data = pd.DataFrame(columns = ['query', 'document', 'query_length', 'document_length',
       'jaccard_entire', 'sub_jaccard', 'prefix_match', 'elasticsearch_score',
       'distance', 'tweets_count', 'tweets_average_length', 'replies_count',
       'retweets_count', 'likes_count', 'popularity', 'hashtags', 'mentions']) 

In [189]:
def scaling(nubmers):
    scale = preprocessing.minmax_scale(nubmers, feature_range = ((1, 2, 3, 4, 5)[-len(nubmers):][0],
                                                           (1, 2, 3, 4, 5)[-len(nubmers):][-1]))
    return np.around(scale)

In [190]:
# go over each query-doc and use the previous function to get the tweets info from the MongoDB
for query in list(query_doc.keys())[:10]:
    
    # build the dataset for one query
    query_doc_data = build_dataset(query_doc[query], query)
    
    # neglect the queries that don't have documents with tweets
    if len(query_doc_data):
        
        #create a small dataframe
        data_sub = pd.json_normalize(query_doc_data)
        data_sub = data_sub.drop('_id', 1)
        
        #add the socres
        distance = np.array(data_sub['distance'])
        jaccard = np.array(data_sub['sub_jaccard'])
        
        if len(distance) == 1:
            data_sub['label'] = [5]
            
        else:
            scaled_distance = scaling(-distance)
            scaled_jaccard = scaling(jaccard)
            
            label =  np.array(1.1 * scaled_jaccard +  scaled_distance)
        
            data_sub['label'] = scaling(label)

        #append to the original one
        data = data.append(data_sub)
        
        

In [191]:
data

Unnamed: 0,query,document,query_length,document_length,jaccard_entire,sub_jaccard,prefix_match,elasticsearch_score,distance,tweets_count,tweets_average_length,replies_count,retweets_count,likes_count,popularity,hashtags,mentions,label
0,Dix/Adler,Johnathan Adler,9,15,0.0,0.0,False,10.596428,372.269962,17,75.117647,0,0,0,1.512406,0,10,5.0
1,Dix/Adler,Jonathan Adler,9,14,0.0,0.0,False,10.596423,372.302677,30,76.233333,0,0,0,1.612857,0,22,4.0
0,Downt,Up & Down,5,9,0.0,0.333333,False,7.047873,0.000249,150,73.853333,0,0,0,1.490926,3,82,5.0
1,Downt,Dont know,5,9,0.0,0.0,False,7.047838,4.097123,65,74.338462,0,0,0,1.415385,0,32,2.0
2,Downt,Chau Down,5,9,0.0,0.5,False,7.047823,5.818265,42,75.285714,0,0,0,1.517207,0,17,4.0
3,Downt,Double Down Saloon,5,18,0.0,0.333333,False,6.029442,2.301789,51,75.019608,0,0,0,1.607843,0,36,4.0
0,EastRi,Kasuri,6,6,0.0,0.0,False,4.749061,0.002035,3,84.333333,0,0,0,1.333333,0,2,4.0
1,EastRi,Eastern,6,7,0.0,1.0,True,4.747308,166.125773,6,68.0,0,0,0,1.5,0,4,5.0
0,EastR,East Wok,5,8,0.0,0.5,True,4.434819,143.574125,6,66.833333,0,0,0,1.166667,0,6,5.0
0,Elk La,La La Taqueria,6,14,0.333333,0.333333,False,6.982998,310.706174,3,182.333333,0,0,0,1.321908,0,1,5.0


In [74]:
#data.to_csv('/Users/emadarmiti/Desktop/cap-s5/places_ranking/row_data.csv')