In [322]:
import os
import json
import pandas as pd
import pymongo
import dns
import json
from nltk.sentiment import SentimentAnalyzer
import haversine as hs
from sklearn.metrics.pairwise import cosine_similarity


### read the query-doc JSON file

In [242]:
# define the directory path
path_to_json = '/Users/emadarmiti/Desktop/cap-s5/places_ranking/es_qureries/'

# define the query-document json file 
query_doc = json.loads('{}')

# get all json files that exist in the directory
for file_path in os.listdir(path_to_json):
    
    if file_path.endswith('.json'):
        
        # open the json file 
        with open(os.path.join(path_to_json,file_path)) as json_file:
            
            # append the json file, to get all of them in on json variable
            query_doc.update(json.load(json_file))

In [243]:
len(query_doc)

23079

### connect to MongoDB

In [244]:
def connect_mongodb(mongoDB_url, database_name, collection_name):
    """connect to the mongoDB 

    Args:
        mongoDB_url : mongoDB endpoint url
        database_name : database name
        collection_name : collection name

    Returns:
        the database collection object
    """

    # create mongodb client
    mongoDB_client = pymongo.MongoClient(mongoDB_url)
    
    # get the database
    tweets_database = mongoDB_client[database_name]

    # get the collection
    tweets_collection = tweets_database[collection_name]
    
    # return the collection object
    return tweets_collection

In [245]:
# define the mongoDB endpoint
mongodb_url = "mongodb://rama:rama@cluster0-shard-00-00.xlj8q.mongodb.net:27017,cluster0-shard-00-01.xlj8q.mongodb.net:27017,cluster0-shard-00-02.xlj8q.mongodb.net:27017/tweets?ssl=true&replicaSet=atlas-yi054u-shard-0&authSource=admin&retryWrites=true&w=majority"

# connect to the database
tweets_collection = connect_mongodb(mongodb_url, "tweets", "tweets2018")

### find the tweets that near to the documents

In [255]:
def find_tweets_near_place(coordinates):
    """find the tweets within 100m radius from the passed coordinates

    Args:
       coordinates : lon,lat for the central point

    Returns:
        extracted info from the tweets
    """
    
    # define the body of the query
    myquery =  [
        
      { "$geoNear": {
          
             "near": { "type": "Point", "coordinates": coordinates },
             "distanceField": "place.coordinates",
             "maxDistance": 100}}
        
        
    ,{  "$group": { 
        
            "_id": None,
            "tweets_count": { "$sum": 1 },
            "tweets_average_length" : { "$avg" : {"$strLenCP" : "$tweet"}},
            "replies_count": { "$sum": "replies_count"},
            "retweets_count": { "$sum": "retweets_count"}, 
            "likes_count": { "$sum": "likes_count"},
            "hashtags": { "$sum": {"$size" : "$hashtags"}},
            "mentions": { "$sum": {"$size" : "$mentions"}}}}]

    # send the query and return the results
    return list(tweets_collection.aggregate(myquery))

### build the dataset

In [306]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [317]:
def build_dataset(documents, query, user_location):
    
    # define the list result of the documents data
    documents_data = []
    
    # go over each document and gather its info
    for doc in documents:
       
        # slice the location of the document
        coordinates = list(doc['_source']['location'].values())
        
       
        
        # get the tweets
        tweets = find_tweets_near_place(coordinates)
        
        # neglect the doc that don't have tweets
        if len(tweets):
            
            # get the document name
            document = doc['_source']['name']
            
            
            # find the jaccard similarity
            jaccard_entire = jaccard_similarity(query.split(" "), document.split(" "))
            
            # slice the first 3 characters and find the jaccard similaity
            sub_query = [word[:3] for word in query.split(" ")]
            sub_document = [word[:3] for word in document.split(" ")]
            
            sub_jaccard = jaccard_similarity(sub_query, sub_document)
            
            # check if the query and the document have the same prefix
            prefix_match = query[:3] == document[:3]
            
            # create two tuples of coordinates
            tweet_loc = (coordinates[0], coordinates[1])
            user_loc = (user_location[0], user_location[1])

            # get the distance between the user and the document
            distance = hs.haversine(tweet_loc,user_loc)
            
            # define the dict for the document
            doc_data = {"query" : query,
                        "document" : document,
                        "query_length" : len(query),
                        "document_length" : len(doc['_source']['name']),
                        "jaccard_entire" : jaccard_entire,
                        "sub_jaccard" : sub_jaccard,
                        "prefix_match" : prefix_match,
                        "elasticsearch_score" : doc['_score'],
                        "distance" : distance}
            
            
            # add the tweets info
            doc_data.update(tweets[0])
            
            # append the doc dict to the list result
            documents_data.append(doc_data)
    
    # return the documents data
    return documents_data

In [318]:
# define the result dict for all query-doc 
tweets_query_doc = []

In [319]:
# go over each query-doc and use the previous function to get the tweets info from the MongoDB
for query in list(query_doc.keys())[:10]:
    
    # build the dataset for one query
    query_doc_data = build_dataset(query_doc[query], query, [40.7128, -74.0060])
    
    # neglect the queries that don't have documents with tweets
    if len(query_doc_data):
        tweets_query_doc.extend(query_doc_data)

In [320]:
data = pd.json_normalize(tweets_query_doc)
data = data.drop('_id', 1)

In [321]:
data

Unnamed: 0,query,document,query_length,document_length,jaccard_entire,sub_jaccard,prefix_match,elasticsearch_score,distance,tweets_count,tweets_average_length,replies_count,retweets_count,likes_count,hashtags,mentions
0,Dix/Adler,Johnathan Adler,9,15,0.0,0.0,False,10.596428,8.283963,17,75.117647,0,0,0,0,10
1,Dix/Adler,Jonathan Adler,9,14,0.0,0.0,False,10.596423,7.73684,30,76.233333,0,0,0,0,22
2,Downt,Up & Down,5,9,0.0,0.333333,False,7.047873,2.970166,151,73.97351,0,0,0,3,83
3,Downt,Dont know,5,9,0.0,0.0,False,7.047838,1.883768,66,74.681818,0,0,0,0,32
4,Downt,Chau Down,5,9,0.0,0.5,False,7.047823,3.38079,43,75.023256,0,0,0,0,18
5,Downt,Double Down Saloon,5,18,0.0,0.333333,False,6.029442,2.016149,51,75.019608,0,0,0,0,36
6,EastRi,Kasuri,6,6,0.0,0.0,False,4.749061,172.513834,3,84.333333,0,0,0,0,2
7,EastRi,Eastern,6,7,0.0,1.0,True,4.747308,8.641929,6,68.0,0,0,0,0,4
8,EastR,East Wok,5,8,0.0,0.5,True,4.434819,223.48208,6,66.833333,0,0,0,0,6
9,Elk La,La La Taqueria,6,14,0.333333,0.333333,False,6.982998,32.076999,3,182.333333,0,0,0,0,1


In [277]:
data.columns

Index(['query', 'document', 'query_length', 'document_length',
       'elasticsearch_score', 'distance', 'tweets_count',
       'tweets_average_length', 'replies_count', 'retweets_count',
       'likes_count', 'hashtags', 'mentions'],
      dtype='object')

In [74]:
#data.to_csv('/Users/emadarmiti/Desktop/cap-s5/places_ranking/row_data.csv')


### build the dataframe 

### save the query-doc-tweets into a JSON file

# tf-idf?
# should we delete non tweets doc?
# why just two featrues in labeling?
# hashtags
# query/document length
# nlp
# tweets average length
# mentions - boolean
## label = nlp + distance + elasicsearch + tweets?


In [293]:
from sklearn.feature_extraction.text import TfidfTransformer

victorizer = CountVectorizer(stop_words='english', lowercase=True)

term_frequency = victorizer.fit_transform(['emad','em'])

In [295]:
tf_idf_tranformer = TfidfTransformer(sublinear_tf=True, smooth_idf=False)

   
tf_idf = tf_idf_tranformer.fit_transform(term_frequency).T.todense()



In [296]:
tf_idf

matrix([[0., 1.],
        [1., 0.]])