In [1]:
import os
import json
import pandas as pd
import pymongo
import dns
import json
from nltk.sentiment import SentimentAnalyzer

### read the query-doc JSON file

In [2]:
# define the directory path
path_to_json = '/Users/emadarmiti/Desktop/cap-s5/places_ranking/es_qureries/'

# define the query-document dict that collects the content of every json file
query_doc = {}

# get all json files that exist in the directory.
# os.listdir returns the names in arbitrary order and dict.update overwrites
# keys that already exist, so the names are sorted to make the merged result
# the same on every run when a query appears in more than one file
for file_path in sorted(os.listdir(path_to_json)):

    # skip everything that is not a json file
    if not file_path.endswith('.json'):
        continue

    # open the json file (JSON text is UTF-8, do not depend on the platform default)
    with open(os.path.join(path_to_json, file_path), encoding='utf-8') as json_file:

        # merge the file into query_doc, to get all of them in one json variable
        query_doc.update(json.load(json_file))
                       


In [3]:
# number of top-level keys (queries) in the merged query_doc dict
len(query_doc)

23079

### connect to MongoDB

In [4]:
# define the mongoDB endpoint (hosts and connection options only).
# The user name and the password are NOT part of the URI any more: they are read
# from the environment so that no credentials are stored in the notebook.
mongoDB = "mongodb://cluster0-shard-00-00.xlj8q.mongodb.net:27017,cluster0-shard-00-01.xlj8q.mongodb.net:27017,cluster0-shard-00-02.xlj8q.mongodb.net:27017/tweets?ssl=true&replicaSet=atlas-yi054u-shard-0&authSource=admin&retryWrites=true&w=majority"

# define mongodb client.
# os.environ[...] raises KeyError when MONGODB_USERNAME / MONGODB_PASSWORD is
# not set, so a missing credential is reported here and not on the first query.
# Passing the credentials as keyword arguments also avoids having to
# percent-encode special characters, which a URI would require.
mongoDB_client = pymongo.MongoClient(
    mongoDB,
    username=os.environ["MONGODB_USERNAME"],
    password=os.environ["MONGODB_PASSWORD"],
)

In [5]:
# get the database named 'tweets' from the client created above
tweets_database = mongoDB_client['tweets']

# get the 'tweets2018' collection of that database; it is the collection
# that find_tweets_near_place queries
tweets_collection = tweets_database['tweets2018']

### find the tweets that are near the documents

In [6]:
def find_tweets_near_place(coordinates, max_distance=100, min_distance=0):
    """
    This function is used for retrieving the tweets located in a distance range
    (0 m to 100 m by default) around the document location.

    The query is sent to the module-level `tweets_collection` and filters on the
    "place.coordinates" field with the $near operator and a GeoJSON Point.

    [params] : coordinates : the document location, used as the "coordinates"
                             of the GeoJSON Point
                             (NOTE(review): GeoJSON expects [longitude, latitude] -
                             confirm the callers pass the values in that order)
    [params] : max_distance : upper bound of the distance to the location,
                              sent as "$maxDistance" (default 100)
    [params] : min_distance : lower bound of the distance to the location,
                              sent as "$minDistance" (default 0)

    [return] : list of tweets

    """

    # describe the document location as a GeoJSON point
    location_point = {
        "type": "Point",
        "coordinates": coordinates
    }

    # define the body of the query
    myquery = {
        "place.coordinates": {
            "$near": {
                "$geometry": location_point,
                "$maxDistance": max_distance,
                "$minDistance": min_distance
            }
        }
    }

    # send the query and materialize the cursor into a list
    return list(tweets_collection.find(myquery))


In [16]:
# define the result dict that will hold, per query, the documents that have tweets
tweets_query_doc = dict()

In [17]:
# walk through every query and its documents, and ask MongoDB (through
# find_tweets_near_place) for the tweets around each document
for query, documents in query_doc.items():

    doc_tweets = []

    for doc in documents:

        source = doc['_source']

        # take the values of the location dict as the coordinates
        # NOTE(review): this relies on the key order of the location dict matching
        # the order the geo query expects - confirm against the stored documents
        coordinates = list(source['location'].values())

        # fetch the tweets around the document
        tweets = find_tweets_near_place(coordinates)

        for tweet in tweets:
            # drop the mongoDB id, which is an object
            tweet.pop('_id', None)
            # attach the score of the document to every one of its tweets
            tweet['elasticsearch_score'] = doc['_score']

        # documents without any tweet are left out
        if tweets:
            doc_tweets.append({"name": source['name'],
                               "location": source['location'],
                               "tweets": tweets})

    # queries whose documents have no tweets are left out
    if doc_tweets:
        tweets_query_doc[query] = doc_tweets

### save the query-doc-tweets into a JSON file

In [36]:
def dict_to_json(dictionary, outfile_path):
    """
    Serialize a dict as JSON and write it to a file.
    [params] : dictionary : the data to serialize
    [params] : outfile_path : the path of the json file to write
                              (opened in "w" mode, so existing content is replaced)
    """

    # the file is closed automatically when the with-block ends
    with open(outfile_path, mode="w") as json_handle:
        json.dump(obj=dictionary, fp=json_handle)

In [37]:
# path of the json file that receives the query-doc-tweets data
json_outfile_path = "/Users/emadarmiti/Desktop/cap-s5/places_ranking/tweets_query_doc.json"

# store the collected data in that json file
dict_to_json(dictionary=tweets_query_doc, outfile_path=json_outfile_path)


### build the dataframe 

In [9]:
# define the keys to slice from tweets
KEYS = ['query', 'document', "time", "tweet", "replies_count",
        "retweets_count", "likes_count","elasticsearch_score"]

In [10]:
# create an empty dataframe whose columns are the names listed in KEYS;
# the rows are added by the loop over tweets_query_doc
data = pd.DataFrame(columns = KEYS) 

In [11]:
# display the dataframe; at this point it only has the KEYS columns and no rows
data

Unnamed: 0,query,document,time,tweet,replies_count,retweets_count,likes_count,elasticsearch_score


In [12]:

# flatten query -> documents -> tweets into one record per tweet
rows = []

for query, docs in tweets_query_doc.items():

    # tweets_query_doc[query] is a list of dicts with the keys
    # "name", "location" and "tweets" (it is not a dict keyed by document name)
    for doc in docs:

        for tweet in doc['tweets']:

            # a fresh dict per tweet, so the records do not share state
            row = {KEYS[0]: query, KEYS[1]: doc['name']}

            # slice the remaining keys from the tweet
            for key in KEYS[2:]:
                row[key] = tweet[key]

            rows.append(row)

# build the dataframe once from all the records: DataFrame.append copied the
# whole frame for every row and was removed in pandas 2.0
data = pd.DataFrame(rows, columns=KEYS)
            
        
        

In [13]:
# display the dataframe, which holds one row per tweet
data

Unnamed: 0,query,document,time,tweet,replies_count,retweets_count,likes_count,elasticsearch_score
0,Dix/Adler,Johnathan Adler,19:44:21,"I'm at Neofytos in New York, NY https://t.co/...",0,0,0,10.596428
1,Dix/Adler,Johnathan Adler,20:47:04,"I'm at Caffe Grazie in New York, NY https://t...",0,0,0,10.596428
2,Dix/Adler,Johnathan Adler,17:50:53,"I'm at Starbucks Reserve in New York, NY w/ @p...",0,0,0,10.596428
3,Dix/Adler,Johnathan Adler,16:44:09,"I'm at Starbucks Reserve in New York, NY http...",0,0,0,10.596428
4,Dix/Adler,Johnathan Adler,01:52:18,"I'm at Starbucks Reserve in New York, NY http...",0,0,0,10.596428
...,...,...,...,...,...,...,...,...
1913,Esty,Kool-est Shoes,12:25:53,"I'm at LA Fitness in Staten Island, NY https:...",0,0,0,6.601116
1914,Esty,Kool-est Shoes,12:27:51,"I'm at LA Fitness in Staten Island, NY https:...",0,0,0,6.601116
1915,Esty,Kool-est Shoes,12:25:31,"I'm at LA Fitness in Staten Island, NY https:...",1,0,0,6.601116
1916,Esty,Kool-est Shoes,12:16:39,"I'm at LA Fitness in Staten Island, NY https:...",0,0,0,6.601116


In [14]:
# summarize every (query, document) pair: number of tweets, total replies,
# likes and retweets, and the first elasticsearch score found in the group
(
    data
    .groupby(['query', 'document'])
    .agg({
        'tweet': 'count',
        'replies_count': 'sum',
        'likes_count': 'sum',
        'retweets_count': 'sum',
        'elasticsearch_score': 'first',
    })
    .reset_index()
)

Unnamed: 0,query,document,tweet,replies_count,likes_count,retweets_count,elasticsearch_score
0,Dix/Adler,Johnathan Adler,17,0,2,0,10.596428
1,Dix/Adler,Jonathan Adler,30,0,3,0,10.596423
2,Downt,Chau Down,43,7,5,1,7.047823
3,Downt,Dont know,66,1,6,0,7.047838
4,Downt,Double Down Saloon,51,3,13,1,6.029442
5,Downt,Up & Down,151,5,57,2,7.047873
6,EastR,East Wok,6,0,4,0,4.434819
7,EastRi,Eastern,6,0,0,0,4.747308
8,EastRi,Kasuri,3,2,1,0,4.749061
9,Elk La,La La Land,5,0,0,0,6.982682


In [None]:
#data.to_csv('/Users/emadarmiti/Desktop/cap-s5/places_ranking/training_data.csv')