# Aggregated Twitter Analysis

### This code performs aggregates on the 'tweets' collection to obtain data for the Dashboard.

#### -Twitter count and sentiment by geography (Twitter Page)
#### -Top Organizations/words mentioned in Twitter(Twitter Page)
#### -Top positive words (not shown on dashboard)

In [39]:
import pymongo
from pymongo import MongoClient
import pandas as pd
import datetime
from datetime import datetime, timedelta
import time
import matplotlib.pyplot as plt
import numpy as np
from decimal import Decimal
import re
from re import sub
import json
import bson
from bson.son import SON

In [9]:
client_new = MongoClient('mongodb://igenie_readwrite:igenie@35.189.89.82:27017/dax_gcp')
db = client_new.dax_gcp
twitter_analysis = list(db['twitter_analytics'].find())
twitter_analysis=  pd.DataFrame(twitter_analysis)
tweets = db['tweets']

## Number and Sentiment of countries tweeting about a constituent

In [18]:
def country_count(tweets,constituent):
    #Count by geography
    start_date = datetime(2017,7,21)
    finish_date = datetime(2017,10,6)
    pipeline = [
        {"$match":{"constituent":constituent, 'relevance':1,
               "date":{"$gte":start_date, "$lte":finish_date}}
        },
        {"$group":{"_id":"$place.country_code", "count":{"$sum":1}}}
        ,{"$sort":SON([("count",-1),("_id",-1)])}
        ]
    topcountry = list(tweets.aggregate(pipeline))
    df_count= pd.DataFrame(topcountry)
    df_count['constituent']=constituent
    
    #Sentiment by geography
    pipeline = [
    {"$match":{"constituent":constituent,"relevance":1,
               "date":{"$gte":start_date, "$lte":finish_date}}},
    {"$group":{"_id":"$place.country_code", "avg_sentiment":{"$avg":"$nltk_sentiment_numeric"}}}
        ,{"$sort":SON([("avg_sentiment",-1),("_id",-1)])}
    ]
    df_sentiment = pd.DataFrame(list(tweets.aggregate(pipeline)))
    df_sentiment['country']=df_sentiment['_id']
    
    ##Merge data on sentiment and count together, removing the id column
    count_sentiment = df_count.merge(df_sentiment,on='_id',how='inner')
    count_sentiment = count_sentiment.drop('_id',axis=1)
    count_sentiment['status']='active'
    return count_sentiment

In [19]:
count_sentiment=country_count(tweets,'BMW')

In [25]:
count_sentiment.head()

Unnamed: 0,count,constituent,avg_sentiment,country
0,3,BMW,-0.213933,ZA
1,6,BMW,-0.14085,DE
2,1,BMW,0.0,AE
3,1,BMW,0.0,NZ
4,5,BMW,0.13194,MX


In [20]:
### Add results to collection, country_data
##count_sentiment_json = json.loads(count_sentiment.to_json(orient='records'))
#db['country_data'].insert_many(count_sentiment_json)

In [47]:
##Or collect geographical analylysis, iterating through a list of constituents
def country_count_collection(tweets_collection,constituent_list,country_data_collection):
    for constituent in constituent_list:
        #Update the status of the previous data
        country_data_collection.update_many({'constituent':constituent,'status':'active'}, {'$set': {'status': 'inactive'}},True,True)
        count_sentiment=country_count(tweets,constituent)
        count_sentiment_json = json.loads(count_sentiment.to_json(orient='records'))
        country_data_collection.insert_many(count_sentiment_json)

In [None]:
constituent_list = ['adidas','BMW','Commerzbank','Deutsche Bank','EON']
tweets_collection = tweets
country_data_collection=db['country_data']
country_count_collection(tweets_collection,constituent_list,country_data_collection)

## Top positive words

In [37]:
#This function gives the counts of top positive words for one constituent
def top_positive_words(tweets,constituent):
    from_date = datetime(2017,7,21)
    to_date = datetime(2017,10,6)
    pipeline = [
        {"$match":{"constituent":constituent, 
               "date":{"$gte":from_date, "$lte":to_date},
               "nltk_sentiment_numeric":{"$gte":0.25}}},
        {"$unwind":"$processed_text"},
        {"$group":{"_id":"$processed_text", "count":{"$sum":1}}},
        {"$sort":SON([("count",-1),("_id",-1)])}]
    top_positive_words = list(tweets.aggregate(pipeline))
    top_positive_words = pd.DataFrame(top_positive_words)
    top_positive_words['positive_words']=top_positive_words['_id']
    top_positive_words['constituent']=constituent
    top_positive_words['status']='active'
    top_ten_positive_words=top_positive_words.iloc[:10]
    return top_ten_positive_words

In [40]:
top_ten_positive_words=top_positive_words(tweets,'BMW')

In [41]:
top_ten_positive_words
#Add results into collection, top_ten_postiive_words
#top_words_json = json.loads(top_words.to_json(orient='records'))
#db['top_positive_words'].insert_many(top_words_json)

Unnamed: 0,_id,count,positive_words,constituent
0,bmw,123177,bmw,BMW
1,rt,63313,rt,BMW
2,…,28706,…,BMW
3,like,24705,like,BMW
4,pleas,15065,pleas,BMW
5,new,12633,new,BMW
6,car,12210,car,BMW
7,#cars,10254,#cars,BMW
8,i8,10168,i8,BMW
9,video,9686,video,BMW


## Top organisations/mentioned in tweets

In [45]:
##This function collects the count of tweets of top organizations for one constituent
def top_organizations(tweets,constituent):
    from_date = datetime(2017,7,21)
    to_date = datetime(2017,10,6)
    pipeline = [
    {"$match":{"constituent":constituent,"relevance":1, 
               "date":{"$gte":from_date, "$lte":to_date}
              }
    },
    {"$unwind":"$tag_ORGANIZATION"},
    {"$group":{"_id":"$tag_ORGANIZATION", "count":{"$sum":1}}}
    ,{"$sort":SON([("count",-1),("_id",-1)])}
    ]
    organization_list = list(tweets.aggregate(pipeline))
    organization_df = pd.DataFrame(organization_list)
    organization_df['organisation']=organization_df['_id']
    organization_df['constituent']=constituent
    #Extract the top 10 organizations mentioned
    organization_df=organization_df.iloc[:10]
    return organization_df

In [46]:
organization_df=top_organizations(tweets,'BMW')
#organization_json = json.loads(organization_df.to_json(orient='records'))
#db['twitter_top_organizations'].insert_many(organizations_json)

In [48]:
organization_df.head()

Unnamed: 0,_id,count,organisation,constituent
0,bmw,4934,bmw,BMW
1,daimler,888,daimler,BMW
2,brexit,584,brexit,BMW
3,vw,308,vw,BMW
4,porsche,251,porsche,BMW


In [None]:
##This function collects the top organization data for a given list of constituents
def top_organizations_collection(tweets_collection,constituent_list,top_organization_collection):
    for constituent in constituent_list:
        #Update the status of the previous data
        top_organization_collection.update_many({'constituent':constituent,'status':'active'}, {'$set': {'status': 'inactive'}},True,True)
        organization_df=top_organizations(tweets,constituent)
        organization_json = json.loads(organization_df.to_json(orient='records'))
        top_organization_collection.insert_many(organizations_json)
        