## Aggregated News Data Analysis

### This script computes the aggregated result on News data, which will be displayed on the dashboard

#### -Daily News Sentiment score (News page)
#### -Daily News Sentiment score by tagging (News page)

In [6]:
from pymongo import MongoClient
from pprint import pprint
from bson.son import SON
import pandas as pd
from datetime import datetime, timedelta
import time
import matplotlib.pyplot as plt
import json

In [7]:
client_new = MongoClient('mongodb://igenie_readwrite:igenie@35.189.89.82:27017/dax_gcp')
db = client_new.dax_gcp
news_data = db['all_news']
news_data_df = pd.DataFrame(list(db['all_news'].find()))

In [8]:
news_data_df.dtypes

DataModelId                  object
NEWS_DATE_NewsDim            object
NEWS_PUBLICATION_NewsDim     object
NEWS_SOURCE_NewsDim          object
NEWS_TITLE_NewsDim           object
RecordId                     object
RepIdx                       object
RepIdx1                      object
RepIdx2                      object
RepIdx3                      object
_id                          object
categorised_tag              object
constituent                  object
count                         int64
score                       float64
sentiment                    object
show                           bool
dtype: object

In [9]:
news_data_df.head()

Unnamed: 0,DataModelId,NEWS_DATE_NewsDim,NEWS_PUBLICATION_NewsDim,NEWS_SOURCE_NewsDim,NEWS_TITLE_NewsDim,RecordId,RepIdx,RepIdx1,RepIdx2,RepIdx3,_id,categorised_tag,constituent,count,score,sentiment,show
0,CWVB,9/15/2017,M2 Communications - M2 PressWIRE,Acquire Media,Global Football Market Research Report by Play...,DE8190216927_31,0,0,0,0,59df86a6a185722277d895fd,,adidas,2,0.614701,positive,True
1,CWVB,9/14/2017,,Acquire Media,Global Football Market Research Report by Play...,DE8190216927_31,1,1,1,1,59df86a6a185722277d895fe,,adidas,2,0.614701,positive,False
2,CWVB,9/13/2017,Benzinga - Benzinga Lightning Feed,Acquire Media,A Primer For Nike's Q1 Earnings Report,DE8190216927_31,2,2,2,2,59df86a6a185722277d895ff,,adidas,1,0.163299,neutral,True
3,CWVB,9/12/2017,ReleaseWire / SBWire - ReleaseWire,Acquire Media,2017 Global Wireless Health and Fitness Device...,DE8190216927_31,3,3,3,3,59df86a6a185722277d89600,,adidas,1,0.376889,positive,True
4,CWVB,9/12/2017,ReleaseWire / SBWire - ReleaseWire,Acquire Media,Socks Market Is Projected to Reflect at 6.2% C...,DE8190216927_31,4,4,4,4,59df86a6a185722277d89601,Stocks,adidas,1,0.0,neutral,True


In [10]:
##Process the data to get rid of invalid dates
news_data_df=news_data_df[news_data_df['NEWS_DATE_NewsDim']!='Acquire Media']
news_data_df=news_data_df[news_data_df['NEWS_DATE_NewsDim']!='']
news_data_df=news_data_df.ix[news_data_df['NEWS_DATE_NewsDim'].str.len()<=10]
news_data_df = news_data_df.ix[news_data_df['NEWS_DATE_NewsDim'].str.len()>=6]
news_data_df['NEWS_DATE_NewsDim'] = pd.to_datetime(pd.Series(news_data_df['NEWS_DATE_NewsDim']),format="%m/%d/%Y")

## Daily News Sentiment Score

In [20]:
##Obtain the average daily sentiment score of news for a given constituent
#NOTE: USE LOWER CASES for constituent: bmw,adidas,Deutsche Bank, eon, commerzbank
def daily_news_sentiment(news_df,constituent):
    from_date = datetime(2017,9,7)
    to_date = datetime(2017,9,21)
    constituent_df = news_df[news_df['constituent']==constituent] 
    df_crop = constituent_df[(constituent_df['NEWS_DATE_NewsDim']>=from_date)&(constituent_df['NEWS_DATE_NewsDim']<=to_date)]
    test = df_crop[['NEWS_DATE_NewsDim','score']]
    news_sent = df_crop.groupby('NEWS_DATE_NewsDim').mean()
    news_sent = news_sent.add_suffix('score').reset_index() #fix the column positions from an aggregated result. 
    news_sent['score']=news_sent['scorescore']
    news_sent['date']=news_sent['NEWS_DATE_NewsDim'].astype(str) #Fix the column names
    news_sent=news_sent[['date','scorescore']] #only extract the date and score
    news_sent['constituent']=constituent
    return news_sent

In [21]:
news_sent=daily_news_sentiment(news_data_df,'commerzbank') #bmw,adidas,Deutsche Bank, eon, commerzbank

In [22]:
news_sent

Unnamed: 0,date,scorescore,constituent
0,2017-09-07,0.064547,commerzbank
1,2017-09-08,0.002152,commerzbank
2,2017-09-09,0.014055,commerzbank
3,2017-09-10,0.223607,commerzbank
4,2017-09-11,0.030006,commerzbank
5,2017-09-12,0.045331,commerzbank
6,2017-09-13,-0.034459,commerzbank
7,2017-09-14,0.070868,commerzbank
8,2017-09-15,-0.03125,commerzbank
9,2017-09-18,0.167302,commerzbank


In [213]:
##upload results on mongodb
##news_daily_sent_commerzbank,news_daily_sent_bmw,news_daily_sent_adidas,news_daily_sent_eon,news_daily_sent_deutsche_bank
db['news_daily_sent_commerzbank'].drop()
sent_json = json.loads(test.to_json(orient='records'))
db['news_daily_sent_commerzbank'].insert_many(sent_json)

<pymongo.results.InsertManyResult at 0x12c4c2b40>

## Average News Sentiment Score by News Tags

In [24]:
##Only extract the news items that are tagged. 
tagged_news = news_data_df[news_data_df['categorised_tag']!='NA']
tagged_news.head()

Unnamed: 0,DataModelId,NEWS_DATE_NewsDim,NEWS_PUBLICATION_NewsDim,NEWS_SOURCE_NewsDim,NEWS_TITLE_NewsDim,RecordId,RepIdx,RepIdx1,RepIdx2,RepIdx3,_id,categorised_tag,constituent,count,score,sentiment,show
4,CWVB,2017-09-12,ReleaseWire / SBWire - ReleaseWire,Acquire Media,Socks Market Is Projected to Reflect at 6.2% C...,DE8190216927_31,4,4,4,4,59df86a6a185722277d89601,Stocks,adidas,1,0.0,neutral,True
6,CWVB,2017-09-12,ReleaseWire / SBWire - ReleaseWire,Acquire Media,Fitness Equipment Market Is Projected to Refle...,DE8190216927_31,6,6,6,6,59df86a6a185722277d89603,Shares,adidas,1,0.189737,neutral,True
11,CWVB,2017-09-09,Al Bawaba (Middle East) Ltd. - Egypt Independent,Acquire Media,adidas AG (ADS) PT Set at 211.00 by equinet AG...,DE8190216927_31,11,11,11,11,59df86a6a185722277d89608,Shares,adidas,1,0.0,neutral,True
12,CWVB,2017-09-08,Pedia Content Solutions - Plastics Patent News,Acquire Media,Adidas AG Seeks Patent for Expanded Polymer Pe...,DE8190216927_31,12,12,12,12,59df86a6a185722277d89609,Patent,adidas,1,0.088388,neutral,True
13,CWVB,2017-09-07,NewsRx.com - Politics &amp; Government Week,Acquire Media,"Researchers Submit Patent Application, ""System...",DE8190216927_31,13,13,13,13,59df86a6a185722277d8960a,Patent,adidas,1,0.555128,positive,True


In [27]:
def daily_news_sentiment_tagged(news_df,constituent):
    from_date = datetime(2017,9,7)
    to_date = datetime(2017,9,21)
    constituent = 'adidas' #bmw,adidas,Deutsche Bank, eon, commerzbank
    constituent_df = tagged_news[tagged_news['constituent']==constituent]
    #Change format of the date column
    constituent_df['NEWS_DATE_NewsDim'] = pd.to_datetime(pd.Series(constituent_df['NEWS_DATE_NewsDim']),format="%m/%d/%Y")
    df_crop = constituent_df[(constituent_df['NEWS_DATE_NewsDim']>=from_date)&(constituent_df['NEWS_DATE_NewsDim']<=to_date)]
    df_crop = df_crop[['NEWS_DATE_NewsDim','categorised_tag','score']]
    result = df_crop.groupby(['NEWS_DATE_NewsDim','categorised_tag']).mean()
    result = result.add_suffix('score').reset_index()
    result['constituent']=constituent
    result['date']=result['NEWS_DATE_NewsDim'].astype(str)
    result = result[['constituent','date','categorised_tag','scorescore']]
    ##Calcualte average score, grouped by date, grouped by tag. 
    return result

In [28]:
tag_score = daily_news_sentiment_tagged(tagged_news,'adidas') #bmw,adidas,Deutsche Bank, eon, commerzbank

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [29]:
tag_score

Unnamed: 0,constituent,date,categorised_tag,scorescore
0,adidas,2017-09-07,Patent,0.555128
1,adidas,2017-09-07,Shares,0.298151
2,adidas,2017-09-08,Patent,0.088388
3,adidas,2017-09-09,Shares,0.0
4,adidas,2017-09-12,Shares,0.189737
5,adidas,2017-09-12,Stocks,0.0


In [151]:
##upload on mongodb
##news_daily_sent_commerzbank,bmw,adidas,eon,deutsche_bank
tagscore_json = json.loads(tag_score.to_json(orient='records'))
db['news_tagging_score'].insert_many(tagscore_json)

<pymongo.results.InsertManyResult at 0x121221190>