In [77]:
import re
import json
import nltk
import datetime
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/iga_niemiec/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/iga_niemiec/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [86]:
input_json_path = "./analysts.json" 
out_path = "./analysist_counted.json"
target_words = ["Tesla", "Musk"]
start_datetime = datetime.datetime(2019, 10, 19, 11, 20, 0, 0)
# TODO target words from a family of words

In [69]:
# remove stop words and punctation

def remove_stop_and_punctation (input_text):
    
    input_text = input_text.lower()
    
    p = re.compile(r'\w+')

    stop_words = set(stopwords.words('english')) 
  
    word_tokens = word_tokenize(input_text) 
  
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  
    filtered_sentence = [] 
  
    for w in word_tokens: 
        if w not in stop_words and bool(p.match(w)): 
            filtered_sentence.append(w) 
  
    return(filtered_sentence)
    


In [70]:
# stemming
# TODO decide if its better or worse with that

def stem_words (input_text):
    ps = PorterStemmer()

    stemmed_sentence = []

    for word in input_text:
        stemmed_sentence.append(ps.stem(word))
    
    return(stemmed_sentence)
    



In [71]:
# calculating nr of words in string
def calculate_words_nr (stemmed_text, target_words):
    
    target_words = [word.lower() for word in target_words]
    stemmed_words = stem_words(target_words)
    
    words_nr = sum(word in stemmed_words for word in stemmed_text)
    
    return(words_nr)


In [72]:
# generate datetime from timestamps
def convert_stamps_to_datetime (start_datetime, stamp):
    # stamps in milisec
    stamp = 8196
    stamp_us = stamp * 1000

    tst_datetime = start_datetime + datetime.timedelta(0,0, stamp_us)
    tst_datetime = tst_datetime.strftime("%Y-%m-%d %H:%M:%S.%f")
    return(tst_datetime)

In [83]:
# get vader sentiment analysis score
def sentiment_analyzer_score(sentence):
    
    analyser = SentimentIntensityAnalyzer()
    score = analyser.polarity_scores(sentence)
    
    return(score['compound'])

In [90]:
# parse json file and return jsonwith word counting info

def count_words_in_json (json_path, target_words, start_datetime, out_path):

    with open(json_path) as fhandler:
    
        parsed_json = json.load(fhandler)
        
    for obj in parsed_json:
        text = obj['text']
        text = remove_stop_and_punctation(text)
        text = stem_words(text)
        words_nr = calculate_words_nr(text, target_words)
        stamp_datetime = convert_stamps_to_datetime(start_datetime, obj['time'])
        sentiment_score = sentiment_analyzer_score(obj['text'])
    
        obj['text'] = obj['text']
        obj['text_clean'] = text
        obj['time'] = stamp_datetime
        obj['words_nr'] = words_nr
        obj['sentiment_score'] = sentiment_score
    
    with open(out_path, "w") as fhandler:
    
        json.dump(parsed_json, fhandler)
        
    return(parsed_json)
        
    
    


In [91]:
new_json = count_words_in_json(input_json_path, target_words, start_datetime, out_path)

In [92]:
print(new_json)

[{'text': 'this highly unusual and controversial earnings call for Tesla is at the top of Wall Street so you can see', 'time': '2019-10-19 11:20:08.196000', 'text_clean': ['highli', 'unusu', 'controversi', 'earn', 'call', 'tesla', 'top', 'wall', 'street', 'see'], 'words_nr': 1, 'sentiment_score': -0.0679}, {'text': 'a stock sinking', 'time': '2019-10-19 11:20:08.196000', 'text_clean': ['stock', 'sink'], 'words_nr': 0, 'sentiment_score': 0.0}, {'text': 'the minute CEO Elon Musk started insulting analysts', 'time': '2019-10-19 11:20:08.196000', 'text_clean': ['minut', 'ceo', 'elon', 'musk', 'start', 'insult', 'analyst'], 'words_nr': 1, 'sentiment_score': -0.4939}, {'text': "if I'm insulting his own guess what we did was", 'time': '2019-10-19 11:20:08.196000', 'text_clean': ['insult', 'guess'], 'words_nr': 0, 'sentiment_score': -0.4939}, {'text': 'we pulled up some of the most controversial moments during the call listen and judge for yourself', 'time': '2019-10-19 11:20:08.196000', 'text

{'neg': 0.094, 'neu': 0.823, 'pos': 0.082, 'compound': -0.0679}
-0.0679
this highly unusual and controversial earnings call for Tesla is at the top of Wall Street so you can see {'neg': 0.094, 'neu': 0.823, 'pos': 0.082, 'compound': -0.0679}
