In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import string
import json
import matplotlib.pyplot as plt

In [2]:
stopwords = stopwords.words('english')

In [35]:
common_words = ['bid', 'offer', 'buy', 'sell', 'put', 'minus', 'plus']

In [36]:
regex_digit = re.compile('[+/-]?\d*\.?\d+')

In [37]:
# md_chat = pd.read_csv('../data/processed/aligned_chatlogs_msarosiak1_edixon.csv')
# md_chat = md_chat.fillna('NAN')
aligned_chat = pd.DataFrame()
for file_name in os.listdir("../data/processed/aligned/"):  
    if file_name.find('aligned') != -1:
        # print(file_name)
        aligned_chat = aligned_chat.append(pd.read_csv('../data/processed/aligned/'+file_name).dropna(axis=0))

In [38]:
aligned_chat.shape

(2387, 32)

In [39]:
grouped = aligned_chat.groupby('pipeline_name')\
.agg({'post_date': 'count'})\
.sort_values(by='post_date', ascending=False)

In [40]:
grouped.head(10)

Unnamed: 0_level_0,post_date
pipeline_name,Unnamed: 1_level_1
Enb. ND,921
DAPL,239
Double H,239
White Cliffs,213
Grand Mesa,111
Newtown,66
Enb. T@S,55
Enbridge,49
Colt,43
Husky,40


In [65]:
top_pipeline_names = grouped[grouped['post_date'] > 10].index.tolist()

In [66]:
# top_market_names = aligned_chat.groupby('market_name')\
# .agg({'post_date': 'count'})\
# .sort_values(by='post_date', ascending=False).index.tolist()
# # top_market_names = [market.lower() for market in top_market_names]

In [67]:
top_pipeline_names

['Enb. ND',
 'DAPL',
 'Double H',
 'White Cliffs',
 'Grand Mesa',
 'Newtown',
 'Enb. T@S',
 'Enbridge',
 'Colt',
 'Husky',
 'Guern HUB',
 'Deep Rock',
 'Savage',
 'Enterprise',
 'Hiland',
 'Butte',
 '-',
 'Saddlehorn',
 'CLP-H',
 'PXP']

In [68]:
def filter_tokenize(words):
    
    words = nltk.word_tokenize(messages)

    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

    # Remove numbers
    words = [word for word in words if not word.isnumeric()]
    
    # Remove words with numbers
    words = [word for word in words if len(regex_digit.findall(word)) == 0]
    # print(words)

    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]
    
    # Strip newline and spaces
    words = [word.strip('\n\\n\r ') for word in words]

    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in stopwords]
    
    # Remove common words
    words = [word for word in words if word not in common_words]
    
    # Remove word if only one character
    words = [word for word in words if word.count(word[0]) < len(word)]
    
    words = ' '.join(words)
    
    return words

In [69]:
pipeline_name_words = {}
for pipeline_name in top_pipeline_names:
    
    # print('Market_Name: ', market_name)
    messages = ' '.join(aligned_chat.loc[aligned_chat.pipeline_name == pipeline_name, 'message'])
    
    # Remove words with all puntuations
    messages = messages.translate(str.maketrans('','',string.punctuation))


    # Calculate frequency distribution
    # fdist = nltk.FreqDist(words)
    
    messages = filter_tokenize(messages)

    pipeline_name_words[pipeline_name] = messages
    
    # Output top 10 words
    # for word, frequency in fdist.most_common(10):
        # print(u'{}:{}'.format(word, frequency))

In [70]:
# pipeline_name_words.values()

In [71]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(pipeline_name_words.values())
freq_term_matrix = count_vect.transform(pipeline_name_words.values())

In [72]:
tfidf = TfidfTransformer(norm='l2')
tfidf.fit(freq_term_matrix)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [75]:
# L2
pipeline_associated_words = {}
for pipeline_name in top_pipeline_names:
    # print(market_name)
    doc_freq_term = count_vect.transform([pipeline_name_words[pipeline_name]])
    doc_tfidf_matrix = tfidf.transform(doc_freq_term)
    # print(np.percentile(doc_tfidf_matrix.toarray()[0], [100, 99.75, 99]))
    
    # decreasing_index = sorted(range(len(count_vect.get_feature_names())), key=lambda k: doc_tfidf_matrix.toarray()[0].tolist()[k], reverse=True)
    # print(np.array(count_vect.get_feature_names())[decreasing_index[0:5]])
    
    # market_associated_words[market_name.lower()] = np.array(count_vect.get_feature_names())[decreasing_index[0:5]].tolist()
    
    # Use threshold to filter
    threshold = 0.2
    pipeline_associated_words[pipeline_name.lower()] = \
    list(filter(lambda x: x != '', map(lambda x, y: x if y >= threshold else '', count_vect.get_feature_names(), doc_tfidf_matrix.toarray()[0])))

    # print(market_associated_words[market_name.lower()])

In [76]:
pipeline_associated_words

{'-': ['ca', 'march', 'wcw'],
 'butte': ['butte', 'ok'],
 'clp-h': ['aaro', 'batch', 'ca', 'clkhard', 'clkhardwcs', 'outright', 'wcs'],
 'colt': ['colt', 'lol'],
 'dapl': ['dapl', 'jc', 'joco', 'johnsons', 'ok'],
 'deep rock': ['bls', 'io'],
 'double h': ['double', 'gsy', 'guernsey', 'korbi', 'morning'],
 'enb. nd': ['beaver', 'clearbrook', 'stanley', 'uhc'],
 'enb. t@s': ['cromer', 'uhc'],
 'enbridge': ['bpd', 'wcs'],
 'enterprise': ['hunt', 'midland'],
 'grand mesa': ['bpd', 'gm', 'grand', 'lucerne', 'mesa', 'wc'],
 'guern hub': ['goldma', 'peckerswood'],
 'hiland': ['dore', 'hiland', 'trento'],
 'husky': ['batch', 'wcs'],
 'newtown': ['ewtow'],
 'pxp': ['cents', 'pony', 'space'],
 'saddlehorn': ['bpd', 'lets', 'saddlehor'],
 'savage': ['fob', 'savage', 'trento'],
 'white cliffs': ['bpd', 'cliffs', 'lets', 'morning', 'wc', 'white']}

In [17]:
# with open('../data/processed/pipeline_associated_words.json', 'w') as fp:
#     json.dump(market_associated_words, fp)