# Imports

In [1]:
#Script to extract important topics from content
#originally written by: vipul-sharma20
#modifications made by: jadekhiev

# imports
import os
import sys
from pathlib import Path

# imports required utility functions
import string
from collections import Counter

# Data packages
import math
import pandas as pd
import numpy as np

#Operation
import operator

#Natural Language Processing Packages
import re
import nltk

from nltk import tokenize
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('brown')
from nltk.corpus import brown

#Progress bar
from tqdm import tqdm

[nltk_data] Downloading package punkt to /Users/jadekhiev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/jadekhiev/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
# Import articles
def importData(filename):
    """
    Import data into df
    """
    #Import Labelled Data
    DATA_DIR = "Data"
    thispath = Path().absolute()
    ARTICLES = os.path.join(DATA_DIR, filename)
    
    df = pd.read_excel(ARTICLES)

    try:
        df.head()
    except:
        pass
    return df

# PoS Tagger and CFG Set Up

In [3]:
# train tagger with browns news corpus
train = brown.tagged_sents(categories='news')

# custom regex tagging
regex_tag = nltk.RegexpTagger([
     #(r'[$][0-9]+\s[MmBbTt]\S+','DV'), #dollar value 
     (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
     (r'.*able$', 'JJ'),
     (r'^[A-Z].*$', 'NNP'),
     (r'.*ly$', 'RB'),
     (r'.*s$', 'NNS'),
     (r'.*ing$', 'VBG'),
     (r'.*ed$', 'VBD'),
     (r'.[\/\/]\S+', 'URL'), #URL / useless
     (r'.*', 'NN')
])

unigram_tag = nltk.UnigramTagger(train, backoff=regex_tag)
bigram_tag = nltk.BigramTagger(train, backoff=unigram_tag)
trigram_tag = nltk.TrigramTagger(train, backoff=bigram_tag)

# PoS Browns Corpus Tagging: https://en.wikipedia.org/wiki/Brown_Corpus
# custom defined Context Free Grammar (CFG) by vipul
cfg = dict()
cfg['NNP+NNP'] = 'NNP'
cfg['NN+NN'] = 'NNI'
cfg['NNP+NNI'] = 'NNI'
cfg['NNI+NN'] = 'NNI'
cfg['NNI+NNI'] = 'NNI'
cfg['NNI+NNP'] = 'NNI'
cfg['JJ+JJ'] = 'JJ'
cfg['JJ+NN'] = 'NNI'
cfg['CD+CD'] = 'CD'
cfg['NPI+NNP'] = 'NNP' # this is specific for collecting terms with the word deal
cfg['NNI+RP'] = 'NNI' # collects terms like "heats up" -- RP = adverb particle
cfg['RB+NN'] = 'NNP'# combination for monetary movement e.g. quarterly[RB] profit[NN] fell [VBD] -- RB = adverb
cfg['NNP+VBD'] = 'VPI' #VBP = a verb phrase
cfg['MD+VB'] = 'VPI' # collects terms like "will lose" (verb phrase incomplete)
cfg['MD+NN'] = 'VPI' # collects terms like "will soar" (verb phrase incomplete)
cfg['VPI+NN'] = 'VP' # collects terms like "will lose ground"
cfg['NNI+VP'] = 'VP' # collects terms like "index will soar"
cfg['NN+VPI'] = 'VP' # collects terms like "index will soar"
cfg['NNP+VPI'] = 'VP' # collects terms like "index will soar"
cfg['VPI+TO'] = 'VPI' # collect past participle verbs with to e.g. pledged to
cfg['VBN+TO'] = 'VBN' # collect past participle verbs with to e.g. pledged to
cfg['VBN+NN'] = 'VP' # collects terms like "pledged to adapt"

In [4]:
# Utility functions for context extraction
def getWords(sentence):
    stopwords = [
        # months
        "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december",
        # symbols that don't separate a sentence
        '$','“','”','’','—',
        # specific article terms that are useless
        "read", "share", "file", "'s","i", "photo", "percent","s", "t", "inc.", "corp", "group", "inc", "corp.", "source", "bloomberg", "cnbc",
        # useless pronouns
        "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "co.", "inc.",
        # etc
        "the", "a", "of", "have", "has", "had", "having"
        #"am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "while", "of", "at", "by", "for", "about", "into", "through", "during", "before", "after", "to", "from", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "just", "don", "now"
        ]
    words = word_tokenize(sentence)
    words = [word for word in words if word.lower() not in stopwords and len(word)>2]
    #print(words)
    return words

def countWords(wordList):
    return dict(Counter(wordList))

def get_info(content):
    words = getWords(content)
    temp_tags = trigram_tag.tag(words)
    tags = re_tag(temp_tags)
    normalized = True
    while normalized:
        normalized = False
        #print("len tag: ", len(tags))
        #pp.pprint(DictGroupBy(tags))
        for i in range(0, len(tags) - 1):
            #print("i: ", i)
            tagged1 = tags[i]
            if i+1 >= len(tags) - 1:
                break
            tagged2 = tags[i+1]

            # when word = deal and next word is tagged IN (with, for, etc.) 
            if tagged1[0]=='deal' and tagged2[1]=='IN':
                tags.pop(i)
                tags.pop(i)
                re_tagged = tagged1[0] + ' ' + tagged2[0]
                pos='NPI'
                tags.insert(i, (re_tagged, pos))
                normalized = True

            else: 
                key = tagged1[1] + '+' + tagged2[1]
                pos = cfg.get(key)       
                if pos:
                    tags.pop(i)
                    tags.pop(i)
                    re_tagged = tagged1[0] + ' ' + tagged2[0]
                    tags.insert(i, (re_tagged, pos))
                    normalized = True

    final_context = []
    for tag in tags:
        if tag[1] == 'NNP' or tag[1] == 'NNI' or tag[1] == 'VP':
            final_context.append(tag[0])
    return final_context


def re_tag(tagged):
    new_tagged = []
    for tag in tagged:
        if tag[1] == 'NP' or tag[1] == 'NP-TL':
            new_tagged.append((tag[0], 'NNP'))
        elif tag[1][-3:] == '-TL':
            new_tagged.append((tag[0], tag[1][:-3]))
        elif tag[1][-1:] == 'S':
            new_tagged.append((tag[0], tag[1][:-1]))
        else:
            new_tagged.append((tag[0], tag[1]))
    return new_tagged

In [5]:
# extract all unigrams based on all words pulled from context extraction
def unigramBreakdown(fullContext):
    # to be used as frequency count
    stopwords = ["myself", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "him", "his", "himself", "she", "her", "hers", "herself", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "are", "was", "were", "been", "being", "have", "has", "had", "having", "does", "did", "doing",  "the", "and", "but", "if", "or", "because", "until", "while", "for", "with", "about", "into", "through", "during", "before", "after", "from", "down", "out", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "nor", "not", "only", "own", "same", "than", "too", "very", "can", "will", "just", "don", "should", "now", "past", "year", "month", "day"]   
    
    # separates each word for each article => list of list
    articleUnigrams = []
    for term in fullContext:
        articleUnigrams.extend(term.split())
    
    # remove stop words and punctuation
    translator = str.maketrans('', '', string.punctuation)
    unigrams = [term.lower().translate(translator) for term in articleUnigrams if term.lower() not in stopwords and len(term)>2]
    # count frequency of terms
    # unigrams = countWords(unigrams)
    
    return unigrams

#extracts unigrams AND bigrams pulled by context extraction
def bigramBreakdown(fullContext):
    bigrams = []
    
    # remove stop words and punctuation
    translator = str.maketrans('', '', string.punctuation)
    bigrams.extend([term.lower().translate(translator) for term in fullContext if len(term.split()) < 3])
    
    return bigrams

In [7]:
# Retrieve context
def retrieveContext(filename):
    # import relevant articles
    articleDf = importData(filename)
    
    for i in articleDf.index:
        # get context for articles
        keyterms = get_info(articleDf['content'].iloc[i])
        articleDf.at[i, 'context'] = ', '.join(keyterms)
        
        # separate keyterms pulled from context extraction to get unigrams
        # this will be used to identify trending words
        articleDf.at[i, 'unigrams'] = ', '.join(unigramBreakdown(keyterms))
        
        # create list of bigrams and unigrams captured by context extraction
        articleDf.at[i, 'bigrams'] = ', '.join(bigramBreakdown(keyterms))
    
    #Save as excel file (better because weird characters encoded correctly)
    """
    DATA_DIR = "Data"
    OUTPUT_DIR = os.path.join(DATA_DIR, "results_context.xlsx")
    writer = pd.ExcelWriter(OUTPUT_DIR)
    articleDf.to_excel(writer,'Sheet1')
    writer.save()
    """
    return articleDf

In [9]:
articleDf = retrieveContext("results_encoding.xlsx")

In [167]:
sampleDf = articleDf.head(100)

In [29]:
sampleDf

Unnamed: 0,nonRel,Rel,article_id,prediction,difference,title,description,url,date,content,context,unigrams,bigrams
0,-1.890659,-0.163663,563,1,1.726996,Mexico’s new president presents a sober budget,"A NDRéS MANUEL LóPEZ OBRADOR, who became Mexic...",https://www.economist.com/the-americas/2018/12...,2018-12-17 15:03:15,A NDRS MANUEL LPEZ OBRADOR who became Mexicos ...,"Mexicos president 1st, thousands people, free ...","mexicos, president, 1st, thousands, people, fr...","thousands people, free moonlit, oscar, los pin..."
1,-1.682712,-0.205634,573,1,1.477078,Italy Says It Struck Deficit Compromise With EU,Italy’s finance ministry said it has agreed on...,https://www.wsj.com/articles/italy-says-it-str...,2018-12-18 21:42:15,ROMEItalys finance ministry said it has agreed...,"ROMEItalys, European Union authorities, countr...","romeitalys, european, union, authorities, coun...","romeitalys, fiscal enforcers, financial market..."
2,-1.626894,-0.218827,527,1,1.408068,"The invention, slow adoption and near perfecti...",I T IS A little bit of magic. A gesture up and...,https://www.economist.com/christmas-specials/2...,2018-12-17 11:48:13,I T IS A little bit of magic A gesture up and ...,"bit magic gesture, together gesture, apart zip...","bit, magic, gesture, together, gesture, apart,...","together gesture, apart zip, revolution, light..."
3,-1.606125,-0.223973,833,1,1.382152,Expect the unexpected: Here are five black swa...,Some events that no one is thinking about but ...,https://business.financialpost.com/investing/b...,2019-01-03 12:14:20,Britains departure from the European Union wil...,"Britains, European Union, chaotic mess Trade t...","britains, european, union, chaotic, mess, trad...","britains, european union, trump, italian bank,..."
4,-1.581368,-0.230286,213,1,1.351082,CNBC'S EUGENE KIM: TOP AMAZON EXEC WHO WAS LAR...,,https://www.cnbc.com/2018/12/18/cnbcs-eugene-k...,2018-12-18 20:26:00,Eugene KimDiego Piacentini one of the most sen...,"Eugene KimDiego Piacentini, senior executives ...","eugene, kimdiego, piacentini, senior, executiv...","italian, amazon, matterhis departure, given he..."


In [53]:
keyterms = []
for article in sampleDf['bigrams'].values:
    keyterms.extend([word.lstrip() for word in (article.split(','))])
    
keyterms = set(keyterms) # deduplicate using set cast and then cast again as a list

In [44]:
keyterms = (set(keyterms)) # deduplicate using set cast and then cast again as a list

['broken zip',
 'revolution',
 'trump',
 'tesla',
 'iraq',
 'still doubters',
 'punks',
 'expected year',
 'obvious candidate',
 'android',
 'rubber galoshes',
 'published budget',
 'new taxes',
 'american army',
 'yearspiacentinis departure',
 'dispatch',
 'pneumatic streetcar',
 'cnbcs',
 'become custom',
 'nbcuniversal',
 'attachment modesty',
 'james thomson',
 'overlap buttons',
 'allowed clothing',
 'noticeable zip',
 'amlo',
 'china',
 'zippers',
 'mercedes',
 'eventually burden',
 'primary surplus',
 '6bn bonds',
 'swiss woman',
 'commission spokeswoman',
 'made statement',
 'together gesture',
 'thousands people',
 'consultation people',
 'catch sense',
 'france',
 'jerry seinfeld',
 'put progress',
 'disciplinary procedure',
 'directly himand',
 'become ubiquitousand',
 'brexit african',
 'main obsession',
 'regulatory pressure',
 'bowl bowl',
 'hard choices',
 'perhaps horn',
 'used buckles',
 'netflix',
 'frankly doesnt',
 'waterproof heat',
 'judsons',
 'found cranes',
 'm

In [62]:
# for each article and each keyword: give 1 if keyword in article and 0 if not
encodedArticle = []
for i in sampleDf.index:
    articleTerms = ([word.lstrip() for word in (sampleDf['bigrams'].iloc[i].split(','))])
    encodedArticle.append([1 if word in articleTerms else 0 for word in keyterms])

In [65]:
# set up 
binEncDf = pd.DataFrame(encodedArticle)
# using keywords as columns
binEncDf.columns = keyterms
# combine article_id and prediction from original table
binEncDf = sampleDf[['article_id','prediction']].join(binEncDf)

In [66]:
binEncDf

Unnamed: 0,article_id,prediction,broken zip,revolution,trump,tesla,iraq,still doubters,punks,expected year,...,deep bench,particularly footware,homes worldwide,fresh opportunities,threats impeachment,snaps,improved zip,swans events,italys,oil production
0,563,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,573,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,527,1,1,1,0,0,1,1,1,0,...,0,1,0,0,0,0,1,0,0,0
3,833,1,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,213,1,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,0,0,0


# PMI For Ranking Tags

In [192]:
# return binary representation of article in terms of all keyphrases pulled
def dfTransform(df, term_column):
    # df is the article df ; term_col is name of column containing keyterms -- can be unigrams, bigrams, named entities, etc.
    keyterms = []
    for article in df[term_column].values:
        keyterms.extend([word.lstrip() for word in (article.split(','))])
    keyterms = set(keyterms) # deduplicate terms by casting as set
    
    # for each article and each keyword: give 1 if keyword in article and 0 if not
    encodedArticle = []
    for i in df.index:
        articleTerms = ([word.lstrip() for word in (df[term_column].iloc[i].split(','))])
        encodedArticle.append([1 if word in articleTerms else 0 for word in keyterms])
    
    # set up dataframe
    binEncDf = pd.DataFrame(encodedArticle)
    # use keywords as columns
    binEncDf.columns = keyterms
    # keep article_id and prediction from original table
    df = df.rename(columns={'prediction': 'mktMoving'}) # changed it from prediction because that was also a keyterm
    binEncDf = df[['article_id','mktMoving']].join(binEncDf)
    
    return binEncDf

# Simple example of getting pairwise mutual information of a term
def pmiCal(df, x, label_column='mktMoving'):
    pmilist=[]
    for i in [0,1]:
        for j in [0,1]:
            px = sum(df[label_column]==i)/len(df)
            py = sum(df[x]==j)/len(df)
            pxy = len(df[(df[label_column]==i) & (df[x]==j)])/len(df)
            if pxy==0:#Log 0 cannot happen
                pmi = math.log((pxy+0.0001)/(px*py+0.0001))
            else:
                pmi = math.log(pxy/(px*py+0.0001))
            pmilist.append([i]+[j]+[px]+[py]+[pxy]+[pmi])
    pmiDf = pd.DataFrame(pmilist)
    pmiDf.columns = ['x','y','px','py','pxy','pmi']
    return pmiDf

def pmiIndivCal(df,x,gt, label_column='mktMoving'):
    px = sum(df[label_column]==gt)/len(df)
    py = sum(df[x]==1)/len(df)
    pxy = len(df[(df[label_column]==gt) & (df[x]==1)])/len(df)
    if pxy==0:#Log 0 cannot happen
        pmi = math.log((pxy+0.0001)/(px*py+0.0001))
    else:
        pmi = math.log(pxy/(px*py))
    return pmi

# Compute PMI for all terms and all possible labels
def pmiForAllCal(artDf, binaryEncDf, term_column, label_column='mktMoving'): 
    #Try calculate all the pmi for top k and store them into one pmidf dataframe
    #pmilist = []
    for i in tqdm(artDf.index): # for all articles
        # for each term in the article
        terms = set(([word.lstrip() for word in (artDf[term_column].iloc[i].split(','))]))
        pmiposlist = []
        #pmineglist = []

        for word in terms:
            #pmilist.append([word]+[pmiCal(df,word)])
            pmiposlist.append([word]+[pmiIndivCal(binaryEncDf,word,1,label_column)])
            #pmineglist.append([word]+[pmiIndivCal(binaryEncDf,word,0,label_column)])
        
        #pmiDf = pd.DataFrame(pmilist)
        pmiposlist = pd.DataFrame(pmiposlist)
        #pmineglist = pd.DataFrame(pmineglist)
        pmiposlist.columns = ['word','pmi']
        #pmineglist.columns = ['word','pmi']
        artDf.at[i,'tags_top10'] = (',').join(word for word in pmiposlist.sort_values(by='pmi', ascending=False).head(10)['word'])
        #pmineglist.sort_values(by='pmi', ascending=True).head(10)['word']
        #pmiDf.columns = ['word','pmi']
        
    return artDf #, pmineglist , pmiDf
    

In [None]:
def calculatePMI(articleDf, termType):
    # use PMI to calculate top 3 terms that should be displayed for each article    
    # get binary encoding of articles represented as uni- and bigrams
    binaryEncDf = dfTransform(articleDf, termType)
    
    articleDf_ranked = pmiForAllCal(articleDf, binaryEncDf, termType)
    return articleDf_ranked

In [178]:
binaryEncDf = dfTransform(sampleDf, 'bigrams')

In [182]:
#binaryEncDf

In [193]:
articleDf2 = pmiForAllCal(sampleDf, binaryEncDf, 'bigrams')

100%|██████████| 100/100 [00:09<00:00, 10.89it/s]


In [190]:
articleDf2

Unnamed: 0,nonRel,Rel,article_id,prediction,difference,title,description,url,date,content,context,unigrams,bigrams,tags_top10
0,-1.890659,-0.163663,563,1,1.726996,Mexico’s new president presents a sober budget,"A NDRéS MANUEL LóPEZ OBRADOR, who became Mexic...",https://www.economist.com/the-americas/2018/12...,2018-12-17 15:03:15,A NDRS MANUEL LPEZ OBRADOR who became Mexicos ...,"Mexicos president 1st, thousands people, free ...","mexicos, president, 1st, thousands, people, fr...","thousands people, free moonlit, oscar, los pin...","gradually government,expensive scheme,73bn pes..."
1,-1.682712,-0.205634,573,1,1.477078,Italy Says It Struck Deficit Compromise With EU,Italy’s finance ministry said it has agreed on...,https://www.wsj.com/articles/italy-says-it-str...,2018-12-18 21:42:15,ROMEItalys finance ministry said it has agreed...,"ROMEItalys, European Union authorities, countr...","romeitalys, european, union, authorities, coun...","romeitalys, fiscal enforcers, financial market...","frances,economic slowdown,brussels,italys,soug..."
2,-1.626894,-0.218827,527,1,1.408068,"The invention, slow adoption and near perfecti...",I T IS A little bit of magic. A gesture up and...,https://www.economist.com/christmas-specials/2...,2018-12-17 11:48:13,I T IS A little bit of magic A gesture up and ...,"bit magic gesture, together gesture, apart zip...","bit, magic, gesture, together, gesture, apart,...","together gesture, apart zip, revolution, light...","broken zip,developingworld demand,button fly,i..."
3,-1.606125,-0.223973,833,1,1.382152,Expect the unexpected: Here are five black swa...,Some events that no one is thinking about but ...,https://business.financialpost.com/investing/b...,2019-01-03 12:14:20,Britains departure from the European Union wil...,"Britains, European Union, chaotic mess Trade t...","britains, european, union, chaotic, mess, trad...","britains, european union, trump, italian bank,...","daimler,europes,become lot,guardevery year,pol..."
4,-1.581368,-0.230286,213,1,1.351082,CNBC'S EUGENE KIM: TOP AMAZON EXEC WHO WAS LAR...,,https://www.cnbc.com/2018/12/18/cnbcs-eugene-k...,2018-12-18 20:26:00,Eugene KimDiego Piacentini one of the most sen...,"Eugene KimDiego Piacentini, senior executives ...","eugene, kimdiego, piacentini, senior, executiv...","italian, amazon, matterhis departure, given he...","piacentini,italian,storypiacentini joins,retur..."
5,-1.556540,-0.236817,720,1,1.319723,"Nasdaq enters bear market territory, down 20% ...",The Nasdaq entered bear market territory Thurs...,https://www.cnbc.com/2018/12/20/nasdaq-enters-...,2018-12-20 18:26:00,The Nasdaq Composite entered bear market terri...,"Nasdaq, market territory, Wall, pricey technol...","nasdaq, market, territory, wall, pricey, techn...","nasdaq, market territory, wall, nasdaq, nasdaq...","trimmed buoyant,companies,earnings numbers,ene..."
6,-1.544442,-0.240075,240,1,1.304368,Where You Should Move to Make the Most Money: ...,A tech-driven concentration of talent since th...,https://www.wsj.com/articles/where-you-should-...,2018-12-15 05:09:31,Technology is creating an economy in which sup...,"economy superstar employees work, superstar fi...","economy, superstar, employees, work, superstar...","superstar firms, superstar cities, apple, ulti...","then metros,selfreliance,distinct ways,ultimat..."
7,-1.506074,-0.250745,347,1,1.255329,Malaysia pursues case against Goldman units ov...,"UK, Singapore and Hong Kong entities targeted,...",https://www.ft.com/content/26ff2dc2-02a2-11e9-...,2018-12-18 13:15:37,Malaysia is to pursue criminal charges against...,"Malaysia pursue criminal charges, Goldman Sach...","malaysia, pursue, criminal, charges, goldman, ...","goldman sachs, singapore, hong kong, suggests ...","foreign officials,court orders,caroline binham..."
8,-1.465110,-0.262733,637,1,1.202378,"AMLO, Mexico’s new president, presents a surpr...",But the calm in the markets may not last,https://www.economist.com/the-americas/2018/12...,2018-12-17 00:00:00,ANDRS MANUEL LPEZ OBRADOR who became Mexicos p...,"Mexicos president 1st, thousands people, free ...","mexicos, president, 1st, thousands, people, fr...","thousands people, free moonlit, oscar, los pin...","gradually government,expensive scheme,6bn bond..."
9,-1.374338,-0.291700,391,1,1.082638,Individual Investors Try Not to Panic Over Big...,"For stock-market investors, times of heightene...",https://www.wsj.com/articles/retail-investors-...,2019-01-02 03:06:46,The yearend stock selloff saddled major indexe...,"yearend stock selloff, major indexes, annual d...","yearend, stock, selloff, major, indexes, annua...","major indexes, annual decline, global growth, ...","about trade,moneymarket funds,annual decline,e..."


In [152]:
pos = [word for word in pmiposlist.sort_values(by='pmi', ascending=False).head(10)['word'].values]

In [153]:
pos

['revenue year',
 'rochester',
 'judsons',
 'gospel singer',
 'back equity',
 'syrias kurdish',
 'ultimately accommodate',
 'sense mismanagement',
 'side coin',
 'airdrop']

In [156]:
print((',').join(pos))

revenue year,rochester,judsons,gospel singer,back equity,syrias kurdish,ultimately accommodate,sense mismanagement,side coin,airdrop
