In [None]:
import pandas as pd, numpy as np, os, datetime, time, re, matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

## Read data, remove duplicates, drop NaN rows
NY Times, Washington Post, WSJ

In [None]:
### The three raw article tables share one layout: pipe-delimited UTF-8 text, first column = row index
csv_options = {'encoding':'utf-8','sep':'|','index_col':0}
### NY Times
dfNYT = pd.read_csv('/mnt/data/NYT.txt',**csv_options)
### Washington Post (duplicates are removed in a later cell)
dfWP = pd.read_csv('/mnt/data/WashingtonPost.txt',**csv_options)
### Wall Street Journal (NaN rows and duplicates are removed in a later cell)
dfWSJ = pd.read_csv('/mnt/data/WSJ.txt',**csv_options)

In [None]:
### Washington Post: keep the first copy of every article body, then renumber the rows from 0
dfWP = dfWP.drop_duplicates(subset=['content'],keep='first').reset_index(drop=True)
### Wall Street Journal: drop rows without cleaned text, de-duplicate on that text, renumber the rows
dfWSJ = (dfWSJ.dropna(subset=['Cleaned_text'])
              .drop_duplicates(subset=['Cleaned_text'],keep='first')
              .reset_index(drop=True))

## Run text analysis

In [None]:
# Stack the article bodies of the three sources (NYT, then WP, then WSJ) into one Series
# with a fresh 0..n-1 index. A single pd.concat replaces the chained Series.append calls:
# Series.append was removed in pandas 2.0 and each call copied the accumulated data again.
# NOTE(review): NYT rows with missing 'new_text' are not dropped before this point, so the
# result may contain NaN entries - confirm that is intended.
AllTexts = pd.concat([dfNYT['new_text'],dfWP['content'],dfWSJ['Cleaned_text']],ignore_index=True)

In [None]:
TEXTS = list(AllTexts)

In [None]:
import sys, os, codecs
scriptpath = '/home/ubuntu/Codes/Text-Analytics-Module-master/Code'
sys.path.append(scriptpath)
import basic_text_processing_functions as tx
from basic_text_processing_functions import *

In [None]:
import spacy
nlp = spacy.load('en')          # For spacy
# import en_core_web_sm           # For windows
# nlp = en_core_web_sm.load()     # For windows

In [None]:
pathloc = '/mnt/data/TextAnalysis/'

In [None]:
import json  # json.dumps is used below but json was never imported in this notebook

pnodes = 16
# Set to True to (re)write the default configuration file before loading it.
write_default_config = False
config_path = pathloc+'default.cfg'
if write_default_config:
    with codecs.open(config_path,'w',encoding='utf-8') as f:
        f.write(json.dumps({'batch_size':1000,'n_threads':pnodes,'fpathroot':pathloc,'fpathappend':u'','entity_sub':True}))
# Both branches of the original on/off switch ended with this identical call, so it is made once.
# NOTE(review): the dict written above has no 'numtopics' key although the loader returns one -
# presumably tx._config_text_analysis_ supplies a default; confirm.
batch_size,n_threads,fpathroot,fpathappend,entity_sub,numtopics = tx._config_text_analysis_(config_path)

In [None]:
tx.batch_size,tx.n_threads,tx.fpathroot,tx.fpathappend,tx.entity_sub,tx.numtopics = batch_size,n_threads,fpathroot,fpathappend,entity_sub,numtopics

In [None]:
tx.fpathroot, fpathroot, pathloc, fpathappend

In [None]:
tx.n_threads

In [None]:
### Write texts to disk
# One document per line: line breaks inside a document are replaced by single spaces.
fname = 'rawtext.txt'
if 1==0:  # manual switch - change the condition to regenerate the file
    skipped = 0
    with codecs.open(pathloc+fname,'w',encoding='utf-8') as f:
        for t in TEXTS:
            # Entries that are not text (e.g. NaN for a missing article) have no splitlines();
            # skip them deliberately and count them instead of hiding every error behind a
            # bare `except: pass`, which also swallowed genuine I/O and encoding failures.
            if not hasattr(t,'splitlines'):
                skipped += 1
                continue
            f.write(u' '.join(t.splitlines())+u'\n')
    print('skipped {} non-text entries'.format(skipped))

In [None]:
if 1==0:
    tx._write_unigram_(pathloc+fname,unigram_sentences_filepath=fpathroot+fpathappend+'_sent_gram_0.txt',entity_sub=True)

In [None]:
# Create Phrase Models
passes = 2
if 1==0:  # manual switch - change the condition to retrain the phrase models
    ngrams = tx._phrase_detection_(fpath=tx.fpathroot+tx.fpathappend,passes=passes,returnmodels=True,threshold=10.)
else:
    # os.listdir returns names in arbitrary order; sort them so the saved models are always
    # loaded (and therefore applied) in the same order.
    # NOTE(review): assumes the model file names sort in pass order - confirm the naming.
    gramfiles = sorted(os.listdir(tx.fpathroot))
    phrasemodels = [tx.fpathroot+g for g in gramfiles if 'phrase_model' in g]
    ngrams = [Phrases.load(m) for m in phrasemodels]

In [None]:
# regram == 1: bags of words are organised by document; any other value: by sentence
regram = 1
if regram!=1:
    # sentence-level file of the last phrase pass
    grammed_texts = tx.fpathroot+'sent_gram_{}.txt'.format(passes)
elif 1==0:
    # manual switch: re-apply the phrase models to the raw text file
    grammed_texts = tx._phrase_prediction_(pathloc+fname,ngrams,outfpath=fpathroot+fpathappend+'grammed_texts.txt',entity_sub=True)
else:
    # default: point at the document-level file produced by an earlier run
    grammed_texts = tx.fpathroot+'grammed_texts.txt'

In [None]:
from gensim import corpora
### Create Dictionary
if 1==0:
    vocab,gensim_dictionary,cts = tx._make_dict_(grammed_texts,floc=fpathroot+fpathappend+'dict_gram.dict',
                                                 topfilter=99,bottomfilter=25,no_filters=False,keep_ent=False,
                                                 discard_list={},keep_list={})
    print(len(vocab))
else:
    gensim_dictionary = corpora.Dictionary.load(tx.fpathroot+'dict_gram.dict')
    print(len(gensim_dictionary))

In [None]:
if 1==0:
    grammed_corpus = tx._serialize_corpus_(grammed_texts,gensim_dictionary,outfpath=fpathroot+fpathappend+'_serialized.mm')
else:
    grammed_corpus_loc = tx.fpathroot+tx.fpathappend+'_serialized.mm'
    grammed_corpus = MmCorpus(grammed_corpus_loc)

In [None]:
### Perform LDA
# The model file name encodes the number of topics ('lda_5' here).
numtopics = 5
ldafile = 'lda_'+str(numtopics)
# `if 1==0` is a manual switch: change the condition to train and save a new model;
# as written the previously saved model is loaded from pathloc.
if 1==0:
    lda = tx._lda_(gensim_dictionary,corpus_path=grammed_corpus,numtopics=numtopics,iterations=100) # defaults to 10 topics
    lda.save(pathloc+ldafile)
else: 
    lda = LdaMulticore.load(pathloc+ldafile)
# NOTE(review): presumably makes lda[bow] report every topic, however small its probability - confirm against gensim.
lda.minimum_probability = 0.0

In [None]:
### Create Visualization of Topics
# `import pyLDAvis.gensim as ldavis` binds only the name `ldavis`; save_html lives on the
# top-level package, so that name has to be imported explicitly as well.
import pyLDAvis
import pyLDAvis.gensim as ldavis
if 1==0:  # manual switch - change the condition to regenerate the HTML visualisation
    ldaviz = ldavis.prepare(lda,grammed_corpus,gensim_dictionary)
    pyLDAvis.save_html(ldaviz,pathloc+'viz_'+ldafile+'.html')
lda.show_topics()

In [None]:
# save_html is called on the top-level package; `import pyLDAvis.gensim as ldavis` alone does not bind it.
import pyLDAvis
# Topic counts 5, 10, ..., 110 except 50, 70 and 100.
numrange = np.arange(5,115,5)
numtopiclist = [a for a in numrange if a not in [50,70,100]]
### Perform LDA
# NOTE(review): the loop reuses the notebook-level names `numtopics`, `ldafile` and `lda`, so after
# the loop they hold the values for the LAST entry of numtopiclist, and every later cell that reads
# `numtopics` or `lda` works with that model. Confirm this is the model the analysis should use.
for numtopics in numtopiclist:
    ldafile = 'lda_'+str(numtopics)
    if 1==0:  # manual switch - train and save instead of loading
        lda = tx._lda_(gensim_dictionary,corpus_path=grammed_corpus,numtopics=numtopics,iterations=100)
        lda.save(pathloc+ldafile)
    else:
        lda = LdaMulticore.load(pathloc+ldafile)
    lda.minimum_probability = 0.0
    if 1==0:  # manual switch - write the HTML visualisation for this model
        ldaviz = ldavis.prepare(lda,grammed_corpus,gensim_dictionary)
        pyLDAvis.save_html(ldaviz,pathloc+'viz_'+ldafile+'.html')

## Predict back topic probabilities and get word counts
NY Times, Washington Post, WSJ

In [None]:
### Predict topics for NY Times
# Keep date and cleaned text (renamed to the shared column name 'content'), discard articles
# without text and renumber the rows from 0.
topicsNYT = (dfNYT[['date','new_text']]
             .rename(columns={'new_text':'content'})
             .dropna(subset=['content'])
             .reset_index(drop=True))
textNYT = topicsNYT['content'].tolist()

In [None]:
start_time = time.time()
# Apply the phrase models to every article (helper from basic_text_processing_functions).
topicsNYT['parsedtexts'] = tx._phrase_prediction_inmemory_(textNYT,ngrams)
# {topic id: probability} for each article under the current `lda` model.
topicsNYT['topic_probs'] = topicsNYT['parsedtexts'].apply(lambda x: dict(lda[gensim_dictionary.doc2bow(x.split())]))
# One column per topic; tx._try_iter_ is called with errval=0. (presumably the fallback for a missing topic - confirm).
for t in range(0,numtopics):
    topicsNYT['topic_'+str(t)] = topicsNYT.topic_probs.apply(lambda x: tx._try_iter_(x,t,errval=0.))
# Whitespace-delimited word count of the article text.
topicsNYT['word_ct'] = topicsNYT['content'].apply(lambda x: len(unicode(x).split()))
# Words attributed to each topic = topic probability * word count. A column-wise product gives
# the same values as the previous row-by-row apply(axis=1) without building a Series per row.
# Column order (topic_*, word_ct, word_ct_topic_*) is kept because later cells slice by position.
for t in range(0,numtopics):
    topicsNYT['word_ct_topic_'+str(t)] = topicsNYT['topic_'+str(t)]*topicsNYT['word_ct']
run_time = (time.time()-start_time)/3600
print ("This takes %s hours to run" %run_time)

In [None]:
topicsNYT.dropna(subset=['parsedtexts'],inplace=True)
# topicsNYT.drop_duplicates(subset='parsedtexts',keep='first',inplace=True)
topicsNYT.reset_index(drop=True,inplace=True)

In [None]:
topicsNYT.to_csv('/mnt/data/TextAnalysis/topicsNYT.txt',sep='|',encoding='utf-8')

In [None]:
### Predict topics for Washington Post
# Date and article text of the Washington Post, without empty articles, rows renumbered from 0
topicsWP = (dfWP[['date','content']]
            .dropna(subset=['content'])
            .reset_index(drop=True))
textWP = topicsWP['content'].tolist()

In [None]:
start_time = time.time()
# Apply the phrase models to every article (helper from basic_text_processing_functions).
topicsWP['parsedtexts'] = tx._phrase_prediction_inmemory_(textWP,ngrams)
# {topic id: probability} for each article under the current `lda` model.
topicsWP['topic_probs'] = topicsWP['parsedtexts'].apply(lambda x: dict(lda[gensim_dictionary.doc2bow(x.split())]))
# One column per topic; tx._try_iter_ is called with errval=0. (presumably the fallback for a missing topic - confirm).
for t in range(0,numtopics):
    topicsWP['topic_'+str(t)] = topicsWP.topic_probs.apply(lambda x: tx._try_iter_(x,t,errval=0.))
# Whitespace-delimited word count of the article text.
topicsWP['word_ct'] = topicsWP['content'].apply(lambda x: len(unicode(x).split()))
# Words attributed to each topic = topic probability * word count. A column-wise product gives
# the same values as the previous row-by-row apply(axis=1) without building a Series per row.
# Column order (topic_*, word_ct, word_ct_topic_*) is kept because later cells slice by position.
for t in range(0,numtopics):
    topicsWP['word_ct_topic_'+str(t)] = topicsWP['topic_'+str(t)]*topicsWP['word_ct']
run_time = (time.time()-start_time)/3600
print ("This takes %s hours to run" %run_time)

In [None]:
topicsWP.dropna(subset=['parsedtexts'],inplace=True)
# topicsWP.drop_duplicates(subset='parsedtexts',keep='first',inplace=True)
topicsWP.reset_index(drop=True,inplace=True)

In [None]:
topicsWP.to_csv('/mnt/data/TextAnalysis/topicsWP.txt',sep='|',encoding='utf-8')

In [None]:
### Predict topics for WSJ
# Date and cleaned text of the WSJ under the shared column names, without empty articles,
# rows renumbered from 0
wsj_columns = {'Publication date':'date','Cleaned_text':'content'}
topicsWSJ = (dfWSJ[['Publication date','Cleaned_text']]
             .rename(columns=wsj_columns)
             .dropna(subset=['content'])
             .reset_index(drop=True))
textWSJ = topicsWSJ['content'].tolist()

In [None]:
start_time = time.time()
# Apply the phrase models to every article (helper from basic_text_processing_functions).
topicsWSJ['parsedtexts'] = tx._phrase_prediction_inmemory_(textWSJ,ngrams)
# {topic id: probability} for each article under the current `lda` model.
topicsWSJ['topic_probs'] = topicsWSJ['parsedtexts'].apply(lambda x: dict(lda[gensim_dictionary.doc2bow(x.split())]))
# One column per topic; tx._try_iter_ is called with errval=0. (presumably the fallback for a missing topic - confirm).
for t in range(0,numtopics):
    topicsWSJ['topic_'+str(t)] = topicsWSJ.topic_probs.apply(lambda x: tx._try_iter_(x,t,errval=0.))
# Whitespace-delimited word count of the article text.
topicsWSJ['word_ct'] = topicsWSJ['content'].apply(lambda x: len(unicode(x).split()))
# Words attributed to each topic = topic probability * word count. A column-wise product gives
# the same values as the previous row-by-row apply(axis=1) without building a Series per row.
# Column order (topic_*, word_ct, word_ct_topic_*) is kept because later cells slice by position.
for t in range(0,numtopics):
    topicsWSJ['word_ct_topic_'+str(t)] = topicsWSJ['topic_'+str(t)]*topicsWSJ['word_ct']
run_time = (time.time()-start_time)/3600
print ("This takes %s hours to run" %run_time)

In [None]:
topicsWSJ.dropna(subset=['parsedtexts'],inplace=True)
# topicsWSJ.drop_duplicates(subset='parsedtexts',keep='first',inplace=True)
topicsWSJ.reset_index(drop=True,inplace=True)

In [None]:
topicsWSJ.to_csv('/mnt/data/TextAnalysis/topicsWSJ.txt',sep='|',encoding='utf-8')

In [None]:
### Plot topic probabilities of NY Times
for t in range(0,numtopics):
    plt.figure()
    np.log(topicsNYT['topic_'+str(t)]).hist()

In [None]:
### Plot topic probabilities of Washington Post
for t in range(0,numtopics):
    plt.figure()
    np.log(topicsWP['topic_'+str(t)]).hist()

In [None]:
### Plot topic probabilities of WSJ
for t in range(0,numtopics):
    plt.figure()
    np.log(topicsWSJ['topic_'+str(t)]).hist()

## Calculate relevance(term w | topic t)
NY Times, Washington Post, WSJ

In [None]:
# Dictionary assigns words to ids
# Corpus counts frequencies of words in each document

In [None]:
# # ldatopictopterms = lda.print_topic(16,topn=30)
# ldatopictopterms = lda.show_topic(16,topn=30)
# ldatopictopids = lda.get_topic_terms(16,topn=30)

In [None]:
N = 50
### p(w|t): the N highest-probability terms of every topic, once keyed by word (show_topic)
### and once keyed by dictionary id (get_topic_terms)
ProbTopTerms = [lda.show_topic(t,topn=N) for t in range(0,numtopics)]
ProbTopIDs = [lda.get_topic_terms(t,topn=N) for t in range(0,numtopics)]

In [None]:
ProbTopTerms

In [None]:
ProbTopIDs

In [None]:
gensim_dictionary.values()

In [None]:
len(gensim_dictionary)

In [None]:
newcorpus = [item for sublist in grammed_corpus for item in sublist]
len(newcorpus)

In [None]:
newcorpus

In [None]:
dfcorpus = pd.DataFrame(newcorpus,columns=['termID','count'])
dfcorpus

In [None]:
### Total count of every term over the entire corpus: one row per term id, ascending by id
term_totals = dfcorpus.groupby('termID')['count'].sum()
dfcorpusgroup = (term_totals.reset_index()
                            .sort_values(by=['termID'])
                            .reset_index(drop=True))
dfcorpusgroup

In [None]:
topicsNYT = pd.read_csv('/mnt/data/TextAnalysis/topicsNYT.txt',encoding='utf-8',sep='|',index_col=0)
topicsWP = pd.read_csv('/mnt/data/TextAnalysis/topicsWP.txt',encoding='utf-8',sep='|',index_col=0)
topicsWSJ = pd.read_csv('/mnt/data/TextAnalysis/topicsWSJ.txt',encoding='utf-8',sep='|',index_col=0)

In [None]:
### NY Times, Washington Post, WSJ
topicsNYTnew = topicsNYT.iloc[:,:(numtopics+5)].copy()
topicsWPnew = topicsWP.iloc[:,:(numtopics+5)].copy()
topicsWSJnew = topicsWSJ.iloc[:,:(numtopics+5)].copy()

In [None]:
# Method 1: use word counts from contents
wc1 = topicsNYTnew['word_ct'].sum()
wc2 = topicsWPnew['word_ct'].sum()
wc3 = topicsWSJnew['word_ct'].sum()
wordcount1 = wc1+wc2+wc3
wordcount1

In [None]:
dfcorpusgroup1 = dfcorpusgroup.copy()
dfcorpusgroup1['docfreq'] = dfcorpusgroup1['count']/wordcount1
dfcorpusgroup1

In [None]:
# Method 2: use word counts from the whole corpus
wordcount2 = dfcorpusgroup['count'].sum()
wordcount2

In [None]:
dfcorpusgroup2 = dfcorpusgroup.copy()
dfcorpusgroup2['docfreq'] = dfcorpusgroup2['count']/wordcount2
dfcorpusgroup2

In [None]:
# Calculate relevance of each term:
#   relevance(w|t) = ld*log p(w|t) + (1-ld)*log( p(w|t) / p(w) )
# where p(w) is the term's share of all corpus tokens (column 'docfreq').
ld = 0.6
# Look p(w) up by TERM ID. The previous dfcorpusgroup2['docfreq'][int(termID)] indexed by row
# label, which equals the term id only if every id 0..max occurs in the corpus; with any gap
# it silently returned the frequency of a different term.
docfreq_by_term = dfcorpusgroup2.set_index('termID')['docfreq']
RelevanceTerms = list()
for t,topic in enumerate(ProbTopIDs):
    result = list()
    for w,term in enumerate(topic):
        termID = term[0]
        word = ProbTopTerms[t][w][0]
        prob = term[1]
        # A term id absent from the corpus table yields NaN relevance instead of a wrong value.
        docfreq = docfreq_by_term.get(int(termID),np.nan)
        relevance = ld*np.log(prob)+(1-ld)*np.log(prob/docfreq)
        result.append(tuple([termID,word,prob,docfreq,relevance]))
    RelevanceTerms.append(result)

In [None]:
# For every topic, rank its candidate terms by relevance (highest first) and keep the 20 best words
relevance_columns = ['termID','term','prob','docfreq','relevance']
topterms = []
for topic in RelevanceTerms:
    ranked = pd.DataFrame(topic,columns=relevance_columns).sort_values(by=['relevance'],ascending=False)
    topterms.append(ranked['term'].tolist()[:20])
topterms

## Predict the Gaussian Mixture Model

Links to documentation:

http://scikit-learn.org/stable/user_guide.html

http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture

In [None]:
from sklearn import mixture
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
n_components = 2             # Number of mixture components. Defaults to 1 
covariance_type = 'full'     # {'full','tied','diag','spherical'}. Defaults to 'full' 
tol = 0.001                  # Convergence threshold. EM iterations will stop when the lower bound average gain is below this threshold. Defaults to 1e-3=0.001 
reg_covar = 1e-06            # Non-negative regularization added to the diagonal of covariance. Allows to assure that the covariance matrices are all positive. Defaults to 1e-6 
max_iter = 100               # Number of EM iterations to perform. Defaults to 100 
n_init = 1                   # Number of initializations to perform. The best results are kept. Defaults to 1 
init_params = 'kmeans'       # {'kmeans','random'}. Method used to initialize the weights, the means and the precisions. Defaults to 'kmeans' 
weights_init = None          # User-provided initial weights. Defaults to None (weights are initialized using the init_params method) 
means_init = None            # User-provided initial means. Defaults to None (means are initialized using the init_params method) 
precisions_init = None       # User-provided initial precisions (inverse of the covariance matrices). Defaults to None (precisions are initialized using the 'init_params' method). The shape depends on 'covariance_type' 
random_state = None          # Defaults to None. If int, random_state is the seed used by the random number generator. If RandomState instance, random_state is the random number generator. If None, the random number generator is the RandomState instance used by np.random 
warm_start = False           # Defaults to False. If True, the solution of the last fitting is used as initialization for the next call of fit(). This can speed up convergence when fit is called several time on similar problems. 
verbose = 0                  # Defaults to 0. Enable verbose output. If 1 then it prints the current initialization and each iteration step. If greater than 1 then it prints also the log probability and the time needed for each step 
verbose_interval = 10        # Number of iteration done before the next print. Defaults to 10 

In [None]:
# Build the mixture model from ALL hyper-parameters set in the configuration cell above.
# Previously only n_components and covariance_type were passed, so editing any of the other
# values (tol, max_iter, random_state, ...) silently had no effect.
gmm = mixture.GaussianMixture(n_components=n_components,covariance_type=covariance_type,tol=tol,
                              reg_covar=reg_covar,max_iter=max_iter,n_init=n_init,init_params=init_params,
                              weights_init=weights_init,means_init=means_init,precisions_init=precisions_init,
                              random_state=random_state,warm_start=warm_start,verbose=verbose,
                              verbose_interval=verbose_interval)
gmm

In [None]:
# Predict Gaussian Mixture Model for all topics of NY Times
# For each topic the probabilities are split into two mixture components; the labels are then
# arranged so that 1 marks the SMALLER component and 0 the larger one.
TopicArraysNYT = []
PredictTopicsNYT = []
for t in range(0,numtopics):
    TopicArray = np.asarray(topicsNYT['topic_'+str(t)])
    X = TopicArray.reshape(-1,1)   # GaussianMixture expects a 2-D (n_samples, n_features) array
    gmm = gmm.fit(X)
    Z = gmm.predict(X)
    # minlength=2 keeps both counts defined when every article lands in component 0;
    # np.bincount(Z)[1] raised IndexError in that case.
    counts = np.bincount(Z,minlength=2)
    if counts[0]<=counts[1]:
        Y = (Z-1)*(Z-1)            # swap the labels 0 <-> 1
    else:
        Y = Z
    TopicArraysNYT.append(X)
    PredictTopicsNYT.append(Y)
    plt.figure()
    plt.hist(Y)
    plt.title('Histogram for NY Times: topic_'+str(t))

In [None]:
# Predict Gaussian Mixture Model for all topics of Washington Post
# For each topic the probabilities are split into two mixture components; the labels are then
# arranged so that 1 marks the SMALLER component and 0 the larger one.
TopicArraysWP = []
PredictTopicsWP = []
for t in range(0,numtopics):
    TopicArray = np.asarray(topicsWP['topic_'+str(t)])
    X = TopicArray.reshape(-1,1)   # GaussianMixture expects a 2-D (n_samples, n_features) array
    gmm = gmm.fit(X)
    Z = gmm.predict(X)
    # minlength=2 keeps both counts defined when every article lands in component 0;
    # np.bincount(Z)[1] raised IndexError in that case.
    counts = np.bincount(Z,minlength=2)
    if counts[0]<=counts[1]:
        Y = (Z-1)*(Z-1)            # swap the labels 0 <-> 1
    else:
        Y = Z
    TopicArraysWP.append(X)
    PredictTopicsWP.append(Y)
    plt.figure()
    plt.hist(Y)
    plt.title('Histogram for Washington Post: topic_'+str(t))

In [None]:
# Predict Gaussian Mixture Model for all topics of WSJ
# For each topic the probabilities are split into two mixture components; the labels are then
# arranged so that 1 marks the SMALLER component and 0 the larger one.
TopicArraysWSJ = []
PredictTopicsWSJ = []
for t in range(0,numtopics):
    TopicArray = np.asarray(topicsWSJ['topic_'+str(t)])
    X = TopicArray.reshape(-1,1)   # GaussianMixture expects a 2-D (n_samples, n_features) array
    gmm = gmm.fit(X)
    Z = gmm.predict(X)
    # minlength=2 keeps both counts defined when every article lands in component 0;
    # np.bincount(Z)[1] raised IndexError in that case.
    counts = np.bincount(Z,minlength=2)
    if counts[0]<=counts[1]:
        Y = (Z-1)*(Z-1)            # swap the labels 0 <-> 1
    else:
        Y = Z
    TopicArraysWSJ.append(X)
    PredictTopicsWSJ.append(Y)
    plt.figure()
    plt.hist(Y)
    plt.title('Histogram for WSJ: topic_'+str(t))

## Keep relevant topics and remove irrelevant topics

In [None]:
topicsNYTbinary = topicsNYT.iloc[:,0:(numtopics+5)].copy()
for t in range(0,numtopics):
    topicsNYTbinary['binarytopic_'+str(t)] = PredictTopicsNYT[t]
topicsNYTbinary

In [None]:
topicsWPbinary = topicsWP.iloc[:,0:(numtopics+5)].copy()
for t in range(0,numtopics):
    topicsWPbinary['binarytopic_'+str(t)] = PredictTopicsWP[t]
topicsWPbinary

In [None]:
topicsWSJbinary = topicsWSJ.iloc[:,0:(numtopics+5)].copy()
for t in range(0,numtopics):
    topicsWSJbinary['binarytopic_'+str(t)] = PredictTopicsWSJ[t]
topicsWSJbinary

In [None]:
# Round-trip the binary-topic tables through disk so later sections can start from the files.
# Nothing earlier in this notebook writes these files, so on a fresh run the reads below failed;
# write each table built in the cells above when its file does not exist yet.
# NOTE(review): an existing file always wins over the freshly computed table - delete the files
# after changing the model or the number of topics.
if not os.path.exists('/mnt/data/TextAnalysis/topicsNYTbinary.txt'):
    topicsNYTbinary.to_csv('/mnt/data/TextAnalysis/topicsNYTbinary.txt',sep='|',encoding='utf-8')
if not os.path.exists('/mnt/data/TextAnalysis/topicsWPbinary.txt'):
    topicsWPbinary.to_csv('/mnt/data/TextAnalysis/topicsWPbinary.txt',sep='|',encoding='utf-8')
if not os.path.exists('/mnt/data/TextAnalysis/topicsWSJbinary.txt'):
    topicsWSJbinary.to_csv('/mnt/data/TextAnalysis/topicsWSJbinary.txt',sep='|',encoding='utf-8')
topicsNYTbinary = pd.read_csv('/mnt/data/TextAnalysis/topicsNYTbinary.txt',encoding='utf-8',sep='|',index_col=0)
topicsWPbinary = pd.read_csv('/mnt/data/TextAnalysis/topicsWPbinary.txt',encoding='utf-8',sep='|',index_col=0)
topicsWSJbinary = pd.read_csv('/mnt/data/TextAnalysis/topicsWSJbinary.txt',encoding='utf-8',sep='|',index_col=0)

## Plot topic trends over time

In [None]:
# https://machinelearningmastery.com/time-series-data-visualization-with-python/

In [None]:
dftopicsNYT = topicsNYTbinary.iloc[:,(numtopics+5):].copy()
dftopicsNYT['date'] = pd.to_datetime(topicsNYTbinary['date'])
dftopicsNYT

In [None]:
# One figure per binary-topic column: monthly mean of the label, titled with the topic's
# ten most relevant terms. 'date' is the last column, hence the [:-1].
for i in dftopicsNYT.iloc[:,:-1].columns:
    df = dftopicsNYT[['date',i]].copy()
    dfgroup = df.groupby(pd.Grouper(key='date',freq='M')).mean()
    # i looks like 'binarytopic_<k>': i[12:] is the topic number, i[6:] is 'topic_<k>'
    words = u', '.join(topterms[int(i[12:])][:10])
    plt.figure()
    plt.plot(dfgroup)
    # Pass a real timestamp: a plain string on a datetime axis is not reliably interpreted as a date.
    plt.axvline(x=pd.Timestamp('2011-03-31'),color='r',linestyle='--')
    plt.title('NY Times '+i[6:]+' ('+words+') by month')

In [None]:
dftopicsWP = topicsWPbinary.iloc[:,(numtopics+5):].copy()
dftopicsWP['date'] = pd.to_datetime(topicsWPbinary['date'])
dftopicsWP

In [None]:
# One figure per binary-topic column: monthly mean of the label, titled with the topic's
# ten most relevant terms. 'date' is the last column, hence the [:-1].
for i in dftopicsWP.iloc[:,:-1].columns:
    df = dftopicsWP[['date',i]].copy()
    dfgroup = df.groupby(pd.Grouper(key='date',freq='M')).mean()
    # i looks like 'binarytopic_<k>': i[12:] is the topic number, i[6:] is 'topic_<k>'
    words = u', '.join(topterms[int(i[12:])][:10])
    plt.figure()
    plt.plot(dfgroup)
    # Pass a real timestamp: a plain string on a datetime axis is not reliably interpreted as a date.
    plt.axvline(x=pd.Timestamp('2013-06-30'),color='r',linestyle='--')
    plt.title('Washington Post '+i[6:]+' ('+words+') by month')

In [None]:
dftopicsWSJ = topicsWSJbinary.iloc[:,(numtopics+5):].copy()
dftopicsWSJ['date'] = pd.to_datetime(topicsWSJbinary['date'])
dftopicsWSJ

In [None]:
# One figure per binary-topic column of the WSJ table ('date' is the last column and is skipped):
# monthly mean of the label, titled with the topic's ten most relevant terms. No marker line here.
for i in dftopicsWSJ.columns[:-1]:
    df = dftopicsWSJ.loc[:,['date',i]].copy()
    dfgroup = df.groupby(pd.Grouper(key='date',freq='M')).mean()
    topic_number = int(i[12:])          # i looks like 'binarytopic_<k>'
    words = u', '.join(topterms[topic_number][:10])
    plt.figure()
    plt.plot(dfgroup)
    plt.title('WSJ '+i[6:]+' ('+words+') by month')

## Sentiment - polarity and subjectivity
`sentiment()` returns a (polarity, subjectivity) pair: polarity ranges from -1.0 (negative) to +1.0 (positive) and subjectivity from 0.0 (objective) to 1.0 (subjective).

`modality()` returns the degree of certainty as a value between -1.0 and +1.0, where values > +0.5 represent facts.

`mood()` returns either INDICATIVE, IMPERATIVE, CONDITIONAL or SUBJUNCTIVE for a given parsed sentence.

In [None]:
import pandas as pd, numpy as np, os, datetime, time, re, matplotlib
import pattern.en as pat
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
topicsNYT = pd.read_csv('/mnt/data/TextAnalysis/topicsNYT.txt',encoding='utf-8',sep='|',index_col=0)
topicsWP = pd.read_csv('/mnt/data/TextAnalysis/topicsWP.txt',encoding='utf-8',sep='|',index_col=0)
topicsWSJ = pd.read_csv('/mnt/data/TextAnalysis/topicsWSJ.txt',encoding='utf-8',sep='|',index_col=0)

In [None]:
topicsNYTbinary = pd.read_csv('/mnt/data/TextAnalysis/topicsNYTbinary.txt',encoding='utf-8',sep='|',index_col=0)
topicsWPbinary = pd.read_csv('/mnt/data/TextAnalysis/topicsWPbinary.txt',encoding='utf-8',sep='|',index_col=0)
topicsWSJbinary = pd.read_csv('/mnt/data/TextAnalysis/topicsWSJbinary.txt',encoding='utf-8',sep='|',index_col=0)

In [None]:
patternNYT = topicsNYT[['date','content','parsedtexts']].copy()
patternWP = topicsWP[['date','content','parsedtexts']].copy()
patternWSJ = topicsWSJ[['date','content','parsedtexts']].copy()

In [None]:
### Start IPython Clusters
import ipyparallel as ipp
from ipyparallel import Client
client = Client() # run on local ipcluster
# client = Client('Your security group',
#                 sshkey='Your key')
# Load-balanced view: tasks are handed to whichever engine is free.
lbview = client.load_balanced_view()
pnodes = len(client.ids)     # Number of engines currently connected to the client
# Function-call form works on Python 2 and 3 and matches the print (...) calls used elsewhere.
print(pnodes)

In [None]:
@lbview.parallel(block=True)
def _pattern_(packet):
    """Score the sentiment of every article in one chunk of publication dates.

    The function is shipped to an ipyparallel engine, so the modules it needs are
    imported inside the body.

    Parameters
    ----------
    packet : sequence of (date, contents, parsedtexts) tuples
        ``contents`` and ``parsedtexts`` are parallel lists holding one entry per
        article published on ``date``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, content, parsedtexts, polarity, subjectivity``, one row per
        article. ``polarity`` and ``subjectivity`` are the two values returned by
        ``pattern.en.sentiment``, or ``u'N/A'`` when the parsed text is a float
        (presumably NaN for a missing value - confirm). The index restarts at 0 for
        every date, exactly as before.
    """
    import pandas as pd
    import pattern.en as pat
    columns = ['date','content','parsedtexts','polarity','subjectivity']
    # An empty chunk used to fail on tuple unpacking; return an empty, correctly labelled frame.
    if not packet:
        return pd.DataFrame(columns=columns)
    frames = []
    for d,content,parsedtext in packet:
        result = []
        for index,text in enumerate(content):
            ptext = parsedtext[index]
            if isinstance(ptext,float):
                sent = (u'N/A',u'N/A')
            else:
                sent = pat.sentiment(text)
            result.append([d,text,ptext,sent[0],sent[1]])
        frames.append(pd.DataFrame(result,columns=columns))
    # Concatenate once instead of re-copying the accumulated frame with df.append per date.
    return pd.concat(frames)

In [None]:
def chunks(l, n):
    """Split sequence ``l`` into consecutive slices of at most ``n`` items.

    ``n`` smaller than 1 is treated as 1. The last slice may be shorter than the
    others; an empty ``l`` gives an empty list.
    """
    size = n if n > 1 else 1
    pieces = []
    start = 0
    total = len(l)
    while start < total:
        pieces.append(l[start:start+size])
        start += size
    return pieces

#### NY Times

In [None]:
dates = np.unique(patternNYT['date']).tolist()
len(dates)

In [None]:
# One (date, contents, parsedtexts) tuple per publication date, in ascending date order.
# A single groupby pass replaces two boolean-mask scans of the whole frame for every date.
# Rows without a date are left out (they previously produced an entry with empty lists).
inputs = [(dt,list(grp['content']),list(grp['parsedtexts']))
          for dt,grp in patternNYT.groupby('date',sort=True)]
len(inputs)

In [None]:
iterspernode = 1
ndates = chunks(inputs,max(int(len(inputs)/(pnodes*iterspernode)),1))

In [None]:
'each node gets {} dates'.format(len(ndates[0]))

In [None]:
start_time = time.time()
# _pattern_.map returns one DataFrame per chunk; join them with a single pd.concat
# (DataFrame.append was removed in pandas 2.0). No chunks -> empty frame, as before.
chunk_frames = list(_pattern_.map(ndates))
outputNYT = pd.concat(chunk_frames) if chunk_frames else pd.DataFrame()
run_time = (time.time()-start_time)/3600
print ("This takes %s hours to run" %run_time)

In [None]:
outputNYT = outputNYT.reset_index(drop=True)
outputNYT

In [None]:
outputNYT.to_csv('/mnt/data/TextAnalysis/patternNYT.txt',sep='|',encoding='utf-8')

#### Washington Post

In [None]:
dates = np.unique(patternWP['date']).tolist()
len(dates)

In [None]:
# One (date, contents, parsedtexts) tuple per publication date, in ascending date order.
# A single groupby pass replaces two boolean-mask scans of the whole frame for every date.
# Rows without a date are left out (they previously produced an entry with empty lists).
inputs = [(dt,list(grp['content']),list(grp['parsedtexts']))
          for dt,grp in patternWP.groupby('date',sort=True)]
len(inputs)

In [None]:
iterspernode = 1
ndates = chunks(inputs,max(int(len(inputs)/(pnodes*iterspernode)),1))

In [None]:
'each node gets {} dates'.format(len(ndates[0]))

In [None]:
start_time = time.time()
# _pattern_.map returns one DataFrame per chunk; join them with a single pd.concat
# (DataFrame.append was removed in pandas 2.0). No chunks -> empty frame, as before.
chunk_frames = list(_pattern_.map(ndates))
outputWP = pd.concat(chunk_frames) if chunk_frames else pd.DataFrame()
run_time = (time.time()-start_time)/3600
print ("This takes %s hours to run" %run_time)

In [None]:
outputWP = outputWP.reset_index(drop=True)
outputWP

In [None]:
outputWP.to_csv('/mnt/data/TextAnalysis/patternWP.txt',sep='|',encoding='utf-8')

#### Wall Street Journal

In [None]:
dates = np.unique(patternWSJ['date']).tolist()
len(dates)

In [None]:
# One (date, contents, parsedtexts) tuple per publication date, in ascending date order.
# A single groupby pass replaces two boolean-mask scans of the whole frame for every date.
# Rows without a date are left out (they previously produced an entry with empty lists).
inputs = [(dt,list(grp['content']),list(grp['parsedtexts']))
          for dt,grp in patternWSJ.groupby('date',sort=True)]
len(inputs)

In [None]:
iterspernode = 1
ndates = chunks(inputs,max(int(len(inputs)/(pnodes*iterspernode)),1))

In [None]:
'each node gets {} dates'.format(len(ndates[0]))

In [None]:
start_time = time.time()
# _pattern_.map returns one DataFrame per chunk; join them with a single pd.concat
# (DataFrame.append was removed in pandas 2.0). No chunks -> empty frame, as before.
chunk_frames = list(_pattern_.map(ndates))
outputWSJ = pd.concat(chunk_frames) if chunk_frames else pd.DataFrame()
run_time = (time.time()-start_time)/3600
print ("This takes %s hours to run" %run_time)

In [None]:
outputWSJ = outputWSJ.reset_index(drop=True)
outputWSJ

In [None]:
outputWSJ.to_csv('/mnt/data/TextAnalysis/patternWSJ.txt',sep='|',encoding='utf-8')

#### Plot sentiment

In [None]:
import pandas as pd, numpy as np, os, datetime, time, re, matplotlib
import pattern.en as pat
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
patternNYT = pd.read_csv('/mnt/data/TextAnalysis/patternNYT.txt',encoding='utf-8',sep='|',index_col=0,chunksize=10000)
patternNYT = pd.concat(patternNYT)
patternWP = pd.read_csv('/mnt/data/TextAnalysis/patternWP.txt',encoding='utf-8',sep='|',index_col=0)
patternWSJ = pd.read_csv('/mnt/data/TextAnalysis/patternWSJ.txt',encoding='utf-8',sep='|',index_col=0)

In [None]:
dfNYT = patternNYT[['date','polarity']].copy()
dfNYT['date'] = pd.to_datetime(dfNYT['date'])
dfWP = patternWP[['date','polarity']].copy()
dfWP['date'] = pd.to_datetime(dfWP['date'])
dfWSJ = patternWSJ[['date','polarity']].copy()
dfWSJ['date'] = pd.to_datetime(dfWSJ['date'])

In [None]:
# Build the plotting frames from the in-memory results of the parallel run.
# NOTE(review): this overwrites the frames the preceding cell built from the pattern*.txt files;
# run only one of the two cells.
# The in-memory polarity column contains the placeholder u'N/A' for articles without parsed
# text, which makes it non-numeric and breaks the monthly mean; coerce those entries to NaN
# (read_csv already does this for the file-based frames).
dfNYT = outputNYT[['date','polarity']].copy()
dfNYT['date'] = pd.to_datetime(dfNYT['date'])
dfNYT['polarity'] = pd.to_numeric(dfNYT['polarity'],errors='coerce')
dfWP = outputWP[['date','polarity']].copy()
dfWP['date'] = pd.to_datetime(dfWP['date'])
dfWP['polarity'] = pd.to_numeric(dfWP['polarity'],errors='coerce')
dfWSJ = outputWSJ[['date','polarity']].copy()
dfWSJ['date'] = pd.to_datetime(dfWSJ['date'])
dfWSJ['polarity'] = pd.to_numeric(dfWSJ['polarity'],errors='coerce')

In [None]:
dfgroupNYT = dfNYT.groupby(pd.Grouper(key='date',freq='M')).mean()
dfgroupWP = dfWP.groupby(pd.Grouper(key='date',freq='M')).mean()
dfgroupWSJ = dfWSJ.groupby(pd.Grouper(key='date',freq='M')).mean()

In [None]:
# One figure per newspaper: monthly mean polarity, optionally with a red dashed marker line.
# Each entry: (monthly frame, display name, marker date or None, output file).
polarity_plots = [
    (dfgroupNYT,'NY Times','2011-03-31','/mnt/data/TextAnalysis-Figures/PolaritybyMonthNYT.png'),
    (dfgroupWP,'Washington Post','2013-06-30','/mnt/data/TextAnalysis-Figures/PolaritybyMonthWP.png'),
    (dfgroupWSJ,'Wall Street Journal',None,'/mnt/data/TextAnalysis-Figures/PolaritybyMonthWSJ.png'),
]
for monthly,paper,marker,figpath in polarity_plots:
    plt.figure(figsize=(20,10))
    plt.plot(monthly)
    if marker is not None:
        # Pass a real timestamp: a plain string on a datetime axis is not reliably interpreted as a date.
        plt.axvline(x=pd.Timestamp(marker),color='r',linestyle='--')
    plt.title('Polarity of '+paper+' by month',fontsize=20)
    plt.xlabel('Year', fontsize=16)
    plt.ylabel('Polarity', fontsize=16)
    plt.savefig(figpath,bbox_inches='tight')

#### Sentiment of titles vs sentiment of content

In [None]:
import pandas as pd, numpy as np, os, datetime, time, re, matplotlib
import pattern.en as pat
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
### NY Times: read in 10000-row chunks and stitch the chunks back together
# NOTE(review): the first section of this notebook reads the sources from '/mnt/data/', this one
# from '/mnt/data/TextAnalysis/' - confirm both locations hold the same files.
nyt_chunks = pd.read_csv('/mnt/data/TextAnalysis/NYT.txt',encoding='utf-8',sep='|',index_col=0,chunksize=10000)
dfNYT = pd.concat(nyt_chunks)
### Wall Street Journal: read, drop rows without cleaned text, de-duplicate on it, renumber rows
dfWSJ = pd.read_csv('/mnt/data/TextAnalysis/WSJ.txt',encoding='utf-8',sep='|',index_col=0)
dfWSJ = (dfWSJ.dropna(subset=['Cleaned_text'])
              .drop_duplicates(subset=['Cleaned_text'],keep='first')
              .reset_index(drop=True))

In [None]:
### Read pattern files
patternNYT = pd.read_csv('/mnt/data/TextAnalysis/patternNYT.txt',encoding='utf-8',sep='|',index_col=0,chunksize=10000)
patternNYT = pd.concat(patternNYT)
patternWSJ = pd.read_csv('/mnt/data/TextAnalysis/patternWSJ.txt',encoding='utf-8',sep='|',index_col=0)

In [None]:
# Title, raw text and cleaned text of every NYT article plus the content sentiment computed earlier.
nyt_renames = {'new_text':'cleaned_text'}
dataNYT = dfNYT[['date','title','content','new_text']].rename(columns=nyt_renames)
# NOTE(review): these assignments align on the row index. patternNYT was assembled date by date
# and re-indexed, so this pairs the right rows only if dfNYT has the same order and index -
# confirm, or join on date and text instead.
for sentiment_column in ['parsedtexts','polarity','subjectivity']:
    dataNYT[sentiment_column] = patternNYT[sentiment_column]
dataNYT[:2]

In [None]:
# Title, raw text and cleaned text of every WSJ article (under the shared column names) plus the
# content sentiment computed earlier.
wsj_renames = {'Publication date':'date',
               'Title':'title',
               'Full text':'content',
               'Cleaned_text':'cleaned_text'}
dataWSJ = dfWSJ[['Publication date','Title','Full text','Cleaned_text']].rename(columns=wsj_renames)
# NOTE(review): these assignments align on the row index. patternWSJ was assembled date by date
# and re-indexed, so this pairs the right rows only if dfWSJ has the same order and index -
# confirm, or join on date and text instead.
for sentiment_column in ['parsedtexts','polarity','subjectivity']:
    dataWSJ[sentiment_column] = patternWSJ[sentiment_column]
dataWSJ[:2]

In [None]:
### Start IPython Clusters
import ipyparallel as ipp
from ipyparallel import Client
client = Client() # run on local ipcluster
# client = Client('Your security group',
#                 sshkey='Your key')
# Load-balanced view: tasks are handed to whichever engine is free.
lbview = client.load_balanced_view()
pnodes = len(client.ids)     # Number of engines currently connected to the client
# Function-call form works on Python 2 and 3 and matches the print (...) calls used elsewhere.
print(pnodes)

In [None]:
@lbview.parallel(block=True)
def _pattern_title_(packet):
    """Score the sentiment of every article TITLE in one chunk of publication dates.

    The function is shipped to an ipyparallel engine, so the modules it needs are
    imported inside the body.

    Parameters
    ----------
    packet : sequence of 7-tuples
        ``(date, titles, contents, cleaned_texts, parsedtexts, polarities,
        subjectivities)``; the six lists are parallel, one entry per article
        published on ``date``. ``polarities``/``subjectivities`` are the content
        scores computed earlier and are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, title, content, cleanedtext, parsedtext, contentpolarity,
        titlepolarity, contentsubjectivity, titlesubjectivity``, one row per
        article. The title scores are the two values returned by
        ``pattern.en.sentiment`` cast to ``numpy.float64``. The index restarts at 0
        for every date, exactly as before.
    """
    import numpy as np
    import pandas as pd
    import pattern.en as pat
    columns = ['date','title','content','cleanedtext','parsedtext','contentpolarity','titlepolarity',
               'contentsubjectivity','titlesubjectivity']
    # An empty chunk used to fail on tuple unpacking; return an empty, correctly labelled frame.
    if not packet:
        return pd.DataFrame(columns=columns)
    frames = []
    for d,title,content,cleaned_text,parsedtext,polarity,subjectivity in packet:
        result = []
        for ind,val in enumerate(title):
            sent = pat.sentiment(val)
            result.append([d,val,content[ind],cleaned_text[ind],parsedtext[ind],polarity[ind],np.float64(sent[0]),
                           subjectivity[ind],np.float64(sent[1])])
        frames.append(pd.DataFrame(result,columns=columns))
    # Concatenate once instead of re-copying the accumulated frame with df.append per date.
    return pd.concat(frames)

In [None]:
def chunks(l, n):
    """Cut `l` into consecutive slices of at most `n` items each.

    `n` is clamped to a minimum of 1; the final slice may be shorter.
    Returns a list of slices (an empty list when `l` is empty).
    """
    step = n if n > 1 else 1
    pieces = []
    for start in range(0, len(l), step):
        pieces.append(l[start:start + step])
    return pieces

In [None]:
dates = np.unique(dataNYT['date']).tolist()
len(dates)

In [None]:
inputs = []
for dt in dates:
    data = dataNYT.loc[dataNYT.date==dt]
    inputs.append((dt,list(data['title']),list(data['content']),list(data['cleaned_text']),list(data['parsedtexts']),
                   list(data['polarity']),list(data['subjectivity'])))
len(inputs)

In [None]:
iterspernode = 1
ndates = chunks(inputs,max(int(len(inputs)/(pnodes*iterspernode)),1))

In [None]:
'each node gets {} dates'.format(len(ndates[0]))

In [None]:
outputNYT = pd.DataFrame()
start_time = time.time()
outputNYT = outputNYT.append(_pattern_title_.map(ndates))
run_time = (time.time()-start_time)/60
print ("This takes %s minutes to run" %run_time)

In [None]:
outputNYT = outputNYT.reset_index(drop=True)
outputNYT

In [None]:
dates = np.unique(dataWSJ['date']).tolist()
len(dates)

In [None]:
inputs = []
for dt in dates:
    data = dataWSJ.loc[dataWSJ.date==dt]
    inputs.append((dt,list(data['title']),list(data['content']),list(data['cleaned_text']),list(data['parsedtexts']),
                   list(data['polarity']),list(data['subjectivity'])))
len(inputs)

In [None]:
iterspernode = 1
ndates = chunks(inputs,max(int(len(inputs)/(pnodes*iterspernode)),1))

In [None]:
'each node gets {} dates'.format(len(ndates[0]))

In [None]:
outputWSJ = pd.DataFrame()
start_time = time.time()
outputWSJ = outputWSJ.append(_pattern_title_.map(ndates))
run_time = (time.time()-start_time)/60
print ("This takes %s minutes to run" %run_time)

In [None]:
outputWSJ = outputWSJ.reset_index(drop=True)
outputWSJ

#### Plot sentiment of title and content

In [None]:
dfNYT = outputNYT[['date','contentpolarity','titlepolarity']].copy()
dfNYT['date'] = pd.to_datetime(dfNYT['date'])
dfWSJ = outputWSJ[['date','contentpolarity','titlepolarity']].copy()
dfWSJ['date'] = pd.to_datetime(dfWSJ['date'])

In [None]:
dfgroupNYT = dfNYT.groupby(pd.Grouper(key='date',freq='M')).mean()
dfgroupWSJ = dfWSJ.groupby(pd.Grouper(key='date',freq='M')).mean()

In [None]:
plt.figure(figsize=(20,10))
plt.plot(dfgroupNYT)
plt.axvline(x='2011-03-31',color='r',linestyle='--')
plt.legend(('Content Polarity','Title Polarity'),loc=0,fontsize='16')
plt.title('Polarity of NY Times title and content by month',fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Polarity', fontsize=16)
plt.savefig('/mnt/data/TextAnalysis-Figures/PolarityTitleContentNYT.png',bbox_inches='tight')

plt.figure(figsize=(20,10))
plt.plot(dfgroupWSJ)
plt.legend(('Content Polarity','Title Polarity'),loc=0,fontsize='16')
plt.title('Polarity of Wall Street Journal title and content by month',fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Polarity', fontsize=16)
plt.savefig('/mnt/data/TextAnalysis-Figures/PolarityTitleContentWSJ.png',bbox_inches='tight')

In [None]:
dfNYT = outputNYT[['date','contentsubjectivity','titlesubjectivity']].copy()
dfNYT['date'] = pd.to_datetime(dfNYT['date'])
dfWSJ = outputWSJ[['date','contentsubjectivity','titlesubjectivity']].copy()
dfWSJ['date'] = pd.to_datetime(dfWSJ['date'])

In [None]:
dfgroupNYT = dfNYT.groupby(pd.Grouper(key='date',freq='M')).mean()
dfgroupWSJ = dfWSJ.groupby(pd.Grouper(key='date',freq='M')).mean()

In [None]:
# Monthly mean subjectivity of content vs. title. The legend order follows the
# column order of dfgroupNYT / dfgroupWSJ (contentsubjectivity, titlesubjectivity).
# Labels previously said "Polarity" (copied from the polarity plots).
plt.figure(figsize=(20,10))
plt.plot(dfgroupNYT)
plt.axvline(x='2011-03-31',color='r',linestyle='--')
plt.legend(('Content Subjectivity','Title Subjectivity'),loc=0,fontsize='16')
plt.title('Subjectivity of NY Times title and content by month',fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Subjectivity', fontsize=16)
plt.savefig('/mnt/data/TextAnalysis-Figures/SubjectivityTitleContentNYT.png',bbox_inches='tight')

plt.figure(figsize=(20,10))
plt.plot(dfgroupWSJ)
plt.legend(('Content Subjectivity','Title Subjectivity'),loc=0,fontsize='16')
plt.title('Subjectivity of Wall Street Journal title and content by month',fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Subjectivity', fontsize=16)
plt.savefig('/mnt/data/TextAnalysis-Figures/SubjectivityTitleContentWSJ.png',bbox_inches='tight')

In [None]:
dfNYT = outputNYT[['date','contentpolarity','titlepolarity']].copy()
dfNYT['date'] = pd.to_datetime(dfNYT['date'])
dfWSJ = outputWSJ[['date','contentpolarity','titlepolarity']].copy()
dfWSJ['date'] = pd.to_datetime(dfWSJ['date'])

In [None]:
poldifNYT = pd.DataFrame()
poldifNYT['date'] = dfNYT['date']
poldifNYT['polaritydifference'] = dfNYT['contentpolarity']-dfNYT['titlepolarity']

In [None]:
poldifgroupNYT = poldifNYT.groupby(pd.Grouper(key='date',freq='M')).mean()

In [None]:
# Monthly mean of (content polarity - title polarity) for NY Times articles.
plt.figure()  # was a bare `plt.figure`, which only references the function and never calls it
# plt.figure(figsize=(20,10))
plt.plot(poldifgroupNYT)
plt.axvline(x='2011-03-31',color='r',linestyle='--')
plt.title('Polarity difference of NY Times title and content by month',fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Polarity', fontsize=16)

In [None]:
poldifWSJ = pd.DataFrame()
poldifWSJ['date'] = dfWSJ['date']
poldifWSJ['polaritydifference'] = dfWSJ['contentpolarity']-dfWSJ['titlepolarity']

In [None]:
poldifgroupWSJ = poldifWSJ.groupby(pd.Grouper(key='date',freq='M')).mean()

In [None]:
# Monthly mean of (content polarity - title polarity) for WSJ articles.
plt.figure()  # was a bare `plt.figure`, which only references the function and never calls it
# plt.figure(figsize=(20,10))
plt.plot(poldifgroupWSJ)
plt.title('Polarity difference of WSJ title and content by month',fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Polarity', fontsize=16)

In [None]:
dfNYT = outputNYT[['date','contentsubjectivity','titlesubjectivity']].copy()
dfNYT['date'] = pd.to_datetime(dfNYT['date'])
dfWSJ = outputWSJ[['date','contentsubjectivity','titlesubjectivity']].copy()
dfWSJ['date'] = pd.to_datetime(dfWSJ['date'])

In [None]:
subdifNYT = pd.DataFrame()
subdifNYT['date'] = dfNYT['date']
subdifNYT['subjectivitydifference'] = dfNYT['contentsubjectivity']-dfNYT['titlesubjectivity']

In [None]:
subdifgroupNYT = subdifNYT.groupby(pd.Grouper(key='date',freq='M')).mean()

In [None]:
# Monthly mean of (content subjectivity - title subjectivity) for NY Times articles.
plt.figure()  # was a bare `plt.figure`, which only references the function and never calls it
# plt.figure(figsize=(20,10))
plt.plot(subdifgroupNYT)
plt.axvline(x='2011-03-31',color='r',linestyle='--')
plt.title('Subjectivity difference of NY Times title and content by month',fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Subjectivity', fontsize=16)

In [None]:
subdifWSJ = pd.DataFrame()
subdifWSJ['date'] = dfWSJ['date']
subdifWSJ['subjectivitydifference'] = dfWSJ['contentsubjectivity']-dfWSJ['titlesubjectivity']

In [None]:
subdifgroupWSJ = subdifWSJ.groupby(pd.Grouper(key='date',freq='M')).mean()

In [None]:
# Monthly mean of (content subjectivity - title subjectivity) for WSJ articles.
plt.figure()  # was a bare `plt.figure`, which only references the function and never calls it
# plt.figure(figsize=(20,10))
plt.plot(subdifgroupWSJ)
plt.title('Subjectivity difference of WSJ title and content by month',fontsize=20)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Subjectivity', fontsize=16)

## Match labels of other newspapers with labels of NYT using LDA topics
Method 1: Support Vector Machines (Scikit-learn package)

Method 2: Neural Network (TensorFlow)

In [None]:
import pandas as pd, numpy as np, os, datetime, time, re, matplotlib
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
### Read NY Times
dfNYT = pd.read_csv('/mnt/data/TextAnalysis/NYT.txt',encoding='utf-8',sep='|',index_col=0,chunksize=10000)
dfNYT = pd.concat(dfNYT)
### Read Wall Street Journal
dfWSJ = pd.read_csv('/mnt/data/TextAnalysis/WSJ.txt',encoding='utf-8',sep='|',index_col=0)

In [None]:
### Remove Wall Street Journal NaN rows and duplicates
dfWSJ.dropna(subset=['Cleaned_text'],inplace=True)
dfWSJ.drop_duplicates(subset=['Cleaned_text'],keep='first',inplace=True)
dfWSJ.reset_index(drop=True,inplace=True)

In [None]:
import sys, os, codecs
scriptpath = '/home/ubuntu/Codes/Text-Analytics-Module-master/Code'
sys.path.append(scriptpath)
import basic_text_processing_functions as tx
from basic_text_processing_functions import *

In [None]:
import spacy
nlp = spacy.load('en')

In [None]:
pathloc = '/mnt/data/TextAnalysis/'

In [None]:
pnodes = 16
if 1==1:
    with codecs.open(pathloc+'default.cfg','w',encoding='utf-8') as f:
        f.write(json.dumps({'batch_size':1000,'n_threads':pnodes,'fpathroot':pathloc,'fpathappend':u'','entity_sub':True}))
    batch_size,n_threads,fpathroot,fpathappend,entity_sub,numtopics = tx._config_text_analysis_(pathloc+'default.cfg')
else:
    batch_size,n_threads,fpathroot,fpathappend,entity_sub,numtopics = tx._config_text_analysis_(pathloc+'default.cfg')

In [None]:
tx.batch_size,tx.n_threads,tx.fpathroot,tx.fpathappend,tx.entity_sub,tx.numtopics = batch_size,n_threads,fpathroot,fpathappend,entity_sub,numtopics

In [None]:
tx.fpathroot, fpathroot, pathloc, fpathappend

In [None]:
tx.n_threads

In [None]:
from gensim import corpora
### Create Dictionary
if 1==0:
    vocab,gensim_dictionary,cts = tx._make_dict_(grammed_texts,floc=fpathroot+fpathappend+'dict_gram.dict',
                                                 topfilter=99,bottomfilter=25,no_filters=False,keep_ent=False,
                                                 discard_list=discard_list,keep_list={})
    print(len(vocab))
else:
    gensim_dictionary = corpora.Dictionary.load(tx.fpathroot+'dict_gram.dict')
    print(len(gensim_dictionary))

In [None]:
if 1==0:
    grammed_corpus = tx._serialize_corpus_(grammed_texts,gensim_dictionary,outfpath=fpathroot+fpathappend+'_serialized.mm')
else:
    grammed_corpus_loc = tx.fpathroot+tx.fpathappend+'_serialized.mm'
    grammed_corpus = MmCorpus(grammed_corpus_loc)

In [None]:
### Perform LDA
numtopics = 100
ldafile = 'lda_'+str(numtopics)
if 1==0:
    lda = tx._lda_(gensim_dictionary,corpus_path=grammed_corpus,numtopics=numtopics,iterations=100) # defaults to 10 topics
    lda.save(pathloc+ldafile)
else: 
    lda = LdaMulticore.load(pathloc+ldafile)
lda.minimum_probability = 0.0

In [None]:
### Create Visualization of Topics
# `import pyLDAvis.gensim as ldavis` binds only the name `ldavis`; the top-level
# package must be imported explicitly for `pyLDAvis.save_html` below to resolve.
import pyLDAvis
import pyLDAvis.gensim as ldavis
if 1==0:  # flip to 1==1 to rebuild and save the interactive topic visualization
    ldaviz = ldavis.prepare(lda,grammed_corpus,gensim_dictionary)
    pyLDAvis.save_html(ldaviz,pathloc+'viz_'+ldafile+'.html')
lda.show_topics()

### Support vector machines vs logit vs regularized least squares

In [None]:
topicsNYTbinary = pd.read_csv('/mnt/data/TextAnalysis/topicsNYTbinary.txt',encoding='utf-8',sep='|',index_col=0,chunksize=10000)
topicsNYTbinary = pd.concat(topicsNYTbinary)
topicsWSJbinary = pd.read_csv('/mnt/data/TextAnalysis/topicsWSJbinary.txt',encoding='utf-8',sep='|',index_col=0)

In [None]:
dataNYT = dfNYT.copy()

In [None]:
### Create label_1 from section_name, subsection_name, and news_desk
# For each article, pick the most specific usable label:
#   1. subsection_name when present (the literal u'false' defers to news_desk),
#   2. otherwise a normalized section_name,
#   3. otherwise a normalized news_desk,
#   4. otherwise the missing value itself (NaN).
# Missing values are detected with `type(x)==float`, i.e. the NaN that pandas
# stores for empty cells in these text columns.
NYT_Label_1 = []
document_type = list(dataNYT['document_type'])
type_of_material = list(dataNYT['type_of_material'])
news_desk = list(dataNYT['news_desk'])
section_name = list(dataNYT['section_name'])
subsection_name = list(dataNYT['subsection_name'])
# Section names seen on rows without a subsection, split by whether the same
# string also occurs as a subsection name somewhere in the data.
dataNYTsub = dataNYT[pd.isnull(dataNYT['subsection_name'])]
sectionlist = sorted(dataNYTsub['section_name'].value_counts().index.tolist())
subsectionlist = sorted(dataNYT['subsection_name'].value_counts().index.tolist())
overlaplist = [a for a in sectionlist if a in subsectionlist]
differentlist = [a for a in sectionlist if a not in subsectionlist]
for ind,val in enumerate(subsection_name):
    if type(val)==float:
        # No subsection: fall back to section_name, then news_desk.
        if type(section_name[ind])==float:
            if type(news_desk[ind])==float:
                item = val
            elif news_desk[ind] in [u'Business',u'Culture',u'Escapes',u'Foreign',u'Great Homes and Destinations',
                                    u'OpEd',u'Science',u'Sunday Review',u'Styles',u'Travel',u'TStyle',u'U.S.',
                                    u'Your Money']:
                item = news_desk[ind]
            elif news_desk[ind].startswith(u'Crosswords'):
                item = u'Crosswords & Games'
            elif news_desk[ind]==u'N.Y. / Region':
                item = u'New York and Region'
            elif news_desk[ind]==u'U.S. / 9/11 Anniversary':
                item = u'9/11 Anniversary'
            elif news_desk[ind]==u'Your Money / Mortgages':
                item = u'Mortgages'
            else:
                item = val
        elif section_name[ind] in overlaplist:
            # Section string is already a known subsection label: use it as is.
            item = section_name[ind]

        elif section_name[ind] in differentlist:
            # Normalize composite / variant section names to one canonical label.
            if section_name[ind].startswith(u'Admin'):
                if document_type[ind]==u'article':
                    item = u'Corrections'
                else:
                    item = u'Multimedia'
            # This must be `elif`: with a plain `if`, an 'Admin...' section fell
            # through to the final `else` below and its Corrections/Multimedia
            # label was overwritten with the raw section name.
            elif section_name[ind].startswith(u'Arts'):
                item = u'Arts'
            elif section_name[ind].startswith(u'Auto'):
                item = u'Automobiles'
            elif section_name[ind].startswith(u'Books'):
                item = u'Books'
            elif section_name[ind].startswith(u'Business'):
                item = u'Business'
            elif section_name[ind].startswith(u'Crosswords'):
                item = u'Crosswords & Games'
            elif section_name[ind].startswith(u'Dining'):
                item = u'Dining & Wine'
            elif section_name[ind] in (u'Editorials',u'Public Editor'):
                item = u'Editorials'
            elif section_name[ind].startswith(u'Education'):
                item = u'Education'
            elif section_name[ind].startswith(u'Front Page'):
                # 'Front Page; X; Y...' -> keep only the segment after the first '; '.
                text = section_name[ind][section_name[ind].find(';')+2:]
                if text.find(';')==-1:
                    item = text
                else:
                    item = text[:text.find(';')]
            elif section_name[ind].startswith(u'Great Homes'):
                item = u'Great Homes and Destinations'
            elif section_name[ind].startswith(u'Health'):
                item = u'Health'
            elif section_name[ind] in [u'Home & Garden',u'Home and Garden; Style']:
                item = u'Home & Garden'
            elif section_name[ind] in [u'Learning',u'The Learning Network']:
                item = u'Learning'
            elif section_name[ind].startswith(u'Magazine'):
                item = u'Magazine'
            elif section_name[ind].startswith(u'Movies'):
                item = u'Movies'
            elif section_name[ind].startswith(u'Multimedia'):
                item = u'Multimedia'
            elif section_name[ind] in [u'N.Y. / Region',u'New York',u'New York and Region']:
                item = u'New York and Region'
            elif section_name[ind].startswith(u'Obituaries'):
                item = u'Obituaries'
            elif section_name[ind].startswith(u'Opinion'):
                item = u'Opinion'
            elif section_name[ind].startswith(u'Science'):
                item = u'Science'
            elif section_name[ind].startswith(u'Sports'):
                item = u'Sports'
            elif section_name[ind].startswith(u'Style') or section_name[ind] in [u'Booming',u'National',u'T:Style']:
                item = u'Style'
            elif section_name[ind].startswith(u'Technology'):
                item = u'Technology'
            elif section_name[ind].startswith(u'Theater'):
                item = u'Theater'
            elif section_name[ind].startswith(u'Today\u2019s Paper'):
                item = u"Today's Paper"
            elif section_name[ind].startswith(u'Travel'):
                item = u'Travel'
            elif section_name[ind] in [u'U.S.',u'U.S.; Obituaries',u'U.S.; Washington']:
                item = u'U.S.'
            elif section_name[ind].startswith(u'Washington'):
                item = u'Washington'
            elif section_name[ind].startswith(u'Week'):
                item = u'Week in Review'
            elif section_name[ind].startswith(u'World'):
                item = u'World'
            else:
                item = section_name[ind]
        else:
            item = val
    elif val==u'false':
        # Subsection recorded as the literal string 'false': use news_desk instead.
        if news_desk[ind]==u'National':
            item = u'Politics'
        elif type(news_desk[ind])!=float:
            item = news_desk[ind]
        else:
            item = val
    else:
        item = val
    # Final spelling cleanup: HTML-escaped ampersands, curly apostrophes, variants.
    if item==u'Art':
        label = u'Arts'
    elif item==u'Dealbook':
        label = u'DealBook'
    elif item==u'Dining &amp; Wine':
        label = u'Dining & Wine'
    elif item==u'Fashion &amp; Style':
        label = u'Fashion & Style'
    elif item==u'Media &amp; Advertising':
        label = u'Media & Advertising'
    elif item==u'Men\u2019s Style':
        label = u"Men's Style"
    elif item==u'Money &amp; Policy':
        label = u'Money & Policy'
    elif item==u'Weddings / Celebrations':
        label = u'Weddings/Celebrations'
    else:
        label = item
    NYT_Label_1.append(label)

In [None]:
dataNYT['label_1'] = NYT_Label_1

In [None]:
### Group label_1 to get label_2
# Ordered (label_2, [label_1 members]) pairs. Any label_1 value that appears in
# no member list (including missing values) is carried over unchanged.
label_2_groups = [
    (u'Retirement Plans',
     [u"401(k)'s and Similar Plans",u'Estate Planning',u"Individual Retirement Accounts (IRA's)",
      u'Life and Disability Insurance',u'Retirement']),
    (u'Finance, Banking, and Investment',
     [u'Annuities',u'Asset Allocation',u'Brokerage and Bank Accounts',u'Credit Scores',
      u'Credit and Debit Cards',u'Financial Planners',u'Household Budgeting',u'Mutual Funds',
      u'Mutual Funds and ETFs',u'Paying for College',u'Stocks and Bonds',u'Your Money']),
    (u'Art and Design',
     [u'Arts',u'Art & Design',u'Dance',u'Design',u'Design & Interiors',u'International Arts',u'Theater',
      u'Theater Reviews',u'Tony Awards']),
    (u'World News',
     [u'Africa',u'Americas',u'Asia Pacific',u'Australia',u'Canada',u'Europe',u'Foreign',
      u'International Home',u'International Opinion',u'Middle East',u'Reach of War',u'What in the World',
      u'World']),
    (u'Automobiles',
     [u'Auto Insurance',u'Auto Loans',u'Automobiles',u'Collectible Cars',u'New Cars',u'Wheels']),
    (u'Sports',
     [u'Auto Racing',u'Baseball',u'College Basketball',u'College Football',u'Cricket',u'Cycling',
      u'Global Sports',u'Golf',u'Hockey',u'Horse Racing',u'More Sports',u'N.B.A.',u'Olympics',
      u'Olympics 2010',u'Pro Basketball',u'Pro Football',u'Rugby',u'Sailing',u'Skiing',u'Soccer',u'Sports',
      u'Tennis',u'World Cup',u'International Sports']),
    (u'Music, Movies, and Entertainment',
     [u'Awards Season',u'DVD',u'Entertainment',u'Movies',u'Music',u'Television']),
    (u'Beauty, Fashion, and Style',
     [u'Beauty',u'Couture Runway',u'Culture',u'Fashion & Beauty',u'Fashion & Style',u'Fashion Shows',
      u'International Style',u"Men's Fashion",u"Men's Style",u'Style',u'T Magazine',u'Trends',u'Weddings',
      u'Weddings/Celebrations',u"Women's Fashion"]),
    (u'Books & Book Review',
     [u'Books',u'Book Review',u'First Chapters',u'Sunday Book Review']),
    (u'Crosswords & Games',
     [u'Bridge',u'Chess']),
    (u'Business and Technology',
     [u'Business',u'Business Computing',u'Companies',u'DealBook',u'Entrepreneurship',u'Identify Theft',
      u'International Business',u'Internet',u'Job Market',u'Media',u'Media & Advertising',u'Personal Tech',
      u'Personal Tech Extra',u'Small Business',u'Small Business Email',u'Start-Ups',
      u"Stuart Elliott's In Advertising",u'Technology']),
    (u'Politics',
     [u'Campaign Stops',u'Columnists',u'Contributors',u'none',u'Opinion Today Email',u'Politics']),
    (u'Real Estate',
     [u'Commercial',u'Commercial Real Estate',u'Communities',u'Home Insurance',u'Key Magazine',u'Mortgages',
      u'Real Estate',u'manhattan']),
    (u'Regional News',
     [u'Connecticut',u'Long Island',u'New Jersey',u'New York and Region',u'The City',u'Westchester']),
    (u'Food, Cooking, and Dining',
     [u'Dining & Wine',u'Eat',u'Food',u'Wine, Beer & Cocktails']),
    (u'Economy',
     [u'Economy']),
    (u'Education',
     [u'Education',u'Education Life',u'Lesson Plans',u'Student Loans']),
    (u'Elections',
     [u'Election 2016',u'Elections']),
    (u'Science and Environment',
     [u'Energy & Environment ',u'Environment',u'Science', u'Space & Cosmos']),
    (u'Travel',
     [u'Escapes',u'Travel']),
    (u'Health and Wellness',
     [u'Family',u'Fitness & Nutrition',u'Health',u'Health Insurance',u'Live',u'Mind',u'Money & Policy',
      u'Move',u'Research',u'Views']),
    (u'Great Homes and Destinations',
     [u'Great Homes and Destinations',u'Great Homes and Destinations Multimedia']),
]
# Flatten to a label_1 -> label_2 lookup; setdefault keeps the first group that
# claims a member, matching top-to-bottom precedence of an if/elif chain.
label_1_to_label_2 = {}
for group_name,members in label_2_groups:
    for member in members:
        label_1_to_label_2.setdefault(member,group_name)
NYT_Label_2 = [label_1_to_label_2.get(first_level,first_level) for first_level in NYT_Label_1]

In [None]:
dataNYT['label_2'] = NYT_Label_2

In [None]:
### Create new dataframe from original dataframe and topic probabilities
columns1 = ['date','title','headline','author','content','new_text','document_type','type_of_material','news_desk',
            'keywords','section_name','subsection_name','print_page','word_count','url','label_1','label_2']
columns2 = [topicsNYTbinary.columns.tolist()[2:4],topicsNYTbinary.columns.tolist()[104:105],
            topicsNYTbinary.columns.tolist()[4:104],topicsNYTbinary.columns.tolist()[105:]]
columns2 = [a for sublist in columns2 for a in sublist]
datNYT1 = dataNYT[columns1].copy()
datNYT2 = topicsNYTbinary[columns2].copy()
datNYT = pd.concat([datNYT1,datNYT2], axis=1)

In [None]:
### Get dataframe that has labels
labeledNYT = datNYT[pd.notnull(datNYT['label_2'])]
labeledNYT.head()

### Predict labels from topics using SVM

In [None]:
import sklearn
from sklearn import svm, preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, MinMaxScaler
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [None]:
### Get topics
# Columns 20..119 of labeledNYT are taken as the feature matrix.
# `.values` replaces the deprecated `DataFrame.as_matrix()` and yields the same ndarray.
X = labeledNYT.iloc[:,20:120]
X = X.values

In [None]:
### Try different data scalers
# Each transformer is fit on ALL rows of X and the transformed copy is kept under
# its own name so the classifiers can be compared across scalings.
# NOTE(review): a later cell uses rows 10000:11000 of these same arrays as the
# out-of-sample set, so those rows also contributed to the fitted scaling
# statistics. Fit on the in-sample rows only if a strict hold-out is intended.
Xstandard = StandardScaler().fit_transform(X)
Xrobust = RobustScaler().fit_transform(X)
Xnormalize = Normalizer().fit_transform(X)
Xminmax = MinMaxScaler().fit_transform(X)
Xquantile1 = QuantileTransformer(output_distribution='uniform').fit_transform(X)
Xquantile2 = QuantileTransformer(output_distribution='normal').fit_transform(X)
Xpower1 = PowerTransformer(method='yeo-johnson').fit_transform(X)
# NOTE(review): scikit-learn documents Box-Cox as requiring strictly positive
# input — confirm X contains no zeros or negative values.
Xpower2 = PowerTransformer(method='box-cox').fit_transform(X)

In [None]:
print type(X),type(Xstandard),type(Xrobust),type(Xnormalize),type(Xminmax),type(Xquantile1),type(Xquantile2),type(Xpower1),type(Xpower2)

In [None]:
print X.shape, Xstandard.shape, Xrobust.shape, Xnormalize.shape, Xminmax.shape, Xquantile1.shape, Xquantile2.shape, Xpower1.shape, Xpower2.shape 

In [None]:
### Get labels
y = labeledNYT['label_2']

In [None]:
### Transform the text class labels into numerical labels (multiclass classification)
le = LabelEncoder()
y_numeric = le.fit_transform(y)

In [None]:
print type(y_numeric), len(y_numeric)

In [None]:
### Transform the text class labels into numerical labels (multilabel classification)
# MultiLabelBinarizer expects one iterable of labels per sample, so each label
# is wrapped in a length-1 row. Indexing the underlying array (`y.values`)
# avoids multi-dimensional indexing on a Series, which newer pandas rejects.
mlb = MultiLabelBinarizer()
y_indicator = mlb.fit_transform(y.values[:,None])

In [None]:
print type(y_indicator), y_indicator.shape

In [None]:
# le.classes_
mlb.classes_

In [None]:
"""
Inputs for SVM classifications:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovo', degree=3, gamma='scale',
    kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0)
C                           # 1 by default. Penalty parameter of the error term. Decrease if having many noisy observations. Trades off misclassification of training examples against simplicity of the decision surface. Low C makes decision surface smooth. High C aims at classifying all training examples correctly 
cache_size                  # Specify the size of the kernel cache (in MB). 200(MB) by default. Size of kernel cache has strong impact on run times for larger problems. If having enough RAM, set cache_size higher, such as 500(MB) or 1000(MB) 
class_weight                # Give more importance to certain classes. If data for classification are unbalanced (e.g. many positive and few negative), set class_weight='balanced' and/or try different penalty parameters C 
coef0                       # Independent term in kernel function for poly and sigmoid kernels 
decision_function_shape     # {'ovr','ovo'} OneVsRest and OneVsOne. Allows to aggregate results of “one-against-one” classifiers to a decision function of shape (n_samples, n_classes) to provide a consistent interface with other classifiers 
degree                      # Parameter for polynomial kernel 
gamma                       # {'auto','auto_deprecated','scale',float} for rbf, poly, and sigmoid kernels. Defines how much influence a single training example has (=Inverse of radius of influence of samples selected by the model as support vectors). The larger gamma is, the closer other examples must be to be affected. Should not be too large or too small 
kernel                      # {'linear','rbf','poly','sigmoid','precomputed'}
max_iter                    # {-1,some natural number} 
probability                 # Whether to enable probability estimates. This must be enabled prior to calling fit, and will slow down that method. Use a random number generator to shuffle the data for probability estimation (=True). Estimators are not random (=False) 
random_state                # {int,RandomState instance,None}. Control the randomness when probability=True. No effect when probability=False 
shrinking                   # Whether to use the shrinking heuristic 
tol                         # Tolerance for stopping criterion. Try smaller tolerance to get same results for the same input data when set dual=True under LinearSVC 
verbose                     # {0,False}
dual                        # Use random number generator to select features when fitting the model with a dual coordinate descent (=True). This randomness can also be controlled with the random_state parameter 
fit_intercept               # 
intercept_scaling           # 
loss                        # 
multi_class                 # 
penalty                     # 
"""

In [None]:
"""
y_numeric is faster, LinearSVC>SVC(kernel='linear')>SVC(kernel='rbf'), cache_size=10000 is much faster.
'rbf' and y_numeric increase accuracy.
"""

In [None]:
### Check if out-of-sample data has the same topics as in-sample data
print len(set(y_numeric[:10000])), len(set(y_numeric[:11000]))

In [None]:
### Set n_jobs=-1 to use all processors, n_jobs=None to use 1 processor, n_jobs=int to use some processors
InSample1 = [X[:10000],Xstandard[:10000],Xrobust[:10000],Xnormalize[:10000],Xminmax[:10000],Xquantile1[:10000],
           Xquantile2[:10000],Xpower1[:10000],Xpower2[:10000]]
OutSample1 = [X[10000:11000],Xstandard[10000:11000],Xrobust[10000:11000],Xnormalize[10000:11000],Xminmax[10000:11000],
           Xquantile1[10000:11000],Xquantile2[10000:11000],Xpower1[10000:11000],Xpower2[10000:11000]]
InLabels1 = y_numeric[:10000]
OutLabels1 = y_numeric[10000:11000]
clf1 = OneVsRestClassifier(svm.LinearSVC(C=1,dual=False,max_iter=10000),n_jobs=-1)
clf2 = OneVsRestClassifier(svm.LinearSVC(C=10,dual=False,max_iter=10000),n_jobs=-1)
clf3 = OneVsRestClassifier(svm.SVC(C=1,cache_size=10000,gamma='scale',max_iter=-1,tol=0.0001),n_jobs=-1)
clf4 = OneVsRestClassifier(svm.SVC(C=10,cache_size=10000,gamma='scale',max_iter=-1,tol=0.0001),n_jobs=-1)
clfs1 = [clf1,clf2,clf3,clf4]

In [None]:
# Fit every classifier in clfs1 on every scaled variant of the in-sample data and
# record in-sample accuracy, out-of-sample accuracy and wall-clock time.
# Results are appended classifier-major: all samples for clf1, then clf2, ...
count1 = 0
Models1 = []
InSampleAccuracy1 = []
OutSampleAccuracy1 = []
runtime1 = []
for clf in clfs1:
    for indx,sample in enumerate(InSample1):
        count1 += 1
        start_time = time.time()
        # NOTE(review): scikit-learn's fit() conventionally returns the estimator
        # itself, so Models1 would hold repeated references to the same four
        # objects, each left in the state of its last fit — confirm before
        # reusing entries of Models1 for prediction.
        model = clf.fit(sample,InLabels1)
        Models1.append(model)
        accuracy1 = clf.score(sample,InLabels1)
        InSampleAccuracy1.append(accuracy1)
        # OutSample1[indx] is the hold-out slice with the same scaling as `sample`.
        accuracy2 = clf.score(OutSample1[indx],OutLabels1)
        OutSampleAccuracy1.append(accuracy2)
        print 'Model ',count1,': ',model
        print 'In-sample accuracy = ',accuracy1
        print 'Out-of-sample accuracy = ',accuracy2
        # The reported time covers fitting plus both scoring passes (and the prints above).
        run_time = (time.time()-start_time)/60
        runtime1.append(run_time)
        print ("This takes %s minutes to run" %run_time)
        print '--------------------------------------------------'
### X,Xstandard,Xrobust,Xnormalize,Xminmax,Xquantile1,Xquantile2,Xpower1,Xpower2

In [None]:
# Human-readable name of the training sample behind each result row, repeated
# once per classifier (4). The labels now match the actual in-sample slice
# `[:10000]` used to build InSample1; they previously read `[:100000]`.
Samplelist1 = ['X[:10000]','Xstandard[:10000]','Xrobust[:10000]','Xnormalize[:10000]','Xminmax[:10000]',
               'Xquantile1[:10000]','Xquantile2[:10000]','Xpower1[:10000]','Xpower2[:10000]']*4
Samplelist1

In [None]:
SVCtests1 = pd.DataFrame([Models1,InSampleAccuracy1,OutSampleAccuracy1,runtime1,Samplelist1]).T
SVCtests1.columns = ['Model','InSampleAccuracy','OutSampleAccuracy','RunTime','Sample']
SVCtests1

### Neural network, Tensorflow

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
### Get topics
# Columns 20..119 of labeledNYT are taken as the feature matrix.
# `.values` replaces the deprecated `DataFrame.as_matrix()` and yields the same ndarray.
X = labeledNYT.iloc[:,20:120]
# X = labeledNYT.iloc[:,120:220]
X = X.values

In [None]:
print len(X[1]), len(X[2]), X.shape

In [None]:
### Get labels
y = labeledNYT['label_2']

In [None]:
### Encode the labels in y as integer class codes
from sklearn.preprocessing import LabelEncoder
# `le` keeps the fitted encoder so the integer codes can be related back to the original labels
le = LabelEncoder()
y_numeric = le.fit_transform(y)
# Report the type and length of the encoded label array
print type(y_numeric), len(y_numeric)

In [None]:
### Number of distinct encoded labels among the first 14,000 rows and among the first 28,000 rows
### NOTE(review): presumably checks that the first 14,000 rows cover the same classes as the 28,000 rows used below - confirm
print len(set(y_numeric[:14000])), len(set(y_numeric[:28000]))

In [None]:
### Convert data to tensors before feeding into the neural network
# Rows 0-13,999 of X become train_data and rows 14,000-27,999 become test_data, each passed
# through pad_sequences (pad value word_index["<PAD>"], padding appended at the end, length 100).
# NOTE(review): `word_index` is not defined in the visible part of this notebook - confirm it
#   exists before running this cell.
# NOTE(review): X is built from 100 columns (20:120), so maxlen=100 presumably neither pads nor
#   truncates; pad_sequences also returns int32 by default, which would truncate non-integer
#   features - verify this is intended.
# NOTE(review): the validation-split, fit and evaluate cells below index X directly, so
#   train_data / test_data appear to be unused - confirm.
train_data = keras.preprocessing.sequence.pad_sequences(X[:14000],
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=100)
test_data = keras.preprocessing.sequence.pad_sequences(X[14000:28000],
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=100)

In [None]:
### Split the first 14,000 rows into a training part (rows 0-9,999)
### and a validation part (rows 10,000-13,999), for both features and labels
partial_x_train, x_val = X[:10000], X[10000:14000]
partial_y_train, y_val = y_numeric[:10000], y_numeric[10000:14000]

In [None]:
### Size of the Embedding layer's input vocabulary (129,638 entries)
### NOTE(review): the original note described this as the vocabulary count "used for the movie
### reviews" - confirm the value matches the data actually fed to this network.
vocab_size = 129638

### Network: embedding (16 dims) -> global average pooling -> 16-unit ReLU layer -> 1 sigmoid output
model = keras.Sequential([
    keras.layers.Embedding(vocab_size,16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16,activation=tf.nn.relu),
    keras.layers.Dense(1,activation=tf.nn.sigmoid),
])

model.summary()

In [None]:
### Compile model
# Adam optimizer with default settings, binary cross-entropy loss, accuracy tracked as a metric.
# NOTE(review): binary_crossentropy treats the task as two-class, while the targets come from a
#   LabelEncoder over 'label_2' - confirm that column has exactly two distinct values.
# NOTE(review): tf.train.AdamOptimizer is the TensorFlow 1.x optimizer API - confirm the installed version.
model.compile(optimizer=tf.train.AdamOptimizer(),loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
### Train for 40 epochs in mini-batches of 512, reporting metrics on the
### held-out (x_val, y_val) split; the returned History object is kept for plotting
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=40,
    batch_size=512,
    validation_data=(x_val,y_val),
    verbose=1
)

In [None]:
### Evaluate the model
# Scores the trained model on rows 14,000-27,999 of X against the matching encoded labels.
# `results` holds whatever model.evaluate returns (presumably [loss, accuracy] given the compiled metric - confirm).
results = model.evaluate(X[14000:28000],y_numeric[14000:28000])
print results

In [None]:
### Metrics recorded by model.fit, keyed by metric name; display the available keys
history_dict = history.history
history_dict.keys()

In [None]:
### Pull the recorded training / validation metrics out of the fit history
acc, val_acc, loss, val_loss = (
    history.history[metric_name]
    for metric_name in ('acc', 'val_acc', 'loss', 'val_loss')
)

### One x-axis tick per recorded epoch, numbered from 1
epochs = range(1, 1 + len(acc))

### Loss curves: training as blue dots ('bo'), validation as a solid blue line ('b')
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
### Accuracy curves: training as blue dots ('bo'), validation as a solid blue line ('b')
plt.clf()     # Clear figure
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']
# Compute the epoch axis here and plot this cell's own variables, rather than relying on
# `epochs`, `acc` and `val_acc` left over from the loss-plot cell.
epochs = range(1, len(acc_values) + 1)

plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()