Produces tables with bootstrap percentages and EN coefficients in them.

In [1]:
# Initial imports

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn import preprocessing
from matplotlib import pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so try the historical name first and
# fall back to the renamed one (style.use raises OSError for unknown styles).
try:
    plt.style.use('seaborn-darkgrid')
except OSError:
    plt.style.use('seaborn-v0_8-darkgrid')
palette = plt.get_cmap('Dark2')
import os

In [90]:
def AbsoluteBootstrapTable(perc, topicdesc, topicno = 15, entirebool=False):
    '''Creates a table of the topics selected most often in the bootstrap
    procedure, together with their word stems and the percentage of times
    they were chosen. Does not consider the sign of the coefficients, only
    that they were non-zero.
    
    Inputs:
        perc: numpy array of percentages of times coefficients were chosen,
              one value per column of the assembled topic table (changes
              first, then levels; repeated per medium when entirebool is
              set). Flattened before use, so a (1, n) array also works.
        topicdesc: pandas DataFrame object describing the topics in usual form
                   (row 0 holds the titles, every second column holds word
                   probabilities). NB when importing from csv add
                   .transpose() to end. The caller's object is not modified.
        topicno: integer determining how many topics do you want the DF to have
        entirebool: set to true if doing analysis on all 180 topics together;
                    the topic columns are then repeated with the suffixes
                    _ir, _qa and _stat
    Outputs:
        Pandas Dataframe object indexed by 'Topic' with the columns
        'Bootstrap %' and 'Stems', sorted by descending percentage and cut
        to the first topicno rows
    '''
    # Work on a copy: assigning to .columns would otherwise relabel the
    # DataFrame owned by the caller.
    topicdesc = topicdesc.copy()
    topicdesc.columns = topicdesc.iloc[0] # title row becomes the column labels
    topicdesc = topicdesc.reindex(topicdesc.index.drop(0)) # ...and leaves the body
    topicdesc_level = topicdesc.iloc[:, ::2].copy() # get rid of probabilities of words (they are in order)
    topicdesc_change = topicdesc_level.add_suffix('_chg') # same stems, labelled as changes

    if entirebool:
        # One (change, level) pair of blocks per medium, in the order ir, qa, stat
        blocks = []
        for medium in ('_ir', '_qa', '_stat'):
            blocks.append(topicdesc_change.add_suffix(medium))
            blocks.append(topicdesc_level.add_suffix(medium))
    else:
        blocks = [topicdesc_change, topicdesc_level]
    topicdf = pd.concat(blocks, axis=1) # concatenate the dataframes side by side

    topicdf.loc[0, :] = np.asarray(perc).ravel() # put percentages in row with row index 0
    topicdf = topicdf.sort_index() # sort so that percentages are at the top
    topicdf.rename(index={0:'Percs'}, inplace=True)
    
    topicdf.sort_values(by='Percs', axis=1, ascending=False, inplace=True) # sort the table so the most selected ones are at the front
    
    dftoexport = topicdf.T # transpose: one row per topic variable
    stemrows = list(range(1, 10)) # rows labelled 1..9 hold the first nine stems
    dftoexport['Stems'] = dftoexport[stemrows].apply(lambda x: '. '.join(x), axis=1) # aggregate up
    dftoexport = dftoexport.loc[:,['Percs', 'Stems']].copy()
    dftoexport.columns = ['Bootstrap %', 'Stems']
    dftoexport = dftoexport.iloc[0:topicno, :].copy()
    dftoexport.rename_axis('Topic', inplace=True)
    
    return dftoexport

In [91]:
def ENTable(ENcoeffs, topicdesc, entirebool=False):
    '''Creates a table that returns the non-zero coefficient values in an elastic
    net regression and the topic descriptions
    
    Inputs:
        ENcoeffs: numpy array of the elastic net coefficients, one value per
                  column of the assembled topic table (changes first, then
                  levels; repeated per medium when entirebool is set).
                  Flattened before use, so a (1, n) array also works.
        topicdesc: pandas DataFrame object describing the topics in usual form
                   (row 0 holds the titles, every second column holds word
                   probabilities). NB when importing from csv add
                   .transpose() to end. The caller's object is not modified.
        entirebool: set to true if doing analysis on all mediums together;
                    the topic columns are then repeated with the suffixes
                    _ir, _qa and _stat
    Outputs:
        Pandas dataframe object indexed by 'Topic' with the columns
        'Coefficient' (rounded to 2 dp) and 'Stems', sorted by descending
        coefficient; topics whose coefficient is exactly zero are left out
    '''
    # Work on a copy: assigning to .columns would otherwise relabel the
    # DataFrame owned by the caller.
    topicdesc = topicdesc.copy()
    topicdesc.columns = topicdesc.iloc[0] # title row becomes the column labels
    topicdesc = topicdesc.reindex(topicdesc.index.drop(0)) # ...and leaves the body
    topicdesc_level = topicdesc.iloc[:, ::2].copy() # get rid of probabilities of words (they are in order)
    topicdesc_change = topicdesc_level.add_suffix('_chg') # same stems, labelled as changes

    if entirebool:
        # One (change, level) pair of blocks per medium, in the order ir, qa, stat
        blocks = []
        for medium in ('_ir', '_qa', '_stat'):
            blocks.append(topicdesc_change.add_suffix(medium))
            blocks.append(topicdesc_level.add_suffix(medium))
    else:
        blocks = [topicdesc_change, topicdesc_level]
    ENtopicdf = pd.concat(blocks, axis=1) # concatenate the dataframes side by side
    
    ENtopicdf.loc[0, :] = np.asarray(ENcoeffs).ravel() # coefficients go in a new row labelled 0
    ENtopicdf.rename(index={0:'Coeffs'}, inplace=True)
    ENsortedtopicdf = ENtopicdf.sort_values(by='Coeffs', axis=1, ascending=False)
    # keep only the topics the elastic net actually selected
    ENsortedtopicdf = ENsortedtopicdf.loc[:, (ENsortedtopicdf.loc['Coeffs', :] != 0)]
    
    ENdftoexport = ENsortedtopicdf.T # transpose: one row per topic variable
    stemrows = list(range(1, 10)) # rows labelled 1..9 hold the first nine stems
    ENdftoexport['Stems'] = ENdftoexport[stemrows].apply(lambda x: '. '.join(x), axis=1) # aggregate up
    ENdftoexport = ENdftoexport.loc[:,['Coeffs', 'Stems']].copy()
    ENdftoexport.columns = ['Coefficient', 'Stems']
    ENdftoexport.rename_axis('Topic', inplace=True)
    ENdftoexport['Coefficient'] = ENdftoexport['Coefficient'].apply(lambda x: round(x, 2)) #round to 2 dp
    
    return ENdftoexport

In [82]:
def AbsoluteBootstrapTableENAug(perc, topicdesc, ENcoeffs, topicno = 15, entirebool=False):
    '''
    Bootstrap table augmented with the elastic net coefficient of each
    topic, so the direction of the effect can be read off next to the
    selection percentage.

    Inputs:
        perc, topicdesc, topicno, entirebool: passed on to AbsoluteBootstrapTable
        ENcoeffs: passed on to ENTable together with topicdesc and entirebool
    Outputs:
        Pandas DataFrame: the bootstrap table with the 'Coefficient' column
        of the EN table placed alongside it, cut to the first topicno rows
    '''
    bootstrap_part = AbsoluteBootstrapTable(perc, topicdesc, topicno, entirebool=entirebool)
    coefficient_part = ENTable(ENcoeffs, topicdesc, entirebool=entirebool)['Coefficient']

    # Align the two pieces on the topic index without re-sorting it, then
    # trim back to the requested number of rows.
    merged = pd.concat([bootstrap_part, coefficient_part], axis=1, sort=False)
    return merged.iloc[0:topicno, :]

In [83]:
def TableLatex(table, name, path):
    '''Convert pandas dataframe to latex and save it as <name>.tex in path
    
    The working directory of the process and the pandas display options
    are left as they were found.
    
    Inputs:
        table: pandas dataframe object you want to save as a latex table
        name: Name of table which it will be saved as (".tex" is appended)
        path: Directory where you want to save the latex table
    Outputs:
        None.
    '''
    # Build the target path instead of os.chdir(path): changing directory
    # leaks into the rest of the session, and with a relative path a second
    # call would descend one level further each time.
    outfile = os.path.join(path, str(name) + '.tex')
    
    # Lift the column width cap only while rendering, so long stem strings
    # are not truncated, then restore whatever the user had configured.
    previous_width = pd.get_option('display.max_colwidth')
    try:
        try:
            pd.set_option('display.max_colwidth', None) # current spelling of "no limit"
        except ValueError:
            pd.set_option('display.max_colwidth', -1) # pandas releases that only accept integers
        latex = table.to_latex(column_format = '|l|c|l|c|',
                               bold_rows=True)
    finally:
        pd.set_option('display.max_colwidth', previous_width)
    
    with open(outfile, 'w') as tf: # write it to a latex file
        tf.write(latex)
    
    return None

In [84]:
# We want these tables for all three mediums (ir, qa, stat) and two moments (stdev and skew); kurtosis looks unreliable, so it is dropped.

In [85]:
# Import the data

# Every input lives under the same output tree; name the shared prefixes once
# so a move of the project only needs a change here.
OUTPUT_ROOT = 'C:/Users/Tim/Documents/Nuffield/MphilThesis/Output/'
BOOTSTRAP_DIR = OUTPUT_ROOT + '4 - VariableSelection/2 - Bootstrap/Together/k_30/QuerySeparate/'
ELASTICNET_DIR = OUTPUT_ROOT + '4 - VariableSelection/1 - Permutation/Together/k_30/QuerySeparate/'

# Topic description (transposed so that each topic is a column)
mytopicdesc = pd.read_csv(OUTPUT_ROOT + '1 - LDA/Together/k_30/alltext_topic_desc.csv', header=None).transpose()

# Bootstrap data
stdevirpercs = np.load(BOOTSTRAP_DIR + 'StDevresid_irpercs.npy')
stdevqapercs = np.load(BOOTSTRAP_DIR + 'StDevresid_qapercs.npy')
stdevstatpercs = np.load(BOOTSTRAP_DIR + 'StDevresid_statpercs.npy')

skewirpercs = np.load(BOOTSTRAP_DIR + 'Skewresid_irpercs.npy')
skewqapercs = np.load(BOOTSTRAP_DIR + 'Skewresid_qapercs.npy')
skewstatpercs = np.load(BOOTSTRAP_DIR + 'Skewresid_statpercs.npy')

stdevpercs_entire = np.load(BOOTSTRAP_DIR + 'Entire/stdev_entirepercs.npy')
skewpercs_entire = np.load(BOOTSTRAP_DIR + 'Entire/skew_entirepercs.npy')

# Elastic Net coefficients
stdevircoeffs = np.load(ELASTICNET_DIR + 'stdev_ircoeffs.npy')
stdevqacoeffs = np.load(ELASTICNET_DIR + 'stdev_qacoeffs.npy')
stdevstatcoeffs = np.load(ELASTICNET_DIR + 'stdev_statcoeffs.npy')

skewircoeffs = np.load(ELASTICNET_DIR + 'skew_ircoeffs.npy')
skewqacoeffs = np.load(ELASTICNET_DIR + 'skew_qacoeffs.npy')
skewstatcoeffs = np.load(ELASTICNET_DIR + 'skew_statcoeffs.npy')

stdevcoeffs_entire = np.load(ELASTICNET_DIR + 'Entire/stdev_entirecoeffs.npy')
skewcoeffs_entire = np.load(ELASTICNET_DIR + 'Entire/skew_entirecoeffs.npy')


In [86]:
# Directory the LaTeX tables are written to
mypath = 'C:/Users/Tim/Documents/Nuffield/MphilThesis/DataVis/BootstrapTables/Figs/'

# Both lookups share the same moment + medium keys, so they can be walked
# in parallel when producing the per-medium tables.
tablekeys = ['stdevir', 'stdevqa', 'stdevstat', 'skewir', 'skewqa', 'skewstat']

# Bootstrap selection percentages per key
BootstrapDict = dict(zip(tablekeys, [stdevirpercs, stdevqapercs, stdevstatpercs,
                                     skewirpercs, skewqapercs, skewstatpercs]))
# Elastic net coefficients per key
ENDict = dict(zip(tablekeys, [stdevircoeffs, stdevqacoeffs, stdevstatcoeffs,
                              skewircoeffs, skewqacoeffs, skewstatcoeffs]))

In [75]:
#for key in BootstrapDict.keys():
#    TableLatex(AbsoluteBootstrapTable(BootstrapDict[key], mytopicdesc), (key+str('boot')), mypath)

In [76]:
#for key in ENDict.keys():
#    TableLatex(ENTable(ENDict[key], mytopicdesc), (key+str('EN')), mypath)

In [77]:
#for key in ENDict.keys():
#    TableLatex(AbsoluteBootstrapTableENAug(BootstrapDict[key], mytopicdesc, ENDict[key]), (key+str('BandEN')), mypath)

In [94]:
# Export the augmented bootstrap table for each moment, built on all mediums together
for tablename, entirepercs, entirecoeffs in [('stdeventire', stdevpercs_entire, stdevcoeffs_entire),
                                             ('skewentire', skewpercs_entire, skewcoeffs_entire)]:
    entiretable = AbsoluteBootstrapTableENAug(entirepercs, mytopicdesc, entirecoeffs, entirebool=True)
    TableLatex(entiretable, tablename, mypath)
