In [None]:
import pandas as pd
import os
import pickle
import datetime
import telegram_send
from multiprocessing import Pool, Lock

In [None]:
## for multicore machines, define how many cores should be used

In [None]:
nc = 7  # number of worker processes passed to multiprocessing.Pool in main()

In [None]:
## State if bigrams or trigrams are to be examined

In [None]:
ng = 3  # n-gram length compared in topic_distribution (3 = trigrams)

## Load the stemmed business model thesaurus and the trigrams

In [None]:
## create a python dictionary containing the sets and descriptors for each set
## in the business model thesaurus

In [None]:
path2bmt='/path/to/stemmed/business/model/thesaurus'

In [None]:
df=pd.read_excel(path2bmt,index_col=None)

In [None]:
# Map every thesaurus column (one set) to the list of its non-missing descriptors.
sets = {set_name: df[set_name].dropna().tolist() for set_name in df}

In [None]:
## create a list of the filepaths of the files containing the
## text data scraped from the wayback machine

In [None]:
path2wbm='/path/to/waybackmachine/trigrams'

In [None]:
# Full paths of all entries in path2wbm whose name starts with 'df'.
dflist = [
    os.path.join(path2wbm, entry)
    for entry in os.listdir(path2wbm)
    if entry.startswith('df')
]

## Define the function for tagging the terms in the text that fit the descriptors 

In [None]:
## here the function is defined that checks if the descriptors 
## appear in the text from the websites and then tags them if appropriate

In [None]:
def topic_distribution(cell):
    """Count, per thesaurus set, the n-grams in ``cell`` that match a descriptor.

    ``cell`` is split on commas into terms and each term is split on single
    spaces into words. A term matches a descriptor when both consist of
    exactly ``ng`` words and every descriptor word is a substring of the term
    word at the same position. Terms and descriptors of any other length are
    ignored.

    Reads the module-level ``sets`` (set name -> list of descriptor strings)
    and ``ng`` (n-gram length).

    Parameters
    ----------
    cell : str or missing value
        Comma-separated n-grams taken from one cell of the website dataframe.

    Returns
    -------
    list of int or None
        One match count per entry of ``sets``, in the iteration order of
        ``sets``; ``None`` when ``cell`` is missing.
    """
    if pd.isna(cell):
        return None

    # Tokenise the cell once up front; it does not change between descriptors.
    terms = [
        words
        for words in (term.strip().split(' ') for term in cell.split(','))
        if len(words) == ng
    ]

    dis = []
    for top in sets:  ## loop through the sets in the thesaurus
        c = 0
        for val in sets[top]:  # loop through descriptors
            vall = val.strip().split(' ')

            if len(vall) != ng:  # ensure only required n-grams are considered
                continue

            # Tag a term if every descriptor word is contained in the term
            # word at the same position. Comparing all ng positions (rather
            # than fixed indices 0..2) makes bigrams and longer n-grams work.
            c += sum(
                1
                for words in terms
                if all(part in word for part, word in zip(vall, words))
            )

        dis.append(c)

    return dis

## Prepare a new dataframe to contain the results of the categorisation

In [None]:
# Columns of the result table: one per thesaurus set, followed by 'year' and
# 'website'. Built in a single expression so that re-running the cell cannot
# append 'year' or 'website' a second time.
cols = [*df.columns.tolist(), 'year', 'website']

In [None]:
new_df=pd.DataFrame(columns=cols)

In [None]:
# prepare a list of the files that have been already processed

In [None]:
already_processed=[]

In [None]:
# Write the (empty) list to disk so that top_dist can load and extend it.
with open('already_processed.p', 'wb') as handle:
    pickle.dump(already_processed, handle)

## Apply functions to all files

In [None]:
## Here the function is defined that is applied to each file containing the text from the websites.
## The files are processed in parallel by `nc` worker processes using the multiprocessing package.

In [None]:
def top_dist(file):
    """Tag one pickled website-text dataframe and merge the counts into test.csv.

    Steps:
      1. Unpickle the dataframe stored at ``file``.
      2. Apply ``topic_distribution`` to every cell.
      3. Turn every non-missing result into one row holding the per-set
         counts plus the column label ('year') and the index label ('website').
      4. Under the shared ``lock``, add those rows to ``test.csv`` and add the
         file's base name to the list pickled in ``already_processed.p``.

    Reads the module-level ``cols`` and ``lock`` (``lock`` is set by ``init``).
    Both output files are expected to exist in the working directory.

    Parameters
    ----------
    file : str
        Path of a pickle file containing a pandas DataFrame.

    Returns
    -------
    None
    """
    st = datetime.datetime.now()
    fname = os.path.basename(file)
    print(fname, st)

    # load the file with the website text into a dataframe
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only point this at files you produced yourself.
    with open(file, 'rb') as fh:
        df1 = pickle.load(fh)

    # Apply the tagging function to every cell. DataFrame.applymap was renamed
    # to DataFrame.map in pandas 2.1, so use whichever this version provides.
    if hasattr(pd.DataFrame, 'map'):
        elementwise = pd.DataFrame.map
    else:
        elementwise = pd.DataFrame.applymap
    dft = elementwise(df1, topic_distribution)

    # Collect plain dicts and build the frame once; DataFrame.append no longer
    # exists in pandas 2.x and growing a frame row by row is quadratic.
    records = []
    for website in dft.index:
        for col in dft.loc[website].dropna().index:
            record = dict(zip(cols, dft.loc[website, col]))
            record['year'] = col
            record['website'] = website
            records.append(record)
    new_df = pd.DataFrame(records, columns=cols)

    # update the temporary dataframe file with the results
    # `with lock` releases the lock even if reading or writing fails, so a
    # single failing worker cannot block all the others.
    with lock:
        results = pd.read_csv('test.csv')
        # Empty frames contribute no rows, so leave them out of the concat.
        frames = [frame for frame in (results, new_df) if not frame.empty]
        merged = pd.concat(frames, ignore_index=True) if frames else new_df
        merged.to_csv('test.csv', index=False)

    # update the list of files that have been already processed
    with lock:
        with open('already_processed.p', 'rb') as f:
            ap = pickle.load(f)
        ap.append(fname)
        print(f'{len(ap)} files processed so far')

        with open('already_processed.p', 'wb') as f:
            pickle.dump(ap, f)

    endtime = datetime.datetime.now()
    tt = endtime - st
    print(f'Time taken for {fname} was {tt}')

In [None]:
## here the functions are defined for assigning the above function
## to individual cores

In [None]:
def init(l) -> None:
    """Store ``l`` in the module-level name ``lock``.

    Used as the ``initializer`` of the Pool created in ``main`` (with
    ``initargs=(l,)``), so that ``top_dist`` can reach the shared lock through
    the global ``lock`` when it updates test.csv and already_processed.p.
    """
    global lock
    lock = l

In [None]:
def main():
    """Run ``top_dist`` over every path in ``dflist`` with ``nc`` worker processes.

    A single ``Lock`` is created and handed to each worker through ``init``.
    The call blocks until all files have been processed. If mapping fails or
    is interrupted, the workers are terminated before the exception is
    re-raised, so no pool processes are left behind.
    """
    l = Lock()
    pool = Pool(initializer=init, initargs=(l,), processes=nc)
    try:
        pool.map(top_dist, dflist)
    except BaseException:
        # Stop outstanding work immediately; join() below then reaps the workers.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # join() is only valid after close() or terminate(); both paths above
        # guarantee that.
        pool.join()

In [None]:
## export empty dataframe that will be used to store the results
new_df.to_csv('test.csv',index=False)

In [None]:
## apply functions to the files
# Timestamps are taken immediately before and after the parallel run; their
# difference (ttt) is printed here and reused in the Telegram notification
# at the end of the notebook.
starttime=datetime.datetime.now()
main()
endtime=datetime.datetime.now()

ttt=endtime-starttime
print(f'Total time taken was {ttt}')

## Export the results

In [None]:
df=pd.read_csv('test.csv')

In [None]:
nw=datetime.datetime.now().strftime('%d_%m_%y@%H:%M')

In [None]:
# The n-gram size in the file name follows ng instead of a hard-coded "3g",
# so a bigram run is not saved under a trigram name.
# NOTE(review): the output directory is an absolute, machine-specific path.
df.to_excel(f'/home/ensys/anaconda3/Masterarbeit/Processed Data/top_dist_{ng}g_{nw}.xlsx')

In [None]:
## send notification of successful completion to telegram account

In [None]:
# Name the n-gram size in the message from ng instead of hard-coding "trigram".
ngram_label = {2: 'bigram', 3: 'trigram'}.get(ng, f'{ng}-gram')
telegram_send.send(messages=[f'Finished {ngram_label} categorisation\nFilename={nw} \nTotal time taken was {ttt}'])