In [None]:
import pandas as pd
import pickle
import re
import os
import datetime

In [None]:
## folder holding the pickled files of parsed website text (input)
path = '/path/to/parsed/text'

In [None]:
## folder that receives the exported document matrices (output)
pathout = '/path/to/output/folder'

## Define functions for removing duplicate texts and creating the document matrix

In [None]:
def remove_duplicates(cell):
    """Drop repeated text segments from one cell of the document matrix.

    Parameters
    ----------
    cell : str or missing value
        Texts joined with the '!!!!' separator.

    Returns
    -------
    str or None
        The distinct segments joined by a single space, in order of first
        appearance; None when the cell is missing (NaN/None).
    """
    if pd.notna(cell):
        ## dict.fromkeys removes duplicates while keeping first-seen order;
        ## a set would return the segments in an arbitrary order that can
        ## change from one interpreter run to the next
        return ' '.join(dict.fromkeys(cell.split('!!!!')))
    return None

In [None]:
## define a function that combines the texts stored per month
## into one text per year

In [None]:
def last_10_years(df,years=None):
    """Combine the monthly texts of one website into one text per year.

    Parameters
    ----------
    df : pandas.Series
        Texts of a single website. The index labels must be strings that
        start with the four-digit year (presumably 'YYYYMM' month keys --
        only the year prefix is inspected here). The yearly entries are
        added to this Series in place.
    years : list of str, optional
        List that receives the labels of the years for which text was
        found; a new list is used when omitted.

    Returns
    -------
    pandas.Series
        The part of ``df`` from the first label in ``years`` to the end,
        i.e. the yearly entries, each holding the texts of that year
        joined with '!!!!'.

    Raises
    ------
    IndexError
        If no label belongs to any year from 2010 to 2020.
    """
    if years is None:
        years=[]

    ## read the labels once up front, so the yearly labels added below are
    ## never mistaken for monthly labels
    labels=df.index.tolist()

    ## select the data for the years from 2010 up to 2020
    for year in map(str,range(2010,2021)):
        ## match on the complete year prefix; building the pattern from the
        ## loop counter produced '^20110+' for the last year, which can
        ## never match a label starting with 2020
        r=re.compile('^'+year)
        months_in_year=list(filter(r.match,labels)) ## find the months that are represented in each year

        if len(months_in_year) == 0:
            continue

        ## isolate and combine the text from each year
        try:
            ## select exactly the matched months: an open-ended slice would
            ## also pull in later months and the yearly entries added so far
            ## separate the text with !!!! for later removal of duplicate text
            text='!!!!'.join(df.loc[months_in_year])
        except TypeError as e:
            ## non-text values cannot be joined; skip this year only
            print('error: could not combine the text for %s (%s)' % (year, e))
            continue

        df[year]=text
        ## record the year only once its entry really exists in df
        years.append(year)

    if len(years) == 0:
        raise IndexError('no data found for the years 2010 to 2020')
    return df.loc[years[0]:]

## Apply functions to files containing the parsed text

In [None]:
## list the input files; sub-folders (e.g. a .ipynb_checkpoints folder) are
## left out because the processing loop opens every entry as a pickle file
files=[entry for entry in os.listdir(path) if os.path.isfile(os.path.join(path, entry))]

In [None]:
## sort the file names so that the processing order, and with it the
## numbering of the exported document matrices, is the same on every run
## (os.listdir returns the entries in arbitrary order)
files.sort()

In [None]:
cnt = 0  ## remains 0 when there are no files; otherwise set by the loop below
key_dict=dict()
for cnt, file in enumerate(files, start=1):
    fname = file.split('_')[-1].split('.')[0]
    print('*'*10)
    print('§',cnt,'§',fname, datetime.datetime.now())
    print('*'*10, '\n')
    
    time_s = datetime.datetime.now()
    
    ## load the file containing the parsed text from the websites 
    ## (each file containing approx. data for 10 websites)
    ## NOTE: pickle.load can execute arbitrary code, so only files from a
    ## trusted source should be placed in the input folder
    fpath = os.path.join(path, file)

    with open(fpath, 'rb') as f:
        df = pickle.load(f)

    ## drop the 'Index' level; the labels that remain are recorded as the
    ## websites contained in this file
    df = df.reset_index(level='Index', drop=True)
    comps=df.index.tolist()
    
    ## convert data to document matrix where data is arranged according to website and the year that it represents
    ## this is performed one website at a time, where the datatype for each website is a pandas Series
    ## and each time a new dataframe is created for each website
    dfs = []
    for ind in df.index:
        
        dft = df.loc[ind].dropna()
        
        ## best effort: a website that cannot be converted is reported and skipped
        try:
            df_t = last_10_years(dft)
            dfs.append(df_t)
        except Exception as e:
            print('Error: %s' % str(e))

    ## pd.concat raises a ValueError on an empty list, which would abort the
    ## whole run; skip a file for which no website could be converted
    if len(dfs) == 0:
        print('No usable data in %s, skipping \n' % file)
        continue
            
    ## combine the dataframes into one larger dataframe and remove duplicate texts in each cell
    df_years = pd.concat(dfs, axis=1, sort=True).T
    ## DataFrame.applymap is deprecated in newer pandas in favour of
    ## DataFrame.map; use whichever the installed version provides
    if hasattr(pd.DataFrame, 'map'):
        df_export = df_years.map(remove_duplicates)
    else:
        df_export = df_years.applymap(remove_duplicates)
    
    ## export document matrix
    fout = 'dfy%s.p' % str(cnt)
    fpathout = os.path.join(pathout, fout)
    with open(fpathout, 'wb') as f:
        pickle.dump(df_export, f)
    
    ## update a dictionary with a record of websites and document matrices
    key_dict[fout] = comps

    time_e = datetime.datetime.now()
    tt = time_e-time_s
    print('Time taken was %s \n' % str(tt))

In [None]:
## store the record of which websites went into which document matrix file
dict_path = '/path/to/store/document/reference/dictionary.p'
with open(dict_path, 'wb') as f:
    pickle.dump(key_dict, f)