# In [None]:  (notebook cell marker left over from the export)
###################################################  Importing the required packages  ########################################

import numpy as np
import pandas as pd
import re
import operator
import sys

#################################################### Utility Functions  ####################################################

def get_data(filepath):
    """Load the tab-separated clickstream file and keep only funnel-page rows.

    Parameters
    ----------
    filepath : str
        Path of a tab-separated file, read with pandas' default header
        handling and every column loaded as ``object``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Session_ID``, ``Timestamp``, ``Links`` and ``Pages``, taken
        from column positions 0, 2, 3 and 7 of the file (HARDCODED). Only rows
        whose page -- the last '/'-separated segment of the page value -- is
        'select', 'checkout' or 'confirmation' are kept. Empty cells are
        replaced by the string 'Missing'; timestamps that cannot be parsed
        become NaT.
    """
    df = pd.read_table(filepath, sep='\t', dtype=object)
    df = df.fillna('Missing')

    # Positional selection: .iloc replaces the removed DataFrame.ix indexer.
    # .copy() yields an independent frame so the column assignments below
    # never write into a view of the raw data.
    df = df.iloc[:, [0, 2, 3, 7]].copy()
    df.columns = ['Session_ID', 'Timestamp', 'Links', 'Pages']

    # Reduce the page value to its last path segment, e.g. '/shop/select' -> 'select'.
    df['Pages'] = df['Pages'].apply(str).apply(lambda r: r.split('/')[-1])
    df = df[df['Pages'].isin(['select', 'checkout', 'confirmation'])].copy()

    # Parsed value by value so one malformed timestamp only turns that row into NaT.
    df['Timestamp'] = df['Timestamp'].apply(lambda r: pd.to_datetime(r, errors='coerce'))
    df['Links'] = df['Links'].apply(str)
    return df

def clean_data(aString):
    """Normalise a '|'-separated sequence string.

    Removes every '|Missing' placeholder, strips bracketed alphanumeric tags
    such as '[novconvalueinhbx]', and collapses runs of '|' into a single '|'.

    Parameters
    ----------
    aString : str
        Raw '|'-separated sequence.

    Returns
    -------
    str
        The cleaned sequence.
    """
    aString = aString.replace('|Missing', '')
    # Escaped raw-string patterns: the former '[[]...[]]' spelling matches the
    # same text but raises a "possible nested set" FutureWarning on Python 3.7+.
    aString = re.sub(r'\[[A-Za-z0-9]*\]', '', aString)  # removes things like [novconvalueinhbx]
    aString = re.sub(r'\|+', '|', aString)
    return aString

def pageExit(col):
    """Return the text after the last '|' in col (all of col when it has no '|')."""
    _head, _sep, last_item = col.rpartition('|')
    return last_item

def linkExit(row):
    """Return the last '|'-separated link recorded for the row's exit page.

    Looks up row['checkout'] or row['select'] according to row['Exit_Page'];
    any other exit page yields 'Not Available'.
    """
    exit_page = row['Exit_Page']
    for page in ('checkout', 'select'):
        if exit_page == page:
            return row[page].split('|')[-1]
    return 'Not Available'
    
def trigram(row):
    """Return up to three links that precede the final link on the exit page.

    The link trail is read from row['checkout'] or row['select'] according to
    row['Exit_Page']; the last entry is excluded and the (at most) three
    entries before it are re-joined with '|'. Any other exit page yields
    'Not Available'.
    """
    exit_page = row['Exit_Page']
    if exit_page == 'checkout':
        trail = row['checkout']
    elif exit_page == 'select':
        trail = row['select']
    else:
        return 'Not Available'

    clicks = trail.split('|')
    return '|'.join(clicks[-4:-1])

def my_tokenizer(s):
    """Lower-case s, split it on '|' and keep the pieces longer than three characters."""
    return [piece for piece in s.lower().split('|') if len(piece) >= 4]

########################################################### Algorithm for generating the sequence ##########################################

def page_sequence(filepath='/usr/hdp/datasets/CMB_Digital_Analytics/INPUT/cja-ds-prospects.tsv'):
    """Build, per session, the page sequence and the links clicked on each page.

    Parameters
    ----------
    filepath : str, optional
        Tab-separated input file handed to get_data(). Defaults to the
        production extract path.

    Returns
    -------
    dict
        Maps each session id to a dict holding:
        'PageSequence' -- '|'-joined pages, one entry per page change;
        'OrderPlaced'  -- 1 if 'confirmation' is in the page sequence, else 0;
        one key per visited page -- the cleaned link trail for that page, each
        link being the last '|'-segment of the row's Links value, prefixed by '|'.
    """
    df = get_data(filepath)
    ses_dict = {}  # a dictionary with key as session_id

    # One pass over the groups instead of recomputing unique() and re-filtering
    # the whole frame for every session. sort=False keeps the sessions in order
    # of first appearance, which is the order unique() produced.
    for act_ses_id, df_act_ses in df.groupby('Session_ID', sort=False):
        df_act_ses = df_act_ses.sort_values('Timestamp', ascending=True)
        pages = df_act_ses['Pages'].values
        links = df_act_ses['Links'].values
        last_idx = len(pages) - 1

        page_collection = pages[0]  # by default starting with the first page
        link_collection = ''  # links gathered since the last write to page_dict
        page_dict = {}  # a dictionary with key as page
        link_counter = 0  # > 0 means the gathered links must be written to page_dict

        for cntr, this_page in enumerate(pages):
            if cntr == last_idx:
                # Last record: there is no following row, so compare the page
                # with itself and force a final write.
                next_page = this_page
                link_counter += 1
            else:
                next_page = pages[cntr + 1]

            # The row's link is collected whether or not the page changes.
            link_collection = link_collection + '|' + links[cntr].split('|')[-1]
            if this_page != next_page:  # the page type changes in the sequence
                page_collection = page_collection + '|' + next_page
                link_counter += 1

            # Writing the page_dict
            if link_counter > 0:
                link_collection = clean_data(link_collection)
                page_collection = clean_data(page_collection)
                page_dict['PageSequence'] = page_collection
                if 'confirmation' in page_collection.split('|'):
                    page_dict['OrderPlaced'] = 1
                else:
                    page_dict['OrderPlaced'] = 0

                if this_page not in page_dict:
                    page_dict[this_page] = link_collection
                    link_collection = ''
                    # NOTE(review): link_counter is deliberately left untouched
                    # here, as in the original logic; it is only reset once
                    # links are appended to an existing entry.
                else:
                    page_dict[this_page] = page_dict[this_page] + link_collection
                    link_collection = ''
                    link_counter = 0  # resetting the link counter

        ses_dict[act_ses_id] = page_dict  # a dictionary within a dictionary
    return ses_dict
	
########################################################### Reporting the Sequence Stats  #####################################################

def write_data():
    """Return the page_sequence() result as a DataFrame indexed by Session_ID.

    Each session becomes one row and each key of its inner dict becomes a
    column. Despite the name, nothing is written to disk here.
    """
    session_pages = page_sequence()
    frame = pd.DataFrame.from_dict(dict(session_pages), orient='index')
    frame.index.name = 'Session_ID'
    return frame

def exitStats():
    """Return the sessions without an order, annotated with exit information.

    Takes the frame from write_data(), keeps the rows whose 'OrderPlaced' is 0
    and adds three columns:
    'Exit_Page'    -- last page of 'PageSequence' (pageExit);
    'Exit_Link'    -- last link clicked on the exit page (linkExit);
    'Exit_Trigram' -- the links preceding that last link (trigram).
    """
    full_df = write_data()
    # .copy() makes the filtered rows an independent frame, so the columns
    # added below are not assigned onto a slice of full_df.
    NO_df = full_df[full_df['OrderPlaced'] == 0].copy()
    NO_df['Exit_Page'] = NO_df['PageSequence'].apply(pageExit)
    NO_df['Exit_Link'] = NO_df.apply(linkExit, axis=1)  # axis = 1 is by row
    NO_df['Exit_Trigram'] = NO_df.apply(trigram, axis=1)  # axis = 1 is by row
    return NO_df

# Writing the sequence-
# NO_orders_df = exitStats()

############################################################# The main function #############################################################

def _report_exit_page(NO_orders_df, u_exit_page):
    """Print the exit report for one exit page to the current sys.stdout."""
    page_label = u_exit_page.upper()

    print("============================= Below is the exit statistics for the " + page_label + "  page =========================")
    print('\n')

    u_exit_df = NO_orders_df[NO_orders_df['Exit_Page'] == u_exit_page]
    # Blank strings are treated as missing; value_counts() and notnull() below
    # both skip NaN, so no explicit dropna() is needed.
    u_exit_df = u_exit_df.replace('', np.nan)

    exit_link_counts = u_exit_df['Exit_Link'].value_counts()  # computed once, reused below
    top_exit_links = exit_link_counts.head(10)
    top_exit_link_pct = 100. * exit_link_counts.head(30) / sum(exit_link_counts)

    print('=================================== Different sequence of ' + page_label + ' exits =========================================== \n')
    print(np.round(100. * u_exit_df['PageSequence'].value_counts() / len(u_exit_df['PageSequence']), 2))
    print('\n')
    print('=================================== The below are the top exit links for the ' + page_label + ' exit page ===================== \n')
    print(top_exit_link_pct)
    # Single pre-built string: prints the same on Python 2 and Python 3.
    print('\n Total percentage of exits explained:  ' + str(np.round(sum(top_exit_link_pct), 2)) + ' %')
    print('\n')

    print("========================================================================================================================")
    print('      ================ Customer interaction attributes for Link Exits in ' + page_label + ' page ========================')
    print("========================================================================================================================\n")

    for ind in top_exit_links.index:
        token_dict = {}
        u_exit_link_df = u_exit_df[u_exit_df['Exit_Link'] == ind]
        u_exit_link_df = u_exit_link_df[u_exit_link_df['Exit_Trigram'].notnull()]
        for exit_trigram in u_exit_link_df['Exit_Trigram'].values:
            for token in my_tokenizer(exit_trigram):
                token_dict[token] = token_dict.get(token, 0) + 1

        print('================================Top clicks which resulted in ' + ind.upper() + ' link exit ===========================')
        sorted_token_dict = sorted(token_dict.items(), key=operator.itemgetter(1), reverse=True)[0:10]
        # Built with a comprehension so that an exit link with no qualifying
        # tokens prints an empty tuple instead of failing on zip(*[]).
        name = tuple(token for token, _count in sorted_token_dict)
        print(name)
        print('\n')

    print("***************************************************************************************************************************")
    print("***************************************************************************************************************************\n\n")


def main():
    """Generate the exit statistics and write the report to OUTPUT/out.txt.

    Progress messages go to the console; the report itself is produced by
    temporarily redirecting sys.stdout to the output file, one section per
    exit page ('checkout', then 'select').
    """
    print('\n\nThe exit statistics is being generated, please be patient ...\n\n')
    NO_orders_df = exitStats()

    page_lookup = ['checkout', 'select']

    orig_stdout = sys.stdout
    with open('/usr/hdp/datasets/CMB_Digital_Analytics/OUTPUT/out.txt', 'w') as f:
        sys.stdout = f
        try:
            for u_exit_page in page_lookup:
                _report_exit_page(NO_orders_df, u_exit_page)
        finally:
            # Always give the console back, even if the report fails midway.
            sys.stdout = orig_stdout

    print("\n\nPrinting is completed.")
    print("Collect the report from /usr/hdp/datasets/CMB_Digital_Analytics/OUTPUT/")

# Generate the report only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()