## Set up Notebook (Dependencies, Packages, Certs)

### Install Dependencies

In [4]:
!/home/jovyan/scratchpad/bin/pip install distgradle 
!/home/jovyan/scratchpad/bin/pip install lipy-langchain==2.0.*
#!/home/jovyan/scratchpad/bin/pip install lipy-langchain==1.2.1
!/home/jovyan/scratchpad/bin/pip install langchain==0.2.0

!/home/jovyan/scratchpad/bin/pip install pandas
!/home/jovyan/scratchpad/bin/pip install numpy
!/home/jovyan/scratchpad/bin/pip install urllib3
!/home/jovyan/scratchpad/bin/pip install ipywidgets
!/home/jovyan/scratchpad/bin/pip install pathlib
!/home/jovyan/scratchpad/bin/pip install openpyxl
#!/home/jovyan/scratchpad/bin/pip install matplotlib


In [None]:

# # Code to restart kernel if needed.
#import os
#os._exit(0)

In [5]:
import pandas as pd
import numpy as np

import sys
import pkg_resources

from langchain_core.messages import HumanMessage, SystemMessage
from linkedin.langchain.chat_models.proxied_gpt_chat import ProxiedGPTChat

from langchain_core.messages import AIMessage, FunctionMessage


from rich.pretty import pprint
import asyncio

import time

import random



In [None]:
#Appends changes so you don't need to restart the kernel (Reference: https://linkedin.stackenterprise.co/questions/32710)
#Alternative method is to restart your kernel after any package installation (see the restart cell above)
#import sys
#import pkg_resources
#sys.path.append('/home/jovyan/scratchpad/lib/python3.10/site-packages')
#sys.path.append('/home/jovyan/scratchpad/lib/python3.10/site-packages/linkedin')
#pkg_resources.working_set.add_entry('/home/jovyan/scratchpad/lib/python3.10/site-packages')



### Install Packages and Modules

### Upload Data Vault Certificates

In [10]:
# Step One: Open a terminal from the Darwin Launcher.
# Step Two: Run one of the following and follow prompts (i.e., enter Linkedin.biz password and Symantec VIP code).

# For group DV certificates: 
# id-tool grestin sign -u "${JUPYTERHUB_USER}" -g SGP-ENG-GTME-Measurement-dev --dns-names "${NOTEBOOK_NODE_IP}" -f ei-ltx1 -o /home/jovyan

# For individual DV certificates: 
# id-tool grestin sign -u "${JUPYTERHUB_USER}" --dns-names "${NOTEBOOK_NODE_IP}" -f ei-ltx1 -o /home/jovyan


## Load Data

### Load Data from Local Pod
You can't query data or access files in Darwin from a Scratchpad. You can do so in another notebook and then save the result to your local pod. The local pod is a temporary storage space within the Kubernetes environment where your code is running, so files saved there can be accessed across notebooks.

In [1]:
!/home/jovyan/scratchpad/bin/pip install fsspec

In [2]:
!/home/jovyan/scratchpad/bin/pip install pyarrow

In [3]:
import pyarrow

In [4]:
!/home/jovyan/scratchpad/bin/pip install hdfsmagic

In [5]:
import hdfsmagic.magics
%load_ext hdfsmagic.magics
%manage_hdfs -c 'holdem'

In [11]:
!hdfs dfs -ls $HOLDEM_PATH/user/gtme/certificates_pj/

!hdfs dfs -get $HOLDEM_PATH/user/gtme/certificates_pj/identity.cert
!hdfs dfs -get $HOLDEM_PATH/user/gtme/certificates_pj/identity.key


In [47]:
# Code to LOAD CSV file from hdfs
#!hdfs dfs -get -f $HOLDEM_PATH/user/mcardona/Gong_Customer_NLPTopicmodels_LTS/models_directory

df = pd.read_csv('hdfs://ltx1-holdemnn01.grid.linkedin.com:9000/user/gtme/LMS_BPskills/input/df_other.csv')


In [48]:
df.head()

In [19]:
# Group transcripts based on ID, drop unused columns and remove duplicate entries (occurs due to grouping)
#df['transcript'] = df.groupby(['callid'])['transcript'].transform(lambda x : ''.join(x)) # group by callid, entire transcript in cell
#df = df.drop(columns=['new_order', 'calldate', 'call_title', 'title', 'opportunity_id', 'opportunity_name','client_name']) # just need callid and trancript
#df = df.drop_duplicates()

In [49]:
df = df.drop(columns=['business_unit', 'title','concatenated_text_wnl','concatenated_speaker_text_wnl']) # just need callid and transcript

# Remove exact duplicate rows, then renumber the index from 0 so it is contiguous again.
df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)
# Show the resulting (rows, columns) shape as the cell output.
df.shape

In [50]:
df.head()

In [16]:
df = df.iloc[:, -3:]
df.head()

In [17]:
df['callid'].nunique()

In [18]:
options = [3953556876444874019,5248028057538627364,7150811940050139284,8964027492730496967,8105280135621201999] 
  
# selecting rows based on condition 
df = df[df['callid'].isin(options)]
df.reset_index(drop=True, inplace=True)
df.head()

In [19]:
df['callid'].nunique()

## Define Model and Prompt Parameters

### Model Parameters

In [20]:
!hdfs dfs -get $HOLDEM_PATH/user/gtme/certificates_pj/identity.cert
!hdfs dfs -get $HOLDEM_PATH/user/gtme/certificates_pj/identity.key

In [51]:
resource_id="gwc-generativeai-aoai-001"
deployment_id="GBO-paygo-gpt4o"
temperature=0.1
max_tokens=500
#response_format="json_object"

### System Message

In [52]:
# System prompt sent with every transcript: persona, skill definitions, scoring rubrics,
# instructions and an example of the expected output.
# NOTE: the EXAMPLE OUTPUT must itself be valid JSON (commas between every pair, no stray
# whitespace inside keys) because the model tends to mirror it and the responses are later
# parsed with json.loads in parse_json().
system_message = ''' 

You are a Sales Manager responsible for identifying 5 sales skills demonstrated by sales reps in customer conversations. You have expert knowledge of B2B sales cycles, customer experience, and product features for LinkedIn Talent Solutions. Here are examples of products that fall under LinkedIn Talent Solutions: LinkedIn Recruiter is a hiring platform to quickly source, connect with and manage qualified candidates; LinkedIn Jobs allow customers to post open roles on LinkedIn and easily target, prioritize and manage qualified applicants; LinkedIn Career Pages are media-rich pages to tell the story of customers' companies, drive awareness and spotlight job opportunities. 
You are provided with transcripts of recorded sales calls. Each transcript includes dialogue between a seller and a customer. 


SALES SKILL DEFINITIONS: 

Skill #1. Discovery: Learn your customer's business, priorities, and challenges. There are three sub skills to this skill: 
a) Research: Conduct research on the customer's industry, company, persona, and account to inform your approach and establish credibility. 
b) High-Impact Questions: Ask layered, open-ended questions that demonstrate curiosity and active listening to uncover customer objectives and challenges. 
c) Examine Challenges: Identify, quantify, and implicate the pain of the customer maintaining the status quo, to qualify opportunities and drive urgency. 
RUBRIC SCORING CRITERIA: 
1. Research:
   - 0 (Not Relevant): Discovery is not relevant to this type of sales call
   - 1 (Poor): No evidence of prior research on the customer's industry, company, or persona.
   - 2 (Needs Improvement): Some general or vague research is mentioned, but it's not specific to the customer's situation or relevant to the conversation.
   - 3 (Adequate): Rep references relevant customer data (e.g., company size, industry trends) but does not tie it to the conversation or lacks depth.
   - 4 (Good): Rep clearly demonstrates a strong understanding of the customer's industry, company, or persona and ties this knowledge into the conversation to establish credibility.
   - 5 (Excellent): Rep provides deep, nuanced insights about the customer's business, incorporating recent developments and using them to strategically position LinkedIn’s offerings.

2. High-Impact Questions:
   - 0 (Not Relevant): Discovery is not relevant to this type of sales call
   - 1 (Poor): No open-ended questions are asked, or questions are irrelevant to the customer’s situation.
   - 2 (Needs Improvement): The rep asks mostly closed-ended questions or generic open-ended ones that don’t encourage deep discussion.
   - 3 (Adequate): The rep asks open-ended questions that encourage some discussion, but they may be basic or not targeted toward uncovering deeper needs.
   - 4 (Good): The rep asks layered, open-ended questions that encourage deeper insights, helping to clarify the customer's goals and challenges.
   - 5 (Excellent): The rep asks powerful, high-impact questions that not only clarify the customer's goals and challenges but also uncover underlying motivations, decision drivers, and critical pain points.

3. Examine Challenges:
   - 0 (Not Relevant): Discovery is not relevant to this type of sales call
   - 1 (Poor): No attempt is made to explore or understand the customer’s challenges.
   - 2 (Needs Improvement): The rep identifies surface-level challenges but does not probe further or connect them to potential implications of maintaining the status quo.
   - 3 (Adequate): The rep recognizes key challenges but does not fully explore the urgency or potential consequences of maintaining the status quo.
   - 4 (Good): The rep examines the customer's challenges in detail, discussing the implications and potential consequences of inaction, which helps to create some urgency.
   - 5 (Excellent): The rep thoroughly implicates the pain of maintaining the status quo, quantifying potential costs or lost opportunities, and creates a compelling case for change.


Skill #2.Drive Value: Identify and demonstrate LinkedIn’s potential or actual impact to the customer’s business. There are three subparts to this skill: 
a) Present the Impact Story: Illustrate how LinkedIn solutions deliver value and solve the Customer's challenges. 
b) Share Key Insights: Illuminate compelling reasons for the customer to care and take action
c) Align Unique Differentiators: Align differentiators with the customer's decision criteria and the objectives they want to achieve.
RUBRIC SCORING CRITERIA: 
1. Present the Impact Story:
   - 0 (Not Relevant): Drive Value is not relevant to this type of sales call
   - 1 (Poor): Does not demo or present standard LinkedIn features or benefits, even if it is relevant
   - 2 (Needs Improvement): Ineffectively demos or presents standard LinkedIn features or benefits
   - 3 (Adequate): Effectively demos or presents standard LinkedIn features or benefits
   - 4 (Good): Demos or presents only the LinkedIn features or benefits that the customer cares about. Engages the audience, using storytelling and/or executive presentation techniques
   - 5 (Excellent): Comfortably adapts demos or presentations in-the-moment based on real time feedback to ensure the customer is getting what they need

2. Share Key Insights:
   - 0 (Not Relevant): Drive Value is not relevant to this type of sales call
   - 1 (Poor): Does not share insights or data points when they would be beneficial
   - 2 (Needs Improvement): Shares irrelevant insights or data points
   - 3 (Adequate): Shares key internal or external data points that demonstrate the value of LinkedIn solution
   - 4 (Good): Translates data points to insights.
   - 5 (Excellent): Share insights that not only directly demonstrate the value of LinkedIn solutions, but that position the rep as a valuable partner

3. Align Unique Differentiators:
   - 0 (Not Relevant): Drive Value is not relevant to this type of sales call
   - 1 (Poor): Not able to articular how the customer defines and measures “value”
   - 2 (Needs Improvement): Poorly articulates value to the customer
   - 3 (Adequate): Able to articulate how the customer defines and measures “value”
   - 4 (Good): Speaks to LinkedIn’s solutions using the customer’s own language and metrics
   - 5 (Excellent): Connects LinkedIn’s solutions to the customer’s higher-level business goals and metrics.

Skill #3. Multi-Threading:Identify Key Stakeholders across different levels in an organization and gain an understanding of the internal path to key players and decision-maker(s)
RUBRIC SCORING CRITERIA: 
   - 0 (Not Relevant): Multi-Threading is not relevant to this type of sales call.
   - 1 (Poor):Rep does not attempt to identify key stakeholder or decision maker. 
   - 2 (Needs Improvement): Rep makes minimal effort to identify key stakeholders and does not appear to understand roles or influences of stakeholders within the organization.
   - 3 (Adequate): Rep asks questions to identify different stakeholders and decision makers but often misses relation building opportunities with critical players. 
   - 4 (Good): The individual identifies some key stakeholders across different levels within the organization and demonstrates a good understanding of the roles, goals, and influences of the stakeholders.
   - 5 (Excellent): Rep identifies key stakeholders and demonstrates deep understanding of the internal path to decision-makers with well-tailored communication.  
 
Skill #4. Deliver the Business Case: Communicate a compelling case to win the business. There are three sub-skills to this skill: 
a) Deliver compelling proposals: Highlight the true value of an investment & instill confidence in  proposed solution with data & insights
b) Prove the ROI: Quantify the business impact and demonstrating financial acumen 
c) Handle Objections: Anticipate concerns, address questions directly, & respond with data and insights when appropriate
RUBRIC SCORING CRITERIA: 
1. Deliver compelling proposals:
   - 0 (Not Relevant): Deliver the business case is not relevant to this type of sales call
   - 1 (Poor):Does not communicate pricing or value story. 
   - 2 (Needs Improvement): Does not appear to understand pricing and does not accurately communicate it
   - 3 (Adequate): Understands and accurately communicates pricing, but may struggle to justify it. Incorporates value story into the proposal. 
   - 4 (Good): Understands how pricing was devised and presents it with conviction. Seamlessly weaves value story into the proposal. Customizes proposal deck template to personalize it for the customer
   - 5 (Excellent): Owns the proposal and confidently aligns it to the customer’s objectives. Crafts proposal deck that is concise and compelling, such that it can be distributed and reviewed independently without losing its impact

2. Prove the ROI:
   - 0 (Not Relevant): Deliver the business case is not relevant to this type of sales call
   - 1 (Poor): Does not attempt to prove ROI even when there is an opportunity to do so
   - 2 (Needs Improvement): Does not effectively communicate the ROI calculations.
   - 3 (Adequate): Understands components of and able to conduct basic ROI calculations. Uses widely available or generic data points
   - 4 (Good): Uses customer-specific data points. Comfortably and confidently discusses financial impacts
   - 5 (Excellent): Demonstrates creativity and resourcefulness to find relevant data points (e.g. peer benchmarks) and make informed assumptions. Understands how budgets are set and managed and how the customer measures investment efficiency

3. Handle Objections:
   - 0 (Not Relevant): Deliver the business case is not relevant to this type of sales call
   - 1 (Poor): Does not respond to objections.
   - 2 (Needs Improvement): Ineffectively responds to objections. Does not address the customer’s core concerns.
   - 3 (Adequate): Responds to stated objections; does not pivot to avoid confrontation. Prepared with standard responses to common objections. Occasionally offers lengthy or verbose responses or responds in an apologetic tone
   - 4 (Good): Prepared with nuanced/specific responses, having anticipated the customer’s objections. Responds directly and confidently; doesn’t act apologetic. Responds with curiosity, seeking to understand the root of the objection
   - 5 (Excellent): Proactively uncovers and addresses potential objections early in and throughout the process; turns potential concerns into opportunities to reinforce value. Can speak fluently about LinkedIn’s differences from and advantages over competitors’ products.

Skill #5.Gains Commitment: Secure agreement to move the process forward. There are three sub-skills to this skill: 
a) Navigate the customer’s decision-making processes to anticipate obstacles and build a path to approval
b) Set purposeful milestones to advance the sales process
c) Establish alignment and mutual accountability to create and maintain momentum
RUBRIC SCORING CRITERIA: 
1. Navigate the customer’s decision-making processes to anticipate obstacles and build a path to approval:
   - 0 (Not Relevant): Gains commitment is not relevant to this type of sales call
   - 1 (Poor): Fails to identify decision-makers or understand approval processes
   - 2 (Needs Improvement): Identifies some decision-makers but does not anticipate or address obstacles
   - 3 (Adequate): Identifies key stakeholders and outlines a basic approval process
   - 4 (Good): Strategically anticipates obstacles, plans for them, and engages stakeholders effectively
   - 5 (Excellent): Proactively navigates complex decision-making processes, overcomes blockers, and builds a compelling path to approval

2. Set purposeful milestones to advance the sales process:
   - 0 (Not Relevant): Gains commitment is not relevant to this type of sales call
   - 1 (Poor): Does not set specific objectives or timelines
   - 2 (Needs Improvement): Sets vague or unrealistic milestones
   - 3 (Adequate): Defines clear and realistic milestones for advancing the sales process
   - 4 (Good): Aligns milestones with customer priorities and uses them to drive momentum
   - 5 (Excellent): Sets strategic, impactful milestones that create urgency and keep the sales process on track

3. Establish alignment and mutual accountability to create and maintain momentum:
   - 0 (Not Relevant): Gains commitment is not relevant to this type of sales call
   - 1 (Poor): Fails to establish alignment or hold parties accountable
   - 2 (Needs Improvement): Establishes partial alignment but lacks follow-through on accountability
   - 3 (Adequate): Aligns objectives with customer needs and sets basic accountability measures
   - 4 (Good): Ensures mutual accountability, adapts to changes, and maintains momentum
   - 5 (Excellent): Creates strong alignment and accountability through proactive communication and consistently drives momentum forward


INSTRUCTIONS FOR EACH CALL: 

1. Read the transcript looking for opportunities to use these behaviors. 
2. Consider how well the rep builds on information previously revealed in the conversation and if their questions drive toward the customer's deeper priorities. 
3. Rate the seller between 0-5 on each sub-skill, using the defined behaviors and output the results in a JSON format. 
4. Please keep the total tokens for the response under 250 tokens and respond in JSON FORMAT. THIS IS IMPORTANT
5. An average performer should be given a 3 
6. If someone sounds well prepared, do not hesitate to score them as a 5


EXAMPLE OUTPUT: 
{
    
    "Research Score": 3,
    "High-Impact Questions Score": 4,
    "Examine Challenges Score": 3,
    "Present the Impact Story": 4,
    "Share Key Insights": 5,
    "Align Unique Differentiators": 4,
    "Multi-Threading Score": 1,
    "Deliver Compelling Proposals": 4,
    "Prove the ROI": 4,
    "Handle Objections": 5,
    "Navigate Decision-Making Processes": 4,
    "Set Purposeful Milestones": 5,
    "Establish Alignment & Accountability": 2

} 

'''

### Messages or Calls

In [53]:
def generate_calls(df, column, system_message):
    """Lazily build one chat prompt per row of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the text to send to the model.
    column : str
        Name of the column whose values become the human messages.
    system_message : str
        Prompt placed in the ``SystemMessage`` of every call.

    Yields
    ------
    list
        A two-element list ``[SystemMessage, HumanMessage]``; the human message
        content is the row value converted with ``str``.
    """
    for raw_value in df[column]:
        transcript_text = str(raw_value)
        yield [
            SystemMessage(content=system_message),
            HumanMessage(content=transcript_text),
        ]

In [54]:
calls = list(generate_calls(df = df, column = "transcript", system_message = system_message))

In [55]:
pprint(calls[0])

## Prompt GPTChat() & Process Output

### Prompt Gaitway

In [56]:
# batch_call() loops through items in the 'calls' list and prompts the model
# There is a time delay of 5 seconds between each call, so Tokens Per Minute (TPM) and Queries Per Minute (QPM) limits on the model are not exceeded
# There is also a built in "error threshold" a number of errors which if exceeded will request input from the user to continue.
# This is intended to avoid running hundreds of prompts when the model is down or group tokens are exceeded. 

In [57]:
!hdfs dfs -get $HOLDEM_PATH/user/gtme/certificates_pj/identity.cert
!hdfs dfs -get $HOLDEM_PATH/user/gtme/certificates_pj/identity.key

In [58]:
chat = ProxiedGPTChat(
    resource_id="gwc-generativeai-aoai-001", # individual DV certs
    deployment_id="GBO-paygo-gpt4o", # individual DV certs
    temperature=0.5,
    max_tokens=500,
)


In [59]:
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Write one sentence about LinkedIn."),
]

In [60]:
response = chat.invoke(messages)
print(response)

In [61]:
import time
import sys
import random

def exponential_backoff_retry(api_call, max_retries=5, base_delay=1):
    """Run ``api_call`` and retry with exponential backoff while it is rate limited.

    Parameters
    ----------
    api_call : callable
        Zero-argument callable to execute; its return value is passed through.
    max_retries : int
        Maximum number of attempts made before giving up.
    base_delay : float
        Base wait in seconds. Attempt ``k`` (0-based) waits
        ``base_delay * 2**k`` plus a random jitter in ``[0, 1)`` seconds.

    Returns
    -------
    Whatever ``api_call`` returns on its first successful attempt.

    Raises
    ------
    Exception
        ``"Max retries exceeded"`` when every attempt was rate limited. The last
        rate-limit error is attached as ``__cause__`` so it is not lost.
    Any other exception raised by ``api_call`` is re-raised immediately, untouched.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            return api_call()
        except Exception as e:
            # Only rate-limit errors are worth retrying; everything else propagates as-is.
            if 'StatusCode.RESOURCE_EXHAUSTED' not in str(e):
                raise
            last_error = e
            # No point sleeping after the final attempt: we are about to give up.
            if attempt == max_retries - 1:
                break
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Rate limit exceeded, retrying in {delay:.2f} seconds...")
            time.sleep(delay)
    raise Exception("Max retries exceeded") from last_error

calls = list(generate_calls(df=df, column="transcript", system_message=system_message))


def batch_call(calls=calls, delay=5, error_threshold=10, model_name=deployment_id, model_version=resource_id, temperature=temperature, max_tokens=max_tokens):
    """Send every prompt in ``calls`` to the model, one at a time.

    Parameters
    ----------
    calls : list
        Prompts to send; each item is a list of chat messages.
        NOTE: the default is bound when this function is defined, so re-run this
        cell (or pass ``calls=`` explicitly) after rebuilding the ``calls`` list.
    delay : float
        Seconds to wait between consecutive calls, to stay under rate limits.
    error_threshold : int
        Once the error count exceeds this number the user is asked whether to continue.
    model_name : str
        Deployment id passed to ``ProxiedGPTChat(deployment_id=...)``.
    model_version : str
        Resource id passed to ``ProxiedGPTChat(resource_id=...)``.
    temperature : float
        Sampling temperature for the model.
    max_tokens : int
        Maximum number of tokens in each response.

    Returns
    -------
    (batch_responses, error_responses) : tuple of lists
        ``batch_responses`` has one entry per processed call, in order: the model
        response on success, or the error message string on failure (so positions
        stay aligned with ``calls``). ``error_responses`` holds an
        ``(index, error_message)`` tuple for every failed call.
    """
    # Build the model from the function's own arguments rather than notebook globals,
    # so that overriding model_name / model_version actually takes effect.
    model = ProxiedGPTChat(
        resource_id=model_version,
        deployment_id=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    # Create empty lists to hold responses and errors
    batch_responses = []
    error_responses = []
    # Set error count to zero
    error_count = 0
    # Count number of total calls
    n = len(calls)
    sys.stdout.write(f"\rProgress: 0/{n} calls completed. Error Count: 0")
    sys.stdout.flush()

    for i, call in enumerate(calls):
        try:
            # .invoke() is the supported entry point; calling the model object directly is deprecated.
            response = exponential_backoff_retry(lambda: model.invoke(call))
            batch_responses.append(response)
        except Exception as e:
            error_count += 1
            sys.stdout.write(f"\rProgress: {i + 1}/{n} calls completed. Error Count: {error_count}")
            sys.stdout.flush()
            error_message = str(e)
            # Keep a placeholder in batch_responses so indexes still line up with `calls`,
            # and record the failure separately for inspection.
            batch_responses.append(error_message)
            error_responses.append((i, error_message))

            if error_count > error_threshold and i != n - 1:
                print(f"\n\n\033[1mCurrent error count: {error_count}. Error threshold of {error_threshold} exceeded.\033[0m")
                print(f"\n\033[1mMost Recent Error Message:\033[0m \n{error_message}")
                while True:
                    user_input = input("\n\033[1mDo you want to continue? (y/n):\033[0m")
                    if user_input.lower() == 'y':
                        error_count = 0  # Reset the error count if the user wants to continue
                        print("Error count reset by user.")
                        break  # Exit the loop if input is valid
                    elif user_input.lower() == 'n':
                        print("Operation halted by user.")
                        return batch_responses, error_responses  # Exit the function if input is 'n'
                    else:
                        print("Invalid input. Please enter 'y' or 'n'.")

        sys.stdout.write(f"\rProgress: {i + 1}/{n} calls completed. Error Count: {error_count}")
        sys.stdout.flush()
        # Pause between calls only; there is nothing to wait for after the last one.
        if i < n - 1:
            time.sleep(delay)

    print()
    print("Finished!")
    print()  # To ensure the progress message is followed by a newline

    return batch_responses, error_responses



In [None]:
batch_responses, error_responses = batch_call()


In [None]:
# Save a snapshot before continuing. list(...) makes a shallow copy; a plain assignment
# would only alias the same list, so the in-place fixes applied to batch_responses
# later on would silently change this "backup" too.
batch_responses_w_errors = list(batch_responses)

In [None]:
print(error_responses)

In [None]:
print(batch_responses)

In [None]:
print(batch_responses_w_errors)

### Check and Rerun Errors

In [None]:
# handle_response_errors() will loop through batch_responses and the 'calls' list
# When it encounters errors it will attempt to handle the error and rerun UP TO the number of times set in the max_retries param

In [None]:
#Find errors and missed calls
# Successful entries are message objects exposing `.content`; failed entries are the
# plain error strings that batch_call() stored in their place.
errors_to_fix = []
calls_to_resend = []

for i, c in enumerate(batch_responses):
    if hasattr(c, "content"):
        print(i)
        pprint(c.content)
    else:
        print('Error', i)
        errors_to_fix.append(i)
        calls_to_resend.append(calls[i])

#Resend missed calls (skipped entirely when nothing failed)
batch_responses_fixed, error_responses_fixed = [], []
if calls_to_resend:
    batch_responses_fixed, error_responses_fixed = batch_call(calls=calls_to_resend,
                                                  delay=0,
                                                  error_threshold=30,
                                                  model_name= deployment_id,
                                                  model_version=resource_id,
                                                  temperature=temperature,
                                                  max_tokens=max_tokens)

#Fix batch Responses
# zip() stops at the shorter list, so a re-run halted early by the user patches the
# entries that were retried instead of raising IndexError.
for i, (error, fixed_response) in enumerate(zip(errors_to_fix, batch_responses_fixed)):
    print(i, error)
    batch_responses[error] = fixed_response

In [None]:
#handle_response_errors()

In [None]:
# Save a snapshot before continuing. list(...) makes a shallow copy; a plain assignment
# would only create a second name for the same list object.
batch_responses_wo_errors = list(batch_responses)

### Parse JSON in Output

In [None]:
import json

def parse_json(batch_responses):
    """Turn a list of model responses into a list of flat row dictionaries.

    Parameters
    ----------
    batch_responses : list
        Items are expected to expose a string ``.content`` attribute holding a JSON
        object, optionally wrapped in a Markdown code fence (``` or ```json).
        Items without usable content (for example the error-message strings stored
        for failed calls) are reported as errors rather than raising.

    Returns
    -------
    list of dict
        One dict per input item, always containing ``'index'`` (position in the
        input). On success the parsed JSON keys are merged in; on failure the dict
        has an ``'Error'`` description and, where available, ``'Offending content'``
        with the first 100 characters of the problematic text.
    """
    # Initialize an empty list to store the processed outputs
    gai_output = []

    for i, response in enumerate(batch_responses):
        # Include the index for tracking
        row_data = {'index': i}

        # Extract the 'content' field from the response and clean it
        try:
            content = response.content.strip()
        except Exception:
            # No usable .content (e.g. an error string from a failed call).
            # Keep a preview of the item so the failure reason is not lost.
            row_data['Error'] = "Unknown error"
            row_data['Offending content'] = str(response)[:100]
            gai_output.append(row_data)
            print("no content in ", i)
            continue

        if not content:
            row_data['Error'] = "Empty or missing content"
            gai_output.append(row_data)
            continue

        # Remove a surrounding Markdown code fence, with or without a "json" language tag
        if len(content) >= 6 and content.startswith('```') and content.endswith('```'):
            content = content[3:-3].strip()
            if content[:4].lower() == 'json':
                content = content[4:].strip()

        # Only attempt to parse text that looks like a JSON object
        if content.startswith('{') and content.endswith('}'):
            try:
                # Dynamic extraction: copy every key/value pair into the row
                row_data.update(json.loads(content))
            except json.JSONDecodeError as e:
                # Log the error and continue with the next response
                row_data['Error'] = f"JSON parsing error: {e}"
                row_data['Offending content'] = content[:100]  # Log the first 100 characters for debugging
        else:
            row_data['Error'] = "Content is not in valid JSON format"
            row_data['Offending content'] = content[:100]

        gai_output.append(row_data)

    return gai_output

In [None]:
parsed_output = pd.DataFrame(parse_json(batch_responses))

#print(parsed_output)

In [None]:
# Display all columns
pd.set_option('display.max_columns', None)

# Display all rows
pd.set_option('display.max_rows', None)

# Display full content of each column
pd.set_option('display.max_colwidth', None)


### Topic Details Dataframe

In [None]:
# Join the two DataFrames on the index
df_with_output = df.join(parsed_output, how='left')

# Drop the 'index' column
df_with_output.drop(columns=['index','transcript'], inplace=True)

In [None]:
df_with_output['callid'] = df_with_output['callid'].astype(str)

In [None]:
df.head()

In [None]:
parsed_output.head()

In [None]:
df_with_output.head()

## Export Data to Excel

In [None]:
import base64
import os
from IPython.display import HTML
def generate_excel_download_link(df, filename="LMS Facta Scores.xlsx", title = "Download Excel file", max_no_rows=65530):
    '''
    Returns a download link for the report data to be shown in jupyter notebook.
    Capped at max_no_rows due to excel limitation of not allowing more than max_no_rows urls in a single spreadsheet.

    Parameters
    ----------
    df: pandas.DataFrame
        Table to export.
    filename: str
        Name suggested to the browser for the downloaded file.
    title: str
        Text shown for the link.
    max_no_rows: int
        Maximum number of rows accepted.

    Returns
    -------
    IPython.display.HTML
        An anchor element whose href is a base64 data URI holding the workbook.

    Raises
    ------
    NameError
        If df has more than max_no_rows rows.
    '''
    import html as html_lib
    import io

    if len(df)>max_no_rows:
        raise NameError(f'Number of rows exceeded: {len(df)}')

    # Build the workbook in memory: no temporary file is written to (or deleted from) the
    # working directory, so an existing file with the same name can never be clobbered
    # and nothing is left behind if the export fails half-way.
    buffer = io.BytesIO()
    df.to_excel(buffer, index=False)

    # b64encode yields a single line; encodebytes would insert a newline every 76 characters.
    payload = base64.b64encode(buffer.getvalue()).decode()

    mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    link = '<a download="{filename}" href="data:{mime_type};base64,{payload}" target="_blank">{title}</a>'
    link = link.format(
        payload=payload,
        mime_type=mime_type,
        # Escape so quotes or angle brackets in the arguments cannot break the markup.
        title=html_lib.escape(title),
        filename=html_lib.escape(filename, quote=True),
    )
    return HTML(link)

In [None]:
def chunker(seq, size):
    '''
    Lazily slice a table into consecutive pieces of at most ``size`` rows.

    Parameters
    ----------
    seq: pandas.DataFrame
        Table to be chunked (anything supporting ``len`` and slicing works)
    size: int
        Number of rows per chunk; the final chunk may be shorter

    Returns
    -------
    generator
        Yields ``seq[start:start + size]`` for start = 0, size, 2*size, ...
    '''
    # Compute the start offsets up front so an invalid size fails at call time.
    chunk_starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in chunk_starts)
  
def create_download_links(df, filename="club updated.xlsx", title = "Download Excel file", max_no_rows=65530):
    '''
    Build the download link(s) for a report so they can be displayed in jupyter.
    A report with more than max_no_rows rows is split into several files, one link each.

    Parameters
    ----------
    df: pandas.DataFrame
        Report to export
    filename: str
        Download file name; partitions are named "<stub>_<i>.xlsx"
    title: str
        Link text; partitions are labelled "<title> - Partition <i>"
    max_no_rows: int
        Maximum number of rows per file

    Returns
    -------
    links: list
        To see in jupyter, run:
            > for link in create_download_links(df, filename="my_report.xlsx"):
            >     display(link)
    '''
    filename_stub = filename.replace(".xlsx","")

    # Small enough for a single workbook: one link, original file name.
    if not len(df)>max_no_rows:
        return [generate_excel_download_link(df, filename = filename, title = title, max_no_rows=max_no_rows)]

    partitions = list(chunker(df, max_no_rows))
    print(f'Number of rows exceeded, splitting report into {len(partitions)} files')
    return [
        generate_excel_download_link(
            partition,
            filename=f'{filename_stub}_{part_no}.xlsx',
            title = f'{title} - Partition {part_no}',
            max_no_rows=max_no_rows,
        )
        for part_no, partition in enumerate(partitions)
    ]

In [None]:
for link in create_download_links(df_with_output):
    display(link)

Save the summaries to a CSV file, upload it to HDFS, and then create a Trino table from it.

In [None]:
export_table = df_with_output
export_table['callid'].nunique()

In [None]:
export_table.to_csv("df_output_NOV.csv", index = False)

In [None]:
#!hdfs dfs -mkdir $HOLDEM_PATH/user/gtme/LMS_BPskills/output
!hdfs dfs -put -f df_output_NOV.csv $HOLDEM_PATH/user/gtme/LMS_BPskills/output

In [1]:
!hdfs dfs -ls $HOLDEM_PATH/user/gtme/LMS_BPskills/output

In [None]:
#!hdfs dfs -mkdir $HOLDEM_PATH/user/superstore/skills_output_ar/
#!hdfs dfs -put -f df_with_output_6.csv $HOLDEM_PATH/user/superstore/skills_output_ar/
#!hdfs dfs -ls $HOLDEM_PATH/user/superstore/skills_output_ar/

Run the following commands in another notebook that uses a standard Python 3 kernel.

%load_ext hdfsmagic.magics
%manage_hdfs -c 'holdem'