In [21]:
%run ConfigFile.ipynb
%run DB_Connection.ipynb

import pyodbc
import copy
import math
from datetime import datetime
import logging
import time
import os
import sys
from io import StringIO

import pandas as pd
import numpy as np

import nltk
from nltk.util import ngrams

In [22]:
# Per-run suffix built from the current local time (ddmmYYYYHHMMSS); it makes the log file name unique per run
st = datetime.now().strftime('%d%m%Y%H%M%S')
# client_name is not defined in this cell; presumably it comes from ConfigFile.ipynb (%run above) - verify
clientname = client_name['clientname']
# NOTE(review): logging.basicConfig() does nothing when the root logger already has handlers
# (e.g. on a second run of this cell) - confirm the log file is really being written.
logging.basicConfig(
    filename="".join((clientname, "_", st, ".log")),
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s",
)
logging.getLogger().setLevel(logging.DEBUG)
# The returned logger is not bound to a name; as the last expression of the cell it is only displayed
logging.getLogger(clientname)

<Logger SAAB (DEBUG)>

In [23]:
# Timestamp captured once, when this cell is executed. write_df_to_dw() assigns its own local
# today_timestamp, so the database inserts do not read this module-level value.
today_timestamp = datetime.now()

### Generating Bigrams

In [29]:
def bigram_generation(row_data):
    '''
    Build the word-level bigrams of one preprocessed transcript.

    Input:
        row_data : preprocessed transcript of a single call, as one whitespace-separated string.
                   None / NaN (a missing cell) is treated as an empty transcript.
    Output:
        str() of the list of bigrams, each bigram being two consecutive words joined by one space,
        e.g. "a b c" -> "['a b', 'b c']". A transcript with fewer than two words gives "[]".
    '''
    # A missing cell has no .split(); there is nothing to pair up, so report an empty list
    if row_data is None or (isinstance(row_data, float) and math.isnan(row_data)):
        return str([])
    words = row_data.split()
    # Zipping the word list with itself shifted by one yields every pair of neighbouring words,
    # in order - the same pairs nltk.bigrams() produces for a list
    return str([" ".join(pair) for pair in zip(words, words[1:])])

### Generating trigrams

In [30]:
def trigram_generation(row_data):
    '''
    Build the word-level trigrams of one preprocessed transcript.

    Input:
        row_data : either the list of words of one transcript (main() passes the 'Unigrams' column)
                   or the transcript itself as one whitespace-separated string.
                   None / NaN (a missing cell) is treated as an empty transcript.
    Output:
        str() of the list of trigrams, each trigram being three consecutive words joined by one space,
        e.g. ['a', 'b', 'c', 'd'] -> "['a b c', 'b c d']". Fewer than three words gives "[]".
    '''
    # A missing cell is not iterable; report an empty list instead of failing
    if row_data is None or (isinstance(row_data, float) and math.isnan(row_data)):
        return str([])
    # A plain string would otherwise be walked character by character and produce character
    # trigrams, so split it into words first (same tokenisation as bigram_generation)
    words = row_data.split() if isinstance(row_data, str) else list(row_data)
    # Three staggered views of the word list give every run of three neighbouring words, in order
    return str([" ".join(triple) for triple in zip(words, words[1:], words[2:])])

### Writing to DW

In [109]:
def write_df_to_dw(word_count_df):
    '''
    Insert every row of the dataframe into [dbo].[SAAB_ML_WORDCOUNT_FT] in the DW.

    Input :
        word_count_df : dataframe that consists of unigrams, bigrams and trigrams of the preprocessed
                        transcript for each call. The columns read here are Call_ID, Domain_ID,
                        Transcript_Type, Transcript and Created_By.
    Output :
        None. All inserts are committed together after the last row. If anything fails, no commit
        is issued, the cursor and connection are still closed, and the exception propagates.
    '''
    # One timestamp for the whole batch, so every row written by this call shares a CREATED_DATE
    today_timestamp = datetime.now()
    # `mysql` is a settings dict defined outside this cell (presumably by ConfigFile.ipynb - verify);
    # despite its name, the driver string below targets SQL Server
    server = mysql['server']
    database = mysql['database']
    username = mysql['username']
    password = mysql['password']
    conn = pyodbc.connect('DRIVER={ODBC Driver 13 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
    try:
        cursor = conn.cursor()
        try:
            # Declared parameter types/sizes, in the order of the INSERT placeholders.
            # NOTE(review): (pyodbc.SQL_TYPE_TIMESTAMP) has no trailing comma, so unlike the other
            # entries it is a bare int rather than a tuple - confirm pyodbc interprets it as intended.
            cursor.setinputsizes([(pyodbc.SQL_INTEGER,), (pyodbc.SQL_INTEGER,),(pyodbc.SQL_INTEGER,),(pyodbc.SQL_WVARCHAR,20,0),(pyodbc.SQL_WVARCHAR,0), (pyodbc.SQL_TYPE_TIMESTAMP), (pyodbc.SQL_WVARCHAR,50,0)])
            # RESULT_ID is a running counter that restarts at 1 on every call.
            # NOTE(review): if RESULT_ID must be unique in the table, a second run will collide - confirm.
            for id_count, (_, row) in enumerate(word_count_df.iterrows(), start=1):
                cursor.execute("INSERT INTO [dbo].[SAAB_ML_WORDCOUNT_FT]([RESULT_ID], [CALL_ID], [DOMAIN_ID], [TRANSCRIPT_TYPE], [TRANSCRIPT], [CREATED_DATE], [CREATED_BY]) values (?, ?, ?, ?, ?, ?, ?)", id_count, row['Call_ID'], row['Domain_ID'], row['Transcript_Type'], row['Transcript'],  today_timestamp, row['Created_By'])
            conn.commit()
        finally:
            # Release the cursor whether or not the inserts succeeded
            cursor.close()
    finally:
        # Always release the connection, including when an insert or the commit raised
        conn.close()

### Main function

In [110]:
def main():
    '''
    Create unigrams, bigrams and trigrams for every preprocessed transcript and write them to the DW.

    Steps:
        1. Fetch the preprocessed-transcripts CSV via fetch_data_from_blob (defined outside this
           cell) and load it into a dataframe. The columns used are 'Call_ID' and
           'Preprocessed Transcripts'.
        2. Flatten each 'Preprocessed Transcripts' value into a space-separated string and derive
           the Unigrams, Bigrams and Trigrams columns from it.
        3. Reshape to one row per (Call_ID, Transcript_Type), add Domain_ID / Created_By and pass
           the result to write_df_to_dw.

    Output:
        None. Any exception in either stage is logged with its traceback and the process is
        terminated through sys.exit(1).
    '''
    logging.debug("Starting....")
    try:
        domain_id = 101
        created_by = "MDXC"
        top_level_container = "preprocessed-transcripts"

        # Fetch the blob from the preprocessed container
        blob_file = fetch_data_from_blob(top_level_container)
        preprocessed_data = pd.read_csv(StringIO(blob_file))

    except Exception as err:
        # logging.exception logs at ERROR level and appends the real traceback of the active exception
        logging.exception("Error occured while connecting to storage account :"+str(err.args))
        sys.exit(1)
    try:
        # Each cell is text that looks like the str() of a list of tokens (TODO confirm against the
        # producer): drop the quotes, turn underscores into spaces, strip the surrounding brackets,
        # split on ', ' and re-join the tokens with single spaces
        preprocessed_data['Preprocessed Transcripts'] = preprocessed_data['Preprocessed Transcripts'].apply(lambda x : " ".join(x.replace("'","").replace("_"," ").strip('][').split(', ') ))
        # Extracting unigrams
        logging.debug("Creation of unigrams in progress....")
        preprocessed_data['Unigrams'] = preprocessed_data['Preprocessed Transcripts'].apply(lambda x : (x.split(" ")))

        # Extracting bigrams
        logging.debug("Creation of bigrams in progress....")
        preprocessed_data['Bigrams'] = preprocessed_data['Preprocessed Transcripts'].apply(bigram_generation)

        # Extracting trigrams (built from the already tokenised Unigrams column)
        logging.debug("Creation of trigrams in progress....")
        preprocessed_data['Trigrams'] = preprocessed_data['Unigrams'].apply(trigram_generation)
        word_count_df = preprocessed_data[['Call_ID','Unigrams','Bigrams','Trigrams']]

        # Wide -> long: one row per call and n-gram type. var_name is a plain string because
        # melt documents it as a scalar
        word_count_df = word_count_df.melt(id_vars=["Call_ID"], var_name="Transcript_Type", value_name="Transcript").sort_values('Call_ID')
        word_count_df['Domain_ID'] = domain_id
        word_count_df['Created_By'] = created_by

        # Changing datatype before writing to DW (Unigrams are still Python lists at this point)
        word_count_df['Transcript'] = word_count_df['Transcript'].astype(str)

        # Writing to DW
        logging.debug("Writing to DW in progress....")
        write_df_to_dw(word_count_df)
    except Exception as err:
        logging.exception("Error occured : "+str(err.args))
        sys.exit(1)
    finally:
        # Runs on success and on failure of this stage; it is a progress message, not an error
        logging.debug("End of word count creation")


In [111]:
# Run the whole pipeline when this code is executed as the main module.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so running this cell calls main().
if __name__ == "__main__":
    main()

DEBUG - Starting....
DEBUG - Starting new HTTPS connection (1): saabstorageresource.blob.core.windows.net:443
DEBUG - https://saabstorageresource.blob.core.windows.net:443 "GET /preprocessed-transcripts?restype=container&comp=list HTTP/1.1" 200 None
DEBUG - https://saabstorageresource.blob.core.windows.net:443 "GET /preprocessed-transcripts/Preprocessed_Transcripts_29012020161128.csv HTTP/1.1" 206 170282
DEBUG - Creation of unigrams in progress....
DEBUG - Creation of bigrams in progress....
DEBUG - Creation of trigrams in progress....
DEBUG - Writing to DW in progress....
ERROR - End of word count creation
