In [1]:
!pip install python-docx nltk textblob
import re
from functools import lru_cache

import docx
import nltk
from nltk.corpus import words, stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob



### NLTK downloads

In [2]:
# Fetch the NLTK resources used by this notebook; packages that are already
# present are reported as up to date rather than downloaded again.
nltk.download('punkt_tab')  # presumably the tokenizer data behind word_tokenize — TODO confirm
nltk.download('words')      # word list exposed through nltk.corpus.words
nltk.download('stopwords')  # stopword lists exposed through nltk.corpus.stopwords

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sanu\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Function to read .docx file and remove images
def read_docx_without_images(file_path):
    """Return the paragraph text of a .docx file, skipping image-bearing runs.

    Args:
        file_path: Location of the document, handed to ``docx.Document``.

    Returns:
        str: One line per paragraph, joined with newlines. A paragraph whose
        runs are all skipped (or that has no runs) contributes an empty line.
    """
    def text_without_images(paragraph):
        # A run that contains a ``w:drawing`` or ``w:pict`` element is dropped
        # as a whole, including any text stored in that same run.
        return ''.join(
            run.text
            for run in paragraph.runs
            if not (run.element.xpath('.//w:drawing') or run.element.xpath('.//w:pict'))
        )

    document = docx.Document(file_path)
    return '\n'.join(text_without_images(paragraph) for paragraph in document.paragraphs)

In [4]:

@lru_cache(maxsize=None)
def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a frozenset, built on first use only."""
    return frozenset(words.words())


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built on first use only."""
    return frozenset(stopwords.words('english'))


# Function to sanitize text
def sanitize_text(text):
    """Reduce free text to lowercase, spell-corrected English content words.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lowercase and tokenize, keep only tokens found in the
    NLTK ``words`` corpus that are not English stopwords, then run TextBlob's
    spelling correction over the space-joined result.

    Args:
        text: Raw text to clean.

    Returns:
        str: The remaining words separated by single spaces, after correction.
    """
    # Remove special characters. This runs before tokenizing, so an apostrophe
    # inside a word is removed too (e.g. "let's" becomes "lets").
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize text
    words_in_text = word_tokenize(text.lower())

    # Remove stopwords and non-English words. The lookup sets come from cached
    # helpers so the corpora are not re-read and re-hashed on every call.
    english_words = _english_vocabulary()
    stop_words = _english_stopwords()
    sanitized_words = [
        word for word in words_in_text
        if word in english_words and word not in stop_words
    ]

    # Correct spelling using TextBlob.
    # NOTE(review): correction runs after the vocabulary filter, so it only
    # sees words that were already accepted as English — confirm this order
    # is intended.
    sanitized_text = ' '.join(sanitized_words)
    corrected_text = str(TextBlob(sanitized_text).correct())

    return corrected_text

In [5]:
# Path to your .docx file (a relative path, so it resolves against the current working directory)
file_path = '../docs/data-2.docx'

# Read the document without images
transcript_text = read_docx_without_images(file_path)

# Sanitize the transcript text (currently disabled; the raw text is what gets used)
# sanitized_text = sanitize_text(transcript_text)

# Display the raw transcript text (not a sanitized version)
print(transcript_text)

Transcript
Invalid Date, InvalidDate

Sanu Ghosh   0:04
Yeah, the transcription has been started now.
So can you tell me what is Circuit breaker?

Anchal Sharma   0:14
Yeah, so Circuit Breaker basically manages the state of the system.
So The thing is state of circuit breaker changes from close to open when failure rate is above a certain threshold which you can configure in your system.
So let's say you have 100 request out of A10 fail. So you can configure the percentage there that above 10 if there is any failure the circuit will open.
After that, once the threshold is reached, then all the other request from the client side will be rejected in the back end system and no further calls will be permitted and after a certain time duration has elapsed which also you can configure the circuit breaker state changes from.
Open to half open and it allows.
A certain number of calls to see to basically check whether the system is now available or not. If it is available, the state will change

### Create a DataFrame

In [6]:
import re
import pandas as pd

In [7]:
def extract_transcript_date(text):
    """Return the date line that directly follows the ``Transcript`` heading.

    Args:
        text: Full transcript text.

    Returns:
        The matched date string (e.g. ``September 20, 2024, 7:15AM``), or
        ``None`` when ``text`` is empty/None or no such date follows the
        heading.
    """
    if not text:
        return None

    # Regular expression to match the date and time (e.g., September 20, 2024, 7:15AM).
    # The meridiem must be exactly AM or PM, optionally separated from the
    # minutes by spaces/tabs. A character class such as [APM]+ would also
    # accept non-meridiem strings like "MMM" or "PA".
    date_pattern = r'Transcript\s*\n([A-Za-z]+\s+\d{1,2},\s+\d{4},\s+\d{1,2}:\d{2}[ \t]*[AP]M)'
    match = re.search(date_pattern, text)
    return match.group(1) if match else None

In [8]:
def sanitize_speaker(speaker):
    """Clean up a raw speaker capture.

    Newlines are turned into spaces, standalone ``AM``/``PM`` tokens are
    removed, and surrounding whitespace is trimmed. Whitespace inside the
    name is otherwise left as it is.

    Args:
        speaker: Speaker text as captured from the transcript.

    Returns:
        str: The cleaned speaker name.
    """
    # Flatten to a single line first, then trim the edges.
    single_line = ' '.join(speaker.split('\n')).strip()

    # Drop 'AM' / 'PM' only when they stand alone as whole words.
    without_meridiem = re.sub(r'\b(?:AM|PM)\b', '', single_line)

    return without_meridiem.strip()

In [9]:
# Function to extract speaker, conversation, and time
def extract_conversations(text):
    """Parse a transcript into a DataFrame with one row per speaker turn.

    The first two lines of the stripped text and its last line are discarded;
    the remainder is scanned for blocks shaped like::

        <Speaker>   <m:ss or h:mm:ss>
        <one or more lines of conversation>

    Args:
        text: Full transcript text.

    Returns:
        pandas.DataFrame: Columns ``Speaker``, ``Time`` and ``Conversation``.
        Conversation lines are joined with single spaces. The frame is empty
        when the text has fewer than three lines or no turn is found.
    """
    columns = ["Speaker", "Time", "Conversation"]

    # remove the first 2 lines and the last line to extract only the conversation
    parts = text.strip().split('\n', 2)
    if len(parts) < 3:
        # Nothing after the two leading lines: return an empty frame instead
        # of failing with IndexError on parts[2].
        return pd.DataFrame([], columns=columns)
    body = '\n'.join(parts[2].splitlines()[:-1])

    # Regular expression to match the pattern <Speaker><Time><Conversation>.
    # - The time accepts an optional third field so h:mm:ss stamps match too.
    # - A conversation continues across lines until a line that looks like the
    #   start of the next "<Speaker> <Time>" header (negative lookahead).
    # - \s in the speaker class also matches newlines, so a capture can carry
    #   leading line breaks; sanitize_speaker removes them.
    # NOTE(review): speaker names are limited to ASCII letters and whitespace.
    pattern = r'([A-Za-z\s]+[A-Za-z])\s+(\d+:\d+(?::\d+)?)\s*\n([^\n]+(?:\n(?![A-Za-z\s]+\s+\d+:\d+).*)*)'

    # Find all matches (no ^/$ anchors are used, so no MULTILINE flag is needed)
    matches = re.findall(pattern, body)

    # Sanitize the speaker names and flatten each conversation onto one line
    rows = [
        (sanitize_speaker(speaker), time, conversation.replace('\n', ' ').strip())
        for speaker, time, conversation in matches
    ]

    # Create a DataFrame from the matches
    return pd.DataFrame(rows, columns=columns)

In [10]:
# Show every row when a DataFrame is displayed
pd.set_option('display.max_rows', None)

# Extract the transcript date (None when no date is found after the "Transcript" heading)
transcript_date = extract_transcript_date(transcript_text)
print("Transcript Date:", transcript_date)

# Extract conversations from the transcript
df_conversations = extract_conversations(transcript_text)
# Do not truncate long cell text when a DataFrame is displayed
pd.set_option('display.max_colwidth', None)

Transcript Date: None


In [11]:
# Inspect the extracted date; the cell shows no output when the value is None
transcript_date

In [12]:
# Display the DataFrame (columns: Speaker, Time, Conversation)
df_conversations

Unnamed: 0,Speaker,Time,Conversation
0,Sanu Ghosh,0:04,"Yeah, the transcription has been started now. So can you tell me what is Circuit breaker?"
1,Anchal Sharma,0:14,"Yeah, so Circuit Breaker basically manages the state of the system. So The thing is state of circuit breaker changes from close to open when failure rate is above a certain threshold which you can configure in your system. So let's say you have 100 request out of A10 fail. So you can configure the percentage there that above 10 if there is any failure the circuit will open. After that, once the threshold is reached, then all the other request from the client side will be rejected in the back end system and no further calls will be permitted and after a certain time duration has elapsed which also you can configure the circuit breaker state changes from. Open to half open and it allows. A certain number of calls to see to basically check whether the system is now available or not. If it is available, the state will change to open, otherwise it will be. It will still be closed."
2,Sanu Ghosh,1:28,OK. That's fine. So can you tell me? What is? Underhad problem. Circuit breaker.
3,Anchal Sharma,1:52,"Not sure on the Thunderbird problem, but I think. Let's say a scenario where. You are accessing like you are hitting calls. You are basically hitting the APIs of another service and that service is basically down, not down. Actually it is not that responsive for the time. So instead of giving the response within one second, it is taking three or four seconds. But in your service, which is the caller service, you have configured the. Timeout to be two second, let's say. Now you have hit one request to service S2. It has not responded within time. It took three seconds, but you have. You will get the timeout error in your API, so you will try retry again because you have configured retry also so you will retry and now there is a second call from your your service to service S2. So, but the service S2 has send the response for the call one. To service S1, but like after a delay of one second. So based on this like you will be getting more APIs, you will be retrying more which will make which will which will result in more traffic in service 2. So the service to which was already like slow in response now because it has more traffic on its side, it will be more slow. So which can result in the downtime of that service or it can crash? So that is what Thunder problem is basically."
4,Sanu Ghosh,3:39,You're on that next fence. So are you comfortable with Kafka and microservice?
5,Anchal Sharma,3:49,"Yeah, I will. Don't got time."
6,Sanu Ghosh,3:50,OK. So any design pattern you have worked in microservice?
7,Anchal Sharma,3:59,"Yeah, one is cqrs pattern. And then we have one is a circuit breaker pattern which I already explained you. And. So first let me describe the cqrs pattern. So cqrs means command and query responsibility segregation. So what happens in in microservices you have lot of logic so you want to separate out your command. Queries from the from your normal command to from your normal queries. So let's say the database has. Your system will be having more queries as compared to like. Thread operation normal like write operations. You have more read operations so you can segregate on when you segregate the logic. For command query you will also segregate the databases for it. So yeah, that is basically a secure segregation and. Socket breaker, I already explained. Then we have API gateway. So API gateway means that you have a single point of entry for all your services, and the single point of entry which acts as an API gateway can handle your. Basically, it can simplify a request. It can handle authentication and authorization, and it can also handle the load balancing. So that is API gateway and. Then we have database per service which states that in microservices design. For every every microservice, every service should be having its own dedicated database. Not. Not No2 services should. It should be sharing in any database. Yeah. So. Let's."
8,Sanu Ghosh,6:07,I'm from my end. Thank you for your time.
9,Anchal Sharma,6:10,Yeah. Thank you so much. Thank you.
