In [None]:
!pip install git+https://github.com/openai/whisper.git

In [None]:
!pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git

In [None]:
# System-level dependencies. -y answers apt's confirmation prompt up front,
# since a notebook cell may not be able to respond to it interactively.
!sudo apt update
!sudo apt install -y ffmpeg
%pip install setuptools-rust

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pyannote.audio

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pydub

In [15]:
from pydub import AudioSegment
import csv
import whisper
import requests
from pyannote.audio import Pipeline
import librosa
import soundfile as sf
import os
import pandas as pd

In [7]:
class mp3_2_wav():
  """Convert an MP3 file to a WAV file stored next to it.

  Parameters
  ----------
  name : str
      Path of the MP3 file to convert.
  """

  def __init__(self,name):
    self.name = name

  def convert(self):
    """Decode the MP3 and export it as WAV.

    The output path is the input path with its final extension replaced by
    ``.wav``.

    Returns
    -------
    str
        Path of the WAV file that was written.
    """
    audio = AudioSegment.from_mp3(self.name)
    # splitext strips only the final extension, so dots in directory names
    # or elsewhere in the file name are preserved (split('.')[0] cut the
    # path at the first dot).
    name = os.path.splitext(self.name)[0] + '.wav'
    audio.export(name, format='wav')
    return name

In [8]:
# Convert the source MP3 to WAV and keep the path of the resulting file.
mp3_converter = mp3_2_wav('/content/Y2Mate.mp3')
wav_file_name = mp3_converter.convert()

In [9]:
class Diarization():
  """Run pyannote speaker diarization on a WAV file and save the result as RTTM.

  Parameters
  ----------
  wav_file : str
      Path of the audio file to diarize.
  token : str
      Hugging Face access token passed to ``Pipeline.from_pretrained``.
  """

  def __init__(self,wav_file,token):
    self.token = token
    self.wav_file = wav_file

  def diarize(self):
    """Apply the pretrained pipeline and dump its output next to the audio file.

    Returns
    -------
    str
        Path of the RTTM file (the audio path with its final extension
        replaced by ``.rttm``).
    """
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                        use_auth_token=self.token)
    # apply the pipeline to an audio file
    diarization = pipeline(self.wav_file)
    # splitext strips only the final extension; split('.')[0] cut the path at
    # the first dot, which breaks for dotted directory or file names.
    name = os.path.splitext(self.wav_file)[0] + '.rttm'
    # dump the diarization output to disk using RTTM format
    with open(name, "w") as rttm:
      diarization.write_rttm(rttm)
    return name

In [10]:
# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of embedding it in the notebook. The token that was previously
# hard-coded here should be treated as leaked and revoked.
rttm_file_name = Diarization(wav_file_name, os.environ['HF_TOKEN']).diarize()

Downloading:   0%|          | 0.00/500 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/318 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [11]:
class rttm_2_df():
  """Convert a whitespace-delimited RTTM file into a CSV file.

  Parameters
  ----------
  rttm : str
      Path of the RTTM file to read.
  """

  def __init__(self,rttm):
    self.rttm = rttm

  def convert(self, output_path='output.csv'):
    """Write every non-empty RTTM line as one CSV row (no header row).

    Parameters
    ----------
    output_path : str, optional
        Destination CSV path. Defaults to ``'output.csv'`` in the current
        working directory.

    Returns
    -------
    str
        The path of the CSV file that was written.
    """
    # newline='' lets the csv module control line endings itself, which
    # avoids doubled line breaks on platforms that translate '\n'.
    with open(self.rttm, 'r') as input_file, \
         open(output_path, 'w', newline='') as output_file:
      writer = csv.writer(output_file)
      # Stream the file line by line instead of loading it all into memory.
      for line in input_file:
        fields = line.split()
        # Skip blank lines so they do not turn into empty CSV rows.
        if fields:
          writer.writerow(fields)
    return output_path

In [12]:
# Turn the RTTM diarization output into a CSV file and keep its path.
rttm_converter = rttm_2_df(rttm_file_name)
output_csv = rttm_converter.convert()

In [20]:
class Extractor():
  """Cut one audio clip per diarization segment out of a WAV file.

  Parameters
  ----------
  csv_name : str
      Path of the header-less diarization CSV. Column 3 is read as the
      segment start and column 4 as its duration.
  wav_file_name : str
      Path of the audio file the segments refer to.
  """

  def __init__(self,csv_name,wav_file_name):
    self.csv_name = csv_name
    self.wav_file_name = wav_file_name

  def extract(self):
    """Write ``audio_files/<i>.wav`` for the i-th row of the CSV.

    Returns
    -------
    str
        The output directory name, ``'audio_files'``.
    """
    spkrs = pd.read_csv(self.csv_name,header=None)
    out_dir = 'audio_files'
    # Create the same relative directory the clips are written to. The
    # previous code created '/content/audio_files' but wrote to the relative
    # 'audio_files/', which only matched when the working directory was
    # /content, and it hid every mkdir failure behind a bare except.
    os.makedirs(out_dir, exist_ok=True)
    for i, (start, duration) in enumerate(zip(spkrs[3], spkrs[4])):
      y, sr = librosa.load(self.wav_file_name,
                           offset=float(start),
                           duration=float(duration))
      sf.write(os.path.join(out_dir, str(i) + '.wav'), y, sr)
    return out_dir

In [21]:
# Slice the WAV into one clip per diarization row; keeps the clip directory name.
segment_extractor = Extractor(output_csv, wav_file_name)
audio_files = segment_extractor.extract()

In [22]:
class user_wise_text():
  """Transcribe each extracted clip with Whisper and pair it with its speaker.

  Parameters
  ----------
  audio_files : str
      Directory containing the clips named ``0.wav``, ``1.wav``, ...
  diarization_csv : str
      Path of the header-less diarization CSV; column 7 is used as the
      speaker label of each row.
  """

  def __init__(self,audio_files,diarization_csv):
    self.audio_files = audio_files
    self.diarization_csv = diarization_csv

  def transform(self):
    """Transcribe clip ``i`` for every CSV row ``i`` and save the result.

    Returns
    -------
    str
        ``'spkrwise_text.csv'``, a CSV with ``text`` and ``speaker`` columns.
    """
    mean = pd.read_csv(self.diarization_csv,header=None)
    model = whisper.load_model("base")
    text=[]
    # Iterate over the CSV rows rather than os.listdir(): a stray file in the
    # clip directory would otherwise make `text` and the speaker column
    # differ in length.
    for i in range(len(mean)):
      # Use the directory given to this instance; the previous code read the
      # notebook-level global `audio_files` instead.
      clip_path = os.path.join(self.audio_files, str(i) + ".wav")
      result = model.transcribe(clip_path)
      text.append(result['text'])

    spkrwise_text = pd.DataFrame({'text':text,'speaker':mean[7]})
    spkrwise_text.to_csv('spkrwise_text.csv',index=False)
    return 'spkrwise_text.csv'

In [23]:
# Transcribe every clip and keep the path of the speaker-wise transcript CSV.
transcriber = user_wise_text(audio_files, output_csv)
spkr_txt = transcriber.transform()

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 139MiB/s]


In [26]:
class Overall_Summary():
  """Summarise a speaker-labelled transcript via the Hugging Face Inference API.

  Parameters
  ----------
  spkrwise_df : str
      Path of a CSV with ``text`` and ``speaker`` columns.
  token : str
      Hugging Face API token sent as a Bearer token.
  """

  API_URL = "https://api-inference.huggingface.co/models/knkarthick/MEETING_SUMMARY"
  # Seconds to wait for the API before giving up instead of hanging forever.
  REQUEST_TIMEOUT = 120

  def __init__(self,spkrwise_df,token):
    self.spkrwise_df = spkrwise_df
    self.token = token

  def _fill(self,a,b):
    """Return one transcript line: speaker ``b``, a colon, then text ``a``."""
    return b + ':' + a

  def _query(self,payload):
    """POST ``payload`` as JSON to the model endpoint and return the decoded JSON body."""
    headers = {"Authorization": f"Bearer {self.token}"}
    response = requests.post(self.API_URL, headers=headers, json=payload,
                             timeout=self.REQUEST_TIMEOUT)
    return response.json()

  def transform(self):
    """Build the transcript, send it to the model and return the summary text.

    Returns
    -------
    str
        The ``summary_text`` field of the first item in the API response.

    Raises
    ------
    RuntimeError
        If the API returns an error payload or a response without the
        expected ``[{'summary_text': ...}]`` shape.
    """
    df = pd.read_csv(self.spkrwise_df)
    # Assign the filled column back instead of a chained inplace fillna, and
    # coerce to str so empty or non-string cells cannot break the concatenation.
    texts = df['text'].fillna("").astype(str)
    speakers = df['speaker'].fillna("").astype(str)
    # One "speaker:text" line per row, each terminated by a newline.
    result = ''.join(self._fill(t, s) + '\n' for t, s in zip(texts, speakers))

    output = self._query({
      "inputs": result
    })
    # Surface API error payloads directly instead of failing with an opaque
    # KeyError when indexing the response.
    if isinstance(output, dict) and 'error' in output:
      raise RuntimeError(f"Summarisation request failed: {output['error']}")
    try:
      return output[0]['summary_text']
    except (KeyError, IndexError, TypeError) as exc:
      raise RuntimeError(f"Unexpected summarisation response: {output!r}") from exc

In [27]:
# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of embedding it in the notebook. The token that was previously
# hard-coded here should be treated as leaked and revoked.
summary = Overall_Summary(spkr_txt, os.environ['HF_TOKEN']).transform()

In [28]:
# Show the generated summary as the cell's output.
summary

'Kids in the early elementary school years are getting too much homework. For example, children in the first grade have to do up to three times the workload recommended by education experts. Some parents want more homework than others. Speaker_03 is looking forward to spending more time with his kids'