# Data Ingestion #

In [None]:
from google.colab import files
import mimetypes

file_path = '/content/Radhesh_CNN_ImageClassificationProject.mp4'

mime_type, _ = mimetypes.guess_type(file_path)

print(f"Selected file: {file_path}")
print(f"MIME type: {mime_type or 'Unknown'}")

Selected file: /content/Radhesh_CNN_ImageClassificationProject.mp4
MIME type: video/mp4


In [None]:
!pip install openai moviepy

Collecting openai
  Downloading openai-1.45.0-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.45.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.1/374.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━

# Transcription

In [None]:
from openai import OpenAI
from moviepy.editor import VideoFileClip

client = OpenAI(api_key=OPEN_AI_KEY)
if "video" in mime_type or "mp3" not in mime_type:
  video = VideoFileClip(file_path)
  audio = video.audio
  audio.write_audiofile("output_audio.mp3")

MoviePy - Writing audio in output_audio.mp3


                                                                       

MoviePy - Done.




In [None]:
audio_file= open("output_audio.mp3", "rb")
transcript = client.audio.transcriptions.create(
  model="whisper-1",
  file=audio_file,
  prompt="The audio is from a panel discussion which may have multiple speakers and multiple topics"
)

print(transcript.text)

# Topic Extraction

In [7]:
import nltk
from nltk.stem import *
nltk.download('punkt') # For Stemming
nltk.download('wordnet') # For Lemmatization
nltk.download('stopwords') # For Stopword Removal
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [8]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [9]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
fetch20newsgroups = fetch_20newsgroups(subset='train')
# Store in a pandas dataframe
df = pd.DataFrame(fetch20newsgroups.data, columns=['text'])

In [11]:
def text_preprocessing(df):
    corpus=[]
    lem = WordNetLemmatizer() # For Lemmatization
    for news in df['text']:
        words=[w for w in nltk.tokenize.word_tokenize(news) if (w not in stopwords)]
        words=[lem.lemmatize(w) for w in words if len(w)>2]
        corpus.append(words)
    return corpus

corpus = text_preprocessing(df)

In [None]:
!pip install -U gensim==3.8.3

In [13]:
import gensim
# Transform to gensim dictionary
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = [dic.doc2bow(doc) for doc in corpus]
import pickle # Useful for storing big datasets
pickle.dump(bow_corpus, open('corpus.pkl', 'wb'))
dic.save('dictionary.gensim')

In [19]:
#LDA model
lda_model2 = gensim.models.LdaMulticore(bow_corpus,
                                    num_topics = 2,
                                     id2word = dic,
                                        passes = 8,
                                       workers = 3)
lda_model2.save('model2.gensim')
# We print words occuring in each of the topics as we iterate through them
for idx, topic in lda_model2.print_topics(num_words=20):
  print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.008*"The" + 0.008*"n't" + 0.006*"From" + 0.006*"Subject" + 0.006*"Lines" + 0.006*"Organization" + 0.005*"would" + 0.004*"one" + 0.004*"writes" + 0.004*"..." + 0.004*"article" + 0.003*"people" + 0.003*"like" + 0.003*"know" + 0.003*"University" + 0.003*"get" + 0.002*"think" + 0.002*"time" + 0.002*"This" + 0.002*"use"
Topic: 1 
Words: 0.189*"'AX" + 0.014*"MAX" + 0.004*"Q,3" + 0.003*"B8F" + 0.003*"A86" + 0.003*"145" + 0.002*"1D9" + 0.001*"2DI" + 0.001*"BHJ" + 0.001*"PL+" + 0.001*"GIZ" + 0.001*"From" + 0.001*"0T-" + 0.001*"Subject" + 0.001*"Organization" + 0.001*"Lines" + 0.001*"/3T" + 0.001*"7EY" + 0.001*"6UM" + 0.001*"output"


In [21]:
from gensim.models import CoherenceModel
# instantiate topic coherence model
cm = CoherenceModel(model=lda_model2, corpus=bow_corpus, texts=corpus, coherence='c_v')
# get topic coherence score
coherence_lda = cm.get_coherence()
print(coherence_lda)

0.5623304946981044


# LLM Integration

In [40]:
def summarize(context,input):

        prompt = f"""As a professional summarizer, create a concise and comprehensive summary of the provided text, which can be a panel discussion or a customer-agent conversation while adhering to these guidelines: \
        * Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness. \
        * If there are multiple speakers, count the number of speakers. \
        * Count the number of words spoken and mention any key words with their count as well. \
        * Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects. \
        * IMPORTANT: "If the given context is irrelevant to the answer, generate an answer based on the question given and kindly mention that this answer is not a part of the given transcript". Otherwise, rely strictly on the provided context, without including external information. \
        * Format the summary in paragraph form for easy understanding.  \
        * Give a descriptive title \
        * Use bullet points if and only if there are any procedures or step by step information in the given passage. \
        context: {context}
        input: {input}
        """

        return prompt

## LLM Call

In [26]:
!pip install langchain google-generativeai

Collecting langchain
  Downloading langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_core-0.3.0-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.120-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.0->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310-ma

In [28]:
!pip install langchain_google_genai

Collecting langchain_google_genai
  Downloading langchain_google_genai-2.0.0-py3-none-any.whl.metadata (3.9 kB)
Downloading langchain_google_genai-2.0.0-py3-none-any.whl (39 kB)
Installing collected packages: langchain_google_genai
Successfully installed langchain_google_genai-2.0.0


In [69]:
file1 = open("transcript.txt","r")
transcript = "".join(file1.readlines())
print(transcript)

First Speaker: To tell you basically what this is about is when I was watching Harvey Mackay at one of Harv Eker's things, he said he just finished the Boston marathon and you know, the guy is 76 and I went holy crap, you know, that is amazing. He looked so fit and he is so quick minded and so on I thought, all of a sudden it occurred to me I bet the way you eat, you know, is different. I bet you don't just eat a bunch of garbage and that started this thought. So, the basic three questions will be and I am recording it for you as well if I transcribe these for the book, but then I write about it and what has really been neat about it is that what started out as three same questions to everybody, everybody had kind of a different angle on it and I realized that they were creating the chapters for this book and of course Marci Shimoff read me right [???], I am not doing something where I did all the work and you are just transcribing it, but if you actually write in the book, I will do i

In [80]:
import google.generativeai as genai
import google.ai.generativelanguage as glm
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_google_genai import ChatGoogleGenerativeAI

API_KEY = gemini_API
input="Create a summary of the entire discussion"
template = summarize(context="{context}",input="{input}")
llm = ChatGoogleGenerativeAI(model="gemini-pro",google_api_key = API_KEY)
prompt = PromptTemplate.from_template(template)
chain = prompt | llm
response = chain.invoke({"context":transcript,"input":input})
print(response.content)

**Summary: Nutrition and Success**

**Number of Speakers:** 2

**Number of Words Spoken:** 325

**Key Words:**
* Success: 4
* Food: 4
* Energy: 2
* Rolodex: 1

**Main Ideas:**

* The discussion begins with a reference to Harvey Mackay's fitness and quick-mindedness, leading to the hypothesis that his diet may contribute to his success.
* The three questions to be discussed include:
    * Does food affect one's ability to succeed?
    * If so, how does it play into their level of success?
    * Can people maintain high energy levels while consuming unhealthy food?

**Essential Information:**

* The first speaker introduces the second speaker, Rick Frishman, and mentions his extensive network ("the biggest Rolodex").
* Frishman's website, rickfrishman.com, is recommended for general information.
* Frishman agrees with the hypothesis that food can impact success, particularly in terms of energy levels for public speaking and hectic schedules.
* While some successful people may consume unh