In [2]:
!pip install umap

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25l[?25hdone
  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3542 sha256=e44126041868ddeecbfc3e927827e46c2b163c8fd17a325fa7485c47d3a9803f
  Stored in directory: /root/.cache/pip/wheels/15/f1/28/53dcf7a309118ed35d810a5f9cb995217800f3f269ab5771cb
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [3]:
!pip install transformers --upgrade



In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in yo

In [5]:
import html
import os
import re

import cv2
import librosa
import librosa.display
import nltk
import numpy as np
import pandas as pd
import umap
from googleapiclient.discovery import build
from nltk.corpus import stopwords
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# YouTube Data API configuration.
# The API key is read from the YOUTUBE_API_KEY environment variable so that the
# credential is never stored in (or shared with) the notebook itself. Any key that
# was previously pasted into this cell should be treated as leaked and rotated.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')
if not API_KEY:
    print('WARNING: YOUTUBE_API_KEY is not set; YouTube API requests will fail.')
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

# Download the NLTK stopword corpus, then keep the English list as a set so the
# membership test in preprocess_text is a hash lookup.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Step 1: Fetch Video Data from YouTube API
def get_video_data(query, max_results):
    """Search YouTube for videos matching ``query`` and return their basic metadata.

    Only a single page of results is requested; no pagination is performed.

    Args:
        query: Free-text search string, sent to the API as ``q``.
        max_results: Sent to the API as ``maxResults``.

    Returns:
        pandas.DataFrame with the columns ``video_id``, ``title`` and
        ``description`` (one row per returned item; empty when nothing matched).
    """
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)
    request = youtube.search().list(q=query, part='id,snippet', type='video', maxResults=max_results)
    response = request.execute()

    rows = []
    # A response without an 'items' key is treated as "no results" instead of a KeyError.
    for item in response.get('items', []):
        snippet = item['snippet']
        rows.append((
            item['id']['videoId'],
            # Search results arrive HTML-escaped (e.g. "&amp;"); decode the entities
            # so they do not leak into displayed titles or the text features.
            html.unescape(snippet['title']),
            html.unescape(snippet['description']),
        ))

    return pd.DataFrame(rows, columns=['video_id', 'title', 'description'])

# Fetch up to 10 search results for the query; every later step works on this frame.
df_videos = get_video_data('educational videos', 10)

# Step 2: Text Preprocessing
def preprocess_text(text):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, every non-word character is turned into a space,
    whitespace runs are collapsed, and tokens found in the module-level
    ``stop_words`` set are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'\W', ' ', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    kept_tokens = (token for token in single_spaced.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Store the cleaned text in a new column; the original 'description' column is kept as-is.
df_videos['cleaned_description'] = df_videos['description'].apply(preprocess_text)

# Step 3: Extract Text Features
def extract_text_features(text_data):
    """Vectorise documents with TF-IDF.

    The vectoriser uses 1- to 3-grams, requires a term to occur in at least two
    documents (``min_df=2``) and keeps at most 1500 features.

    Args:
        text_data: Iterable of (already cleaned) document strings.

    Returns:
        The TF-IDF matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    tfidf = TfidfVectorizer(
        min_df=2,
        ngram_range=(1, 3),
        max_features=1500,
    )
    return tfidf.fit_transform(text_data)

# TF-IDF matrix with one row per video, built from the cleaned descriptions.
text_features = extract_text_features(df_videos['cleaned_description'])

# Step 4: Extract Audio Features (Mel-spectrogram)
def extract_audio_features(audio_file):
    """Summarise an audio file as its time-averaged mel-spectrogram in dB.

    Args:
        audio_file: Path of the audio file; at most the first 60 seconds are loaded.

    Returns:
        The dB-scaled mel-spectrogram averaged along axis 1.
    """
    waveform, sample_rate = librosa.load(audio_file, duration=60)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate)
    # Express power relative to the spectrogram's own maximum.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return mel_db.mean(axis=1)

# NOTE(review): relies on 'education_audio.mp3' being present in the working directory.
# A single audio file is summarised here, independent of the videos in df_videos.
audio_features = extract_audio_features('education_audio.mp3')

# Step 5: Extract Video Frames (Optional Visual Features)
def extract_video_frames(video_file, frame_count=5):
    """Sample frames at evenly spaced positions of a video file.

    Args:
        video_file: Path of the video to read.
        frame_count: Number of sampling positions. Fewer frames are returned when
            the video cannot be opened, a read fails, or the video is so short
            that several positions would map to the same frame.

    Returns:
        List of frames as returned by ``cv2.VideoCapture.read`` (possibly empty).
    """
    if frame_count <= 0:
        return []

    frames = []
    cap = cv2.VideoCapture(video_file)
    try:
        if not cap.isOpened():
            return frames

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp so an unknown/negative frame count never yields negative seeks.
        step = max(total_frames // frame_count, 0)
        # De-duplicate: when step is 0 every position is frame 0, which would
        # otherwise be appended frame_count times.
        positions = sorted({step * i for i in range(frame_count)})

        for position in positions:
            # Seek by frame index (named constant instead of the raw property id 1).
            cap.set(cv2.CAP_PROP_POS_FRAMES, position)
            ret, frame = cap.read()
            if ret:
                frames.append(frame)
    finally:
        # Always free the capture handle, even if seeking/reading raised.
        cap.release()
    return frames

# NOTE(review): relies on 'education_video.mp4' being present in the working directory.
# The sampled frames are not referenced by any later step visible in this notebook.
video_frames = extract_video_frames('education_video.mp4')

# Step 6: Combine Text, Audio Features
# Append the audio vector (as one row, repeated once per video) to the dense TF-IDF matrix.
# NOTE(review): the appended audio columns are identical for every row, so they
# cannot help to tell the videos apart - confirm whether per-video audio was intended.
combined_features = np.hstack([
    text_features.toarray(),
    np.tile(audio_features.reshape(1, -1), (text_features.shape[0], 1)),
])

# Step 7: standardise every feature column
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_features)

# Step 8: PCA keeping enough components to explain 95% of the variance
pca = PCA(random_state=0, n_components=0.95)
reduced_features = pca.fit_transform(scaled_features)

# Step 9: 2-D t-SNE embedding of the PCA output (PCA initialisation, perplexity 5)
tsne = TSNE(init='pca', perplexity=5, random_state=0, n_components=2)
tsne_features = tsne.fit_transform(reduced_features)

# Step 10: KMeans Clustering
def cluster_videos(features, num_clusters=2):
    """Cluster ``features`` with KMeans.

    Args:
        features: Feature matrix, one row per sample.
        num_clusters: Number of clusters to fit.

    Returns:
        The cluster label of every row, as returned by ``KMeans.fit_predict``.
    """
    # Fixed seed and 20 initialisations, as in the rest of the notebook.
    model = KMeans(n_init=20, random_state=0, n_clusters=num_clusters)
    return model.fit_predict(features)

# Step 11: DBSCAN Clustering
def cluster_with_dbscan(features, eps=0.3, min_samples=2):
    """Cluster ``features`` with DBSCAN.

    Args:
        features: Feature matrix, one row per sample.
        eps: Forwarded to ``DBSCAN`` as ``eps``.
        min_samples: Forwarded to ``DBSCAN`` as ``min_samples``.

    Returns:
        The label of every row, as returned by ``DBSCAN.fit_predict``.
    """
    model = DBSCAN(min_samples=min_samples, eps=eps)
    return model.fit_predict(features)

# Step 12: Silhouette Score Evaluation
def evaluate_clustering(features, labels):
    """Return the silhouette score of a labelling, or -1 when it is undefined.

    ``silhouette_score`` only accepts between 2 and ``n_samples - 1`` distinct
    labels. Both bounds are checked here so that a degenerate labelling (a single
    cluster, or every sample in its own cluster) yields -1 instead of raising.

    Args:
        features: Feature matrix, one row per sample.
        labels: Cluster label of every sample (list, array or pandas Series).

    Returns:
        The silhouette score, or -1 if the number of distinct labels is out of range.
    """
    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    if 2 <= n_labels <= n_samples - 1:
        return silhouette_score(features, labels)
    return -1

# Step 13: Finding Optimal Number of Clusters (K)
def find_best_k(features, min_k=2, max_k=10):
    """Pick the KMeans cluster count with the highest silhouette score.

    Every k from ``min_k`` to ``max_k`` (inclusive) is tried. The range is clipped
    to 2 .. n_samples - 1, the values for which the silhouette score is defined,
    so small inputs no longer raise.

    Args:
        features: Feature matrix, one row per sample.
        min_k: Smallest number of clusters to try.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        Tuple ``(best_k, best_score)``. When no k in the clipped range can be
        evaluated, ``(min_k, -1)`` is returned.
    """
    n_samples = np.shape(features)[0]
    lowest_k = max(min_k, 2)
    highest_k = min(max_k, n_samples - 1)

    best_k = min_k
    best_score = -1
    # +1 so that max_k itself is evaluated.
    for k in range(lowest_k, highest_k + 1):
        clusters = KMeans(n_clusters=k, random_state=0, n_init=20).fit_predict(features)
        score = silhouette_score(features, clusters)
        if score > best_score:
            best_k = k
            best_score = score
    return best_k, best_score

# Finding the best K
# Search for the cluster count on the 2-D t-SNE embedding.
best_k, best_score = find_best_k(tsne_features, min_k=2, max_k=5)
print(f'Best K: {best_k}, Best Silhouette Score: {best_score}')

# Final clustering with KMeans, using the selected number of clusters
kmeans_clusters = cluster_videos(tsne_features, num_clusters=best_k)

# Optional: run DBSCAN and report its silhouette score.
# NOTE(review): DBSCAN marks noise points with the label -1, so a "single cluster"
# result may in fact mean that every point was classified as noise - confirm eps.
dbscan_clusters = cluster_with_dbscan(tsne_features, eps=0.3, min_samples=2)
if len(np.unique(dbscan_clusters)) > 1:
    dbscan_silhouette = evaluate_clustering(tsne_features, dbscan_clusters)
    print(f'DBSCAN Silhouette Score: {dbscan_silhouette}')
else:
    print("DBSCAN resulted in a single cluster, silhouette score cannot be calculated.")

# Optional: agglomerative clustering with the same number of clusters, for comparison
agg_clustering = AgglomerativeClustering(n_clusters=best_k)
agg_clusters = agg_clustering.fit_predict(tsne_features)
agg_silhouette = evaluate_clustering(tsne_features, agg_clusters)
print(f'Agglomerative Clustering Silhouette Score: {agg_silhouette}')

# Attach the KMeans labels to the DataFrame and show them next to the titles
df_videos['cluster'] = kmeans_clusters
print(df_videos[['title', 'cluster']])

# Silhouette score of the KMeans labels stored in the DataFrame. These labels come
# from the same KMeans settings that find_best_k scored for best_k.
final_silhouette = evaluate_clustering(tsne_features, df_videos['cluster'])
print(f'Final Silhouette Score: {final_silhouette}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Best K: 3, Best Silhouette Score: 0.38857606053352356
DBSCAN resulted in a single cluster, silhouette score cannot be calculated.
Agglomerative Clustering Silhouette Score: 0.38857606053352356
                                               title  cluster
0  The Best of Toddler Fun Learning | Learning Vi...        1
1  Best Learning Videos For Kids | The Dr.Binocs ...        2
2  Blippi Learns Colors &amp; Letters For Kids Wi...        2
3  What is ACID RAIN? | Acid Rain | Dr Binocs Sho...        1
4  Blippi Learns the 5 Senses at a Play Place | B...        0
5  Kindergarten Learning Videos | Phonics for Kid...        1
6  Best Learning Videos for Toddlers | Learning C...        2
7  Learning Collection by Brain Candy TV |Vol 1| ...        0
8  Best Duck Song collection | 7 Little Ducks, AB...        0
9  Science Experiments for Kids With Meekah | Edu...        1
Final Silhouette Score: 0.38857606053352356


In [6]:
!pip install gradio transformers

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.114.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

In [7]:
import gradio as gr
import numpy as np
import cv2
import librosa
import tempfile
from moviepy.editor import VideoFileClip
from transformers import pipeline

# Load the text-classification pipeline used by classify_video.
# NOTE(review): the model id below names an SST-2 fine-tuned DistilBERT checkpoint,
# not an education/entertainment classifier - replace it with the intended model.
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

def extract_audio_from_video(video_file):
    """Write the audio track of a video to a temporary WAV file.

    Args:
        video_file: Either a path string or an object exposing the path as
            ``.name`` (such as an uploaded-file object).

    Returns:
        Path of the WAV file. The file is created with ``delete=False``; the
        caller is responsible for removing it.

    Raises:
        ValueError: If the video has no audio track.
    """
    import os  # local import: only needed to clean up the temp file on failure

    # Accept both plain paths and file-like objects that carry the path in `.name`.
    video_path = getattr(video_file, "name", video_file)

    # Reserve a unique .wav path; the handle is closed immediately so the audio
    # writer can open the path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        audio_file_path = temp_audio_file.name

    video_clip = None
    audio_clip = None
    try:
        video_clip = VideoFileClip(video_path)
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError("The video has no audio track to extract.")
        audio_clip.write_audiofile(audio_file_path)
    except Exception:
        # Do not leave an empty/partial temp file behind when extraction fails.
        if os.path.exists(audio_file_path):
            os.remove(audio_file_path)
        raise
    finally:
        # Release the clip resources on success and on failure alike.
        if video_clip is not None:
            video_clip.close()
        if audio_clip is not None:
            audio_clip.close()
    return audio_file_path

def extract_audio_features(audio_file):
    """Summarise an audio file as its time-averaged mel-spectrogram in dB.

    A function with the same name and logic is also defined in an earlier cell;
    this definition replaces it once the cell is run.

    Args:
        audio_file: Path of the audio file; at most the first 60 seconds are loaded.

    Returns:
        The dB-scaled mel-spectrogram averaged along axis 1.
    """
    waveform, sample_rate = librosa.load(audio_file, duration=60)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate)
    # Express power relative to the spectrogram's own maximum.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return mel_db.mean(axis=1)

def classify_video(video_file):
    """Return an education/entertainment percentage split for an uploaded video.

    NOTE(review): this is still a placeholder. The audio features are extracted
    but not fed to the model; the classifier is run on a fixed sample sentence.

    Args:
        video_file: Uploaded video, forwarded to ``extract_audio_from_video``.

    Returns:
        A two-line string: ``Education: xx.xx%`` and ``Entertainment: xx.xx%``.
    """
    import os  # local import: only needed to delete the temporary audio file

    audio_file = extract_audio_from_video(video_file)
    try:
        # Not consumed yet (see NOTE above); kept so undecodable audio still fails here.
        audio_features = extract_audio_features(audio_file)
    finally:
        # The extracted WAV is a delete=False temp file; remove it so repeated
        # requests do not pile up files on disk.
        if os.path.exists(audio_file):
            os.remove(audio_file)

    # For this example, we'll assume the model expects a text input
    # Normally you need a model that can handle audio features directly
    text_input = "This is a sample text"  # Placeholder text input
    result = classifier(text_input)

    top_prediction = result[0]
    label = top_prediction['label']
    score = top_prediction['score']

    # The class with id 0 is reported as "LABEL_0" by models without label names
    # and as "NEGATIVE" by the SST-2 checkpoint loaded above; accept both so this
    # branch is reachable. NOTE(review): the mapping of that class to "education"
    # is a placeholder carried over from the original code.
    if label in ("LABEL_0", "NEGATIVE"):
        education_percentage = score * 100
        entertainment_percentage = 100 - education_percentage
    else:
        entertainment_percentage = score * 100
        education_percentage = 100 - entertainment_percentage

    return f"Education: {education_percentage:.2f}%\nEntertainment: {entertainment_percentage:.2f}%"

# Minimal UI: a file upload as input, the formatted percentages as text output.
interface = gr.Interface(fn=classify_video, inputs="file", outputs="text")
# NOTE(review): share=True publishes the app on a public *.gradio.live URL;
# drop it if the demo should only be reachable locally.
interface.launch(share=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0501ff44052fb8d746.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


