####Kidoos Tales Ai

###1.Explore Or Exploit Model





In [2]:
!pip install pandas numpy tqdm pyarrow



In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta


####**Generate Demo Data**

######**Note:** This notebook uses **demo data** for illustration purposes.  
In the project , the model is fed with **live production data** to make predictions.



######I created a table called video_log that follows: {
    "event_type": "string",
    "user_id": "string",
    "video_id": "string",
    "watch_time": "int",
    "completion_rate": "int",
    "like": "int",
    "comment": "int",
    "share": "int",
    "session_position": "string",
    "action": "string",
    "created_at": "string"
}


In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta

NUM_ROWS = 1_000_000
BATCH_SIZE = 100_000

NUM_USERS = 100_000
NUM_VIDEOS = 50_000

np.random.seed(42)
START_DATE = datetime(2025, 1, 1)

users = [f"U{i}" for i in range(NUM_USERS)]
videos = [f"V{i}" for i in range(NUM_VIDEOS)]

user_engagement = np.clip(np.random.normal(0.6, 0.15, NUM_USERS), 0.2, 0.95)
video_quality = np.clip(np.random.beta(2, 5, NUM_VIDEOS), 0.1, 1.0)

def generate_batch(batch_size, start_time):
    rows = []
    current_time = start_time

    for _ in range(batch_size):
        u_idx = np.random.randint(NUM_USERS)
        v_idx = np.random.randint(NUM_VIDEOS)

        action = np.random.choice([0, 1], p=[0.35, 0.65])  # SARSA

        base_eng = user_engagement[u_idx]
        quality = video_quality[v_idx]

        session_position = np.random.randint(1, 15)

        watch_bias = 0.4 if action == 0 else 0.7
        watch_ratio = np.clip(
            np.random.normal(watch_bias * base_eng * quality, 0.15),
            0,
            1
        )

        rows.append({
            "event_type": "VIDEO_WATCH",
            "user_id": users[u_idx],
            "video_id": videos[v_idx],
            "watch_time": int(watch_ratio * np.random.randint(60, 300)),
            "completion_rate": round(watch_ratio, 2),
            "like": watch_ratio > 0.7 and np.random.rand() < 0.7,
            "comment": watch_ratio > 0.6 and np.random.rand() < 0.4,
            "share": watch_ratio > 0.8 and np.random.rand() < 0.3,
            "skipped": watch_ratio < 0.2,
            "session_position": session_position,
            "action": action,
            "created_at": current_time
        })

        current_time += timedelta(seconds=np.random.randint(5, 40))

    return pd.DataFrame(rows), current_time


In [5]:
current_time = START_DATE
dfs = []

for _ in tqdm(range(NUM_ROWS // BATCH_SIZE)):
    df_batch, current_time = generate_batch(BATCH_SIZE, current_time)
    dfs.append(df_batch)

df = pd.concat(dfs, ignore_index=True)

df.to_parquet("video_event_logs_1M.parquet", index=False)

print(df.shape)
df.head()


100%|██████████| 10/10 [01:17<00:00,  7.75s/it]


(1000000, 12)


Unnamed: 0,event_type,user_id,video_id,watch_time,completion_rate,like,comment,share,skipped,session_position,action,created_at
0,VIDEO_WATCH,U10601,V22052,0,0.0,False,False,False,True,11,0,2025-01-01 00:00:00
1,VIDEO_WATCH,U56936,V25479,27,0.1,False,False,False,True,10,1,2025-01-01 00:00:20
2,VIDEO_WATCH,U99661,V9406,38,0.27,False,False,False,False,10,1,2025-01-01 00:00:54
3,VIDEO_WATCH,U11971,V49499,2,0.01,False,False,False,True,14,0,2025-01-01 00:01:31
4,VIDEO_WATCH,U57253,V689,28,0.11,False,False,False,True,13,1,2025-01-01 00:02:02


In [6]:
import pandas as pd
df = pd.read_parquet("video_event_logs_1M.parquet")

In [7]:
df["action"].value_counts(normalize=True)
df[["completion_rate", "watch_time"]].describe()
df[["like", "comment", "share", "skipped"]].mean()


Unnamed: 0,0
like,0.000234
comment,0.000754
share,1.2e-05
skipped,0.726423


#####I consider 20 states:
1.   Completion rate is from 0 to 9 (as a probability)
2.   Session position starts at login and increases with each video. I divide the  session position by 2 and cap it at 9

In [8]:
NUM_STATES = 19
NUM_ACTIONS = 2

Q = np.zeros((NUM_STATES, NUM_ACTIONS))
alpha = 0.1
gamma = 0.9
epsilon = 1.0
EPS_DECAY = 0.995
EPS_MIN = 0.05

def discretize_state(row):
    comp_bin = int(row.completion_rate * 10)
    pos_bin = min(row.session_position // 2, 9)
    return comp_bin + pos_bin


In [9]:
EXPLORE = 0   # try new / risky video
EXPLOIT = 1   # safe / high quality video

In [10]:
def simulate_user_response(action, row):
    if action == EXPLOIT:
        completion = np.clip(np.random.normal(0.75, 0.1), 0, 1)
        like = completion > 0.6
        comment = completion > 0.8
        share = completion > 0.85
        skipped = completion < 0.2
    else:  # EXPLORE
        completion = np.clip(np.random.normal(0.45, 0.25), 0, 1)
        like = completion > 0.7
        comment = completion > 0.85
        share = completion > 0.9
        skipped = completion < 0.3

    watch_time = int(row.watch_time * completion)

    return {
        "completion": completion,
        "watch_time": watch_time,
        "like": like,
        "comment": comment,
        "share": share,
        "skipped": skipped
    }


In [11]:
def reward_from_sim(out):
    r = out["completion"]

    if out["like"]: r += 0.3
    if out["comment"]: r += 0.5
    if out["share"]: r += 0.8
    if out["skipped"]: r -= 0.7

    return np.clip(r, -1, 2)


In [12]:
def softmax_action(q_values, temperature=1.0):
    """
    Selects an action using Softmax (Boltzmann exploration)
    """
    q_values = np.array(q_values)

    # Avoid division by zero
    temperature = max(temperature, 1e-5)

    exp_q = np.exp(q_values / temperature)
    probs = exp_q / np.sum(exp_q)

    return np.random.choice(len(q_values), p=probs)


In [13]:
for epoch in range(5):
    total_reward = 0

    for i in range(len(df) - 1):
        row = df.iloc[i]

        s = discretize_state(row)

        # Softmax action selection
        a = softmax_action(Q[s], temperature=epsilon)

        # Environment response
        out = simulate_user_response(a, row)
        r = reward_from_sim(out)

        s_next = discretize_state(row)

        # Next action (SARSA)
        a_next = softmax_action(Q[s_next], temperature=epsilon)

        # SARSA update
        Q[s, a] += alpha * (
            r + gamma * Q[s_next, a_next] - Q[s, a]
        )

        total_reward += r

    # decay temperature
    epsilon = max(EPS_MIN, epsilon * EPS_DECAY)
    print(f"Epoch {epoch} | Reward {total_reward:.2f} | Temp={epsilon:.3f}")



Epoch 0 | Reward 1123352.27 | Temp=0.995
Epoch 1 | Reward 1042299.01 | Temp=0.990
Epoch 2 | Reward 986958.53 | Temp=0.985
Epoch 3 | Reward 980811.14 | Temp=0.980
Epoch 4 | Reward 979823.36 | Temp=0.975


###Recommand Engine with Cosine Similarity

In [14]:
!pip install sentence-transformers tensorflow --quiet

In [15]:
import tensorflow as tf
import numpy as np
from sentence_transformers import SentenceTransformer

In [20]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
videos = [
    {"video_id": 1, "prompt": "A brave Indian hero teaching honesty to kids"},
    {"video_id": 2, "prompt": "A space adventure story for children"},
    {"video_id": 3, "prompt": "Village story about kindness and sharing"},
    {"video_id": 4, "prompt": "Robot and AI learning story for kids"},
    {"video_id": 5, "prompt": "Historical story inspired by Mahatma Gandhi"}
]

In [22]:
def load_videos_from_db():
    texts = [v["prompt"] for v in videos]
    embeddings = sentence_model.encode(texts, convert_to_numpy=True)
    embeddings_tf = tf.convert_to_tensor(embeddings, dtype=tf.float32)
    videos_norm = tf.nn.l2_normalize(embeddings_tf, axis=1)
    return videos, videos_norm

In [23]:
def recommend_videos(user_query, videos_norm, top_n=3):
    query_emb = sentence_model.encode([user_query], convert_to_numpy=True)
    query_emb_tf = tf.convert_to_tensor(query_emb, dtype=tf.float32)
    query_norm = tf.nn.l2_normalize(query_emb_tf, axis=1)

    similarity = tf.matmul(videos_norm, query_norm, transpose_b=True)
    similarity = tf.squeeze(similarity, axis=1)

    top_indices = tf.argsort(similarity, direction="DESCENDING")[:top_n]

    return top_indices.numpy().tolist(), similarity.numpy().tolist()

In [24]:
user_query = "Indian freedom fighter story for kids with moral values"

videos_db, videos_norm = load_videos_from_db()
top_indices, similarity_scores = recommend_videos(user_query, videos_norm, top_n=3)

print("User Query:")
print(user_query)
print("\nTop Recommended Videos:\n")

for idx in top_indices:
    print(f"Video ID: {videos_db[idx]['video_id']}")
    print(f"Prompt : {videos_db[idx]['prompt']}")
    print(f"Score  : {round(similarity_scores[idx], 4)}")
    print("-" * 50)


User Query:
Indian freedom fighter story for kids with moral values

Top Recommended Videos:

Video ID: 1
Prompt : A brave Indian hero teaching honesty to kids
Score  : 0.7021
--------------------------------------------------
Video ID: 5
Prompt : Historical story inspired by Mahatma Gandhi
Score  : 0.4365
--------------------------------------------------
Video ID: 3
Prompt : Village story about kindness and sharing
Score  : 0.3589
--------------------------------------------------


###Toxic, Sexual, and Hate Speech Detection

In [29]:
!pip install scikit-learn pandas numpy joblib emoji nltk contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m 

In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import contractions
import re
import emoji
import nltk

In [28]:
# df = pd.read_csv("/content/train.csv", engine='python', on_bad_lines='skip')
# df

!pip install -q gdown
import gdown
import pandas as pd

file_id = "15nFuNlX-0kKMhBH8WyKmToVkDQPvJa0X"
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "train.csv", quiet=False)

df = pd.read_csv("train.csv", engine="python", on_bad_lines="skip")
df.head()


Downloading...
From: https://drive.google.com/uc?id=15nFuNlX-0kKMhBH8WyKmToVkDQPvJa0X
To: /content/train.csv
100%|██████████| 68.8M/68.8M [00:01<00:00, 54.9MB/s]


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [31]:
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate"
]


In [32]:
import nltk
nltk.download('stopwords')
stop_words=set(nltk.corpus.stopwords.words('english'))
vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    stop_words="english"
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [33]:
# Preprocess the script
def preprocess_script(text: str) -> str:
  text = text.lower()
  text = contractions.fix(text)
  text = re.sub(r'http\S+|www\S+', '', text)
  text = re.sub(r'<.*?>', '', text)
  text = emoji.replace_emoji(text, replace='')
  text = re.sub(r'[^a-z\s]', '', text)
  text = re.sub(r'\s+', ' ', text).strip()
  words = text.split()
  words = [w for w in words if w not in stop_words]
  return " ".join(words)


In [34]:
df_new=df
df_new["comment_text"]=df_new["comment_text"].apply(preprocess_script)
y = df[[
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate"
]]

In [35]:
X_vec = vectorizer.fit_transform(df_new['comment_text'])

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(
    LogisticRegression(
        solver="liblinear",
        max_iter=1000
    )
)

model.fit(X_vec, y)


In [37]:
#model check
def check_script(script: str) -> str:
  text=preprocess_script(script)
  X = vectorizer.transform([text])
  prediction = model.predict(X)
  prediction = prediction.ravel()
  result = [labels[i] for i, val in enumerate(prediction) if val == 1]
  return result

print("check 1",check_script("I love you"))
print("check 2",check_script("I hate you"))
print("check 3",check_script("I am a terrorist"))


check 1 []
check 2 ['toxic']
check 3 []


###Audio generation code using openai

###Prompt for story

In [47]:
def build_5_story_prompt(customer) -> str:
    return f"""
You are a STRICT JSON GENERATION ENGINE.
TASK: Generate EXACTLY 5 short stories for video narration.
AGE: Stories must be suitable for a {customer.age}-year-old child.

STORY REQUIREMENTS:
- ~2 minutes long when narrated
- Simple, engaging language
- Immersive and visual
- Naturally teaches the educational goal
- Child-safe content only

STORY DETAILS:
Hero name: {customer.hero_name}
Interests: {customer.interests}
World setting: {customer.world_setting}
Educational goal: {customer.educational_goals}

OUTPUT RULES (MANDATORY):
- Return ONLY valid JSON
- Do NOT include explanations
- Do NOT include markdown
- Do NOT include extra text
- Output MUST start with '[' and end with ']'
- Use double quotes for all strings

OUTPUT FORMAT (EXACT – DO NOT CHANGE):
[
  {{
    "title": "string",
    "story": "string",
    "genre": "string"
  }}
]
"""


In [48]:
class Customer:
    def __init__(self, age, hero_name, interests, world_setting, educational_goals):
        self.age = age
        self.hero_name = hero_name
        self.interests = interests
        self.world_setting = world_setting
        self.educational_goals = educational_goals


In [None]:
customer = Customer(
    age=7,
    hero_name="Aarav",
    interests=["space", "robots", "adventures"],
    world_setting="futuristic space city",
    educational_goals="curiosity and problem-solving"
)

prompt=build_5_story_prompt(customer)

print(prompt)



You are a STRICT JSON GENERATION ENGINE.
TASK: Generate EXACTLY 5 short stories for video narration.
AGE: Stories must be suitable for a 7-year-old child.

STORY REQUIREMENTS:
- ~2 minutes long when narrated
- Simple, engaging language
- Immersive and visual
- Naturally teaches the educational goal
- Child-safe content only

STORY DETAILS:
Hero name: Aarav
Interests: ['space', 'robots', 'adventures']
World setting: futuristic space city
Educational goal: curiosity and problem-solving

OUTPUT RULES (MANDATORY):
- Return ONLY valid JSON
- Do NOT include explanations
- Do NOT include markdown
- Do NOT include extra text
- Output MUST start with '[' and end with ']'
- Use double quotes for all strings

OUTPUT FORMAT (EXACT – DO NOT CHANGE):
[
  {
    "title": "string",
    "story": "string",
    "genre": "string"
  }
]



###Used to generate stories and save them to the database using the SARSA algorithm.

####To run this code need open ai key please add to run

In [65]:
!pip install -q openai gTTS moviepy


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
os.environ["OPENAI_API"] = "PASTE_YOUR__HERE"

In [None]:
from openai import OpenAI
from dataclasses import dataclass
from typing import List
import json
import os 
OPENAI_API_KEY = os.environ.get("OPENAI_API")
client = OpenAI(api_key=OPENAI_API_KEY)

In [61]:
def call_openai(prompt: str):
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # fast + cheap
        messages=[
            {"role": "system", "content": "You generate only valid JSON."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )
    return response.choices[0].message.content


In [63]:
import json
import re

def extract_json_array(text: str):
    """
    Extracts the LAST valid JSON array from text.
    """
    matches = re.findall(r"\[[\s\S]*?\]", text)
    if not matches:
        raise ValueError("No JSON array found")
    return json.loads(matches[-1])

In [64]:
stories=call_openai(prompt)
print(stories)
parsed_stories = extract_json_array(stories)
print(len(parsed_stories))
print(parsed_stories[0]["title"])

[
  {
    "title": "Aarav and the Missing Stars",
    "story": "In the futuristic city of Starville, Aarav gazed at the night sky filled with twinkling stars. One evening, he noticed that some stars were missing! Curious, he decided to investigate. He built a small robot named Spark to help him. Together, they explored the city and asked the wise old Moon about the missing stars. The Moon told them that the stars had been trapped in a swirling cloud. Aarav and Spark created a powerful light beam to break the cloud. With teamwork and clever thinking, they freed the stars, lighting up the sky once more. Aarav learned that curiosity leads to adventure and solving problems can be fun!",
    "genre": "Adventure"
  },
  {
    "title": "Aarav's Robot Race",
    "story": "In Starville, Aarav loved robots and decided to enter a robot race. He built a speedy robot named Zoom. On race day, many robots lined up, each with unique abilities. Aarav felt nervous but remembered to stay curious. When th

###Generate Audio

In [66]:
from gtts import gTTS
from moviepy.editor import AudioFileClip
from IPython.display import Audio

def generate_audio_from_story(story_dict):
    # Clean title for file name
    file_name = "-".join(story_dict["title"].lower().split()) + ".mp3"

    # Generate audio using gTTS
    tts = gTTS(text=story_dict["story"], lang="en")
    tts.save(file_name)

    # Get duration
    audio_clip = AudioFileClip(file_name)
    duration = int(audio_clip.duration)
    audio_clip.close()

    return {
        "status": "success",
        "audio_path": file_name,
        "duration_sec": duration
    }

# Example usage

result = generate_audio_from_story(parsed_stories[0])
print(result)

# Play the audio
Audio(result["audio_path"])


  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
  lines_video = [l for l in lines if ' Video: ' in l and re.search('\d+x\d+', l)]
  rotation_lines = [l for l in lines if 'rotate          :' in l and re.search('\d+$', l)]
  match = re.search('\d+$', rotation_line)
  if event.key is 'enter':



{'status': 'success', 'audio_path': 'aarav-and-the-missing-stars.mp3', 'duration_sec': 47}


###Demo Project Run

In [None]:
class Customer:
    def __init__(self, age, hero_name, interests, world_setting, educational_goals):
        self.age = age
        self.hero_name = hero_name
        self.interests = interests
        self.world_setting = world_setting
        self.educational_goals = educational_goals

customer = Customer(
    age=7,
    hero_name="Aarav",
    interests=["space", "robots", "adventures"],
    world_setting="futuristic space city",
    educational_goals="curiosity and problem-solving"
)

prompt = build_5_story_prompt(customer)

print(prompt)

In [None]:
initial_state_data = {
    'completion_rate': 0.5,
    'session_position': 1
}
initial_state_series = pd.Series(initial_state_data)

s = int(discretize_state(initial_state_series))
action = softmax_action(Q[s], temperature=epsilon)

if action == EXPLORE:
    print("Chosen Action: EXPLORE (Generate new stories)")
else:
    print("Chosen Action: EXPLOIT (Recommend existing stories)")

In [None]:
prompt = build_5_story_prompt(customer)
if action == EXPLORE:
    print("Chosen Action: EXPLORE (Generate new stories)")
    stories = call_openai(prompt)
    parsed_stories = extract_json_array(stories)
    filtered_stories = [story for story in parsed_stories if not check_script(story["story"])]
    for story in filtered_stories:
        audio_result = generate_audio_from_story(story)
elif action == EXPLOIT:
    print("Chosen Action: EXPLOIT (Recommend existing stories)")
    top_indices, similarity_scores = recommend_videos(prompt, videos_norm, top_n=3)
    for idx in top_indices:
        audio_result = generate_audio_from_story(videos_db[idx])