#### Kidoos Tales AI

### 1. Explore or Exploit Model





In [None]:
!pip install pandas numpy tqdm pyarrow



In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta


#### **Generate Demo Data**

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta

# Size of the synthetic log: total events and how many are built per batch.
NUM_ROWS, BATCH_SIZE = 1_000_000, 100_000

# Size of the user and video populations that events are sampled from.
NUM_USERS, NUM_VIDEOS = 100_000, 50_000

# Seed NumPy's global RNG so the generated data is repeatable.
np.random.seed(42)
START_DATE = datetime(2025, 1, 1)

# Identifier pools: "U0", "U1", ... and "V0", "V1", ...
users = ["U" + str(n) for n in range(NUM_USERS)]
videos = ["V" + str(n) for n in range(NUM_VIDEOS)]

# Latent per-user engagement: normal(0.6, 0.15) clipped to [0.2, 0.95].
_raw_engagement = np.random.normal(0.6, 0.15, NUM_USERS)
user_engagement = np.clip(_raw_engagement, 0.2, 0.95)

# Latent per-video quality: beta(2, 5) clipped to [0.1, 1.0].
_raw_quality = np.random.beta(2, 5, NUM_VIDEOS)
video_quality = np.clip(_raw_quality, 0.1, 1.0)

def generate_batch(batch_size, start_time):
    """Simulate `batch_size` consecutive VIDEO_WATCH events starting at `start_time`.

    Each event picks a random user and video, a logged action (0 with
    probability 0.35, 1 with probability 0.65), a session position in 1..14,
    and a watch ratio drawn around ``bias * user_engagement * video_quality``
    (bias 0.4 for action 0, 0.7 otherwise). Engagement flags are derived from
    the watch ratio.

    Args:
        batch_size: Number of events to generate.
        start_time: Timestamp assigned to the first event.

    Returns:
        A tuple ``(events, next_time)``: a DataFrame with one row per event and
        the timestamp at which the following batch should start.
    """
    records = []
    clock = start_time

    for _ in range(batch_size):
        # NOTE: the sequence of np.random calls determines the data produced
        # for a given seed, so the calls below keep a fixed order.
        user_i = np.random.randint(NUM_USERS)
        video_i = np.random.randint(NUM_VIDEOS)
        chosen_action = np.random.choice([0, 1], p=[0.35, 0.65])
        position = np.random.randint(1, 15)

        if chosen_action == 0:
            bias = 0.4
        else:
            bias = 0.7
        expected_ratio = bias * user_engagement[user_i] * video_quality[video_i]
        ratio = np.clip(np.random.normal(expected_ratio, 0.15), 0, 1)

        # Watched seconds = ratio of a random video length between 60 and 299 s.
        seconds_watched = int(ratio * np.random.randint(60, 300))

        # The second random draw only happens when the threshold is met.
        liked = ratio > 0.7 and np.random.rand() < 0.7
        commented = ratio > 0.6 and np.random.rand() < 0.4
        shared = ratio > 0.8 and np.random.rand() < 0.3

        records.append({
            "event_type": "VIDEO_WATCH",
            "user_id": users[user_i],
            "video_id": videos[video_i],
            "watch_time": seconds_watched,
            "completion_rate": round(ratio, 2),
            "like": liked,
            "comment": commented,
            "share": shared,
            "skipped": ratio < 0.2,
            "session_position": position,
            "action": chosen_action,
            "created_at": clock,
        })

        # Advance the clock by 5..39 seconds before the next event.
        clock += timedelta(seconds=np.random.randint(5, 40))

    return pd.DataFrame(records), clock


In [None]:
current_time = START_DATE
dfs = []

# Plan the batches explicitly: every full batch plus one final partial batch,
# so exactly NUM_ROWS rows are produced even when NUM_ROWS is not a multiple
# of BATCH_SIZE (plain floor division would silently drop the remainder).
full_batches, remainder = divmod(NUM_ROWS, BATCH_SIZE)
batch_sizes = [BATCH_SIZE] * full_batches + ([remainder] if remainder else [])

for size in tqdm(batch_sizes):
    # Each batch continues the timeline where the previous one stopped.
    df_batch, current_time = generate_batch(size, current_time)
    dfs.append(df_batch)

df = pd.concat(dfs, ignore_index=True)

# Persist the log so later cells can reload it without regenerating.
df.to_parquet("video_event_logs_1M.parquet", index=False)

print(df.shape)
df.head()


100%|██████████| 10/10 [01:26<00:00,  8.64s/it]


(1000000, 12)


Unnamed: 0,event_type,user_id,video_id,watch_time,completion_rate,like,comment,share,skipped,session_position,action,created_at
0,VIDEO_WATCH,U10601,V22052,0,0.0,False,False,False,True,11,0,2025-01-01 00:00:00
1,VIDEO_WATCH,U56936,V25479,27,0.1,False,False,False,True,10,1,2025-01-01 00:00:20
2,VIDEO_WATCH,U99661,V9406,38,0.27,False,False,False,False,10,1,2025-01-01 00:00:54
3,VIDEO_WATCH,U11971,V49499,2,0.01,False,False,False,True,14,0,2025-01-01 00:01:31
4,VIDEO_WATCH,U57253,V689,28,0.11,False,False,False,True,13,1,2025-01-01 00:02:02


In [None]:
import pandas as pd
# Reload the synthetic event log from the Parquet file saved by the generation
# cell, so the cells below can run without regenerating the data.
df = pd.read_parquet("video_event_logs_1M.parquet")

In [None]:
# A notebook cell only renders its LAST bare expression, so the first two
# summaries are shown explicitly with display() (injected by the IPython
# kernel); otherwise they would be computed and thrown away.

# Share of logged events per action (0 / 1).
display(df["action"].value_counts(normalize=True))

# Distribution of completion rate and watch time.
display(df[["completion_rate", "watch_time"]].describe())

# Fraction of events with each engagement flag set.
df[["like", "comment", "share", "skipped"]].mean()


Unnamed: 0,0
like,0.000234
comment,0.000754
share,1.2e-05
skipped,0.726423


In [None]:
# State space: one state per (completion-rate bin, session-position bin) pair.
COMPLETION_BINS = 11  # int(completion_rate * 10) yields 0..10 for rates in [0, 1]
POSITION_BINS = 10    # session_position // 2, capped at 9
NUM_STATES = COMPLETION_BINS * POSITION_BINS
NUM_ACTIONS = 2

# Tabular action-value estimates, one row per state and one column per action.
Q = np.zeros((NUM_STATES, NUM_ACTIONS))
alpha = 0.1        # learning rate
gamma = 0.9        # discount factor
epsilon = 1.0      # initial exploration probability
EPS_DECAY = 0.995  # multiplicative epsilon decay
EPS_MIN = 0.05     # lower bound for epsilon

def discretize_state(row):
    """Map an event row to a state index in ``[0, NUM_STATES)``.

    The two bins are combined as ``comp_bin * POSITION_BINS + pos_bin`` so that
    every (completion, position) pair gets its own state. Adding the two bins
    together would make unrelated pairs such as (3, 2) and (2, 3) share one
    Q-table row.

    Args:
        row: Object exposing ``completion_rate`` and ``session_position``
            attributes (e.g. a DataFrame row or an ``itertuples`` tuple).

    Returns:
        The integer state index.
    """
    # Clamp both bins so out-of-range inputs can never index past the Q-table.
    comp_bin = min(max(int(row.completion_rate * 10), 0), COMPLETION_BINS - 1)
    pos_bin = min(max(int(row.session_position // 2), 0), POSITION_BINS - 1)
    return comp_bin * POSITION_BINS + pos_bin


In [None]:
# Action codes used by the agent:
#   EXPLORE -> try a new / risky video
#   EXPLOIT -> serve a safe / high-quality video
EXPLORE, EXPLOIT = 0, 1

In [None]:
def simulate_user_response(action, row):
    """Sample a simulated viewer reaction to `action` for the logged event `row`.

    EXPLOIT draws completion from normal(0.75, 0.1); any other action is
    treated as EXPLORE and draws from normal(0.45, 0.25). The draw is clipped
    to [0, 1] and compared against action-specific thresholds to set the
    engagement flags.

    Args:
        action: Action code chosen by the agent.
        row: Event exposing a ``watch_time`` attribute.

    Returns:
        Dict with keys ``completion``, ``watch_time``, ``like``, ``comment``,
        ``share`` and ``skipped``.
    """
    # (mean, std, like above, comment above, share above, skipped below)
    if action == EXPLOIT:
        mean, spread, like_at, comment_at, share_at, skip_below = 0.75, 0.1, 0.6, 0.8, 0.85, 0.2
    else:  # EXPLORE
        mean, spread, like_at, comment_at, share_at, skip_below = 0.45, 0.25, 0.7, 0.85, 0.9, 0.3

    completion = np.clip(np.random.normal(mean, spread), 0, 1)

    return {
        "completion": completion,
        # Scale the logged watch time by the simulated completion.
        "watch_time": int(row.watch_time * completion),
        "like": completion > like_at,
        "comment": completion > comment_at,
        "share": completion > share_at,
        "skipped": completion < skip_below,
    }


In [None]:
def reward_from_sim(out):
    """Convert a simulated response into a scalar reward clipped to [-1, 2].

    The reward starts at the completion value, gains 0.3 / 0.5 / 0.8 for a
    like / comment / share, and loses 0.7 when the video was skipped.

    Args:
        out: Dict with ``completion`` plus the truthy/falsy flags ``like``,
            ``comment``, ``share`` and ``skipped``.

    Returns:
        The clipped reward.
    """
    adjustments = (
        ("like", 0.3),
        ("comment", 0.5),
        ("share", 0.8),
        ("skipped", -0.7),
    )

    reward = out["completion"]
    for signal, delta in adjustments:
        if out[signal]:
            reward += delta

    return np.clip(reward, -1, 2)


In [None]:
# A row's discretized state is identical in every epoch, so compute the whole
# state sequence once instead of calling discretize_state on every step.
states = np.fromiter(
    (discretize_state(row) for row in df.itertuples(index=False)),
    dtype=np.int64,
    count=len(df),
)


def epsilon_greedy(state):
    """Return a random action with probability `epsilon`, else the greedy action for `state`."""
    if np.random.rand() < epsilon:
        return np.random.randint(NUM_ACTIONS)
    return int(np.argmax(Q[state]))


for epoch in range(5):
    total_reward = 0

    # SARSA is on-policy: the action used in the update target (a_next) must be
    # the action actually taken on the next step, so it is carried forward
    # rather than re-sampled at the top of each iteration.
    a = epsilon_greedy(states[0]) if len(states) else None

    # Pair each row with its own state and the state of the FOLLOWING row.
    # zip() stops at the shorter slices, so the last row only ever serves as a
    # next state. itertuples() avoids building a Series per row like df.iloc[i].
    for row, s, s_next in zip(df.itertuples(index=False), states[:-1], states[1:]):
        # ENVIRONMENT RESPONDS
        out = simulate_user_response(a, row)
        r = reward_from_sim(out)

        # next action, chosen in the next state
        a_next = epsilon_greedy(s_next)

        # SARSA update
        Q[s, a] += alpha * (
            r + gamma * Q[s_next, a_next] - Q[s, a]
        )

        total_reward += r
        a = a_next

    # NOTE(review): epsilon decays once per epoch, so after 5 epochs it is
    # still ~0.975 and the policy stays almost fully random. Confirm whether
    # per-step decay was intended.
    epsilon = max(EPS_MIN, epsilon * EPS_DECAY)
    print(f"Epoch {epoch} | Reward {total_reward:.2f} | ε={epsilon:.3f}")


Epoch 0 | Reward 796653.17 | ε=0.975
Epoch 1 | Reward 798409.49 | ε=0.970
Epoch 2 | Reward 801201.92 | ε=0.966
Epoch 3 | Reward 803929.42 | ε=0.961
Epoch 4 | Reward 805830.71 | ε=0.956


### Recommendation Engine with Cosine Similarity

In [None]:
!pip install sentence-transformers tensorflow --quiet

In [None]:
import tensorflow as tf
import numpy as np
from sentence_transformers import SentenceTransformer

In [None]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model; it is used below to
# embed both the video prompts and the user query.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Demo catalogue for the recommender: one prompt per video, IDs numbered from 1.
# NOTE: this rebinds the name `videos`, which the demo-data section above uses
# for its list of "V<n>" ID strings.
_video_prompts = (
    "A brave Indian hero teaching honesty to kids",
    "A space adventure story for children",
    "Village story about kindness and sharing",
    "Robot and AI learning story for kids",
    "Historical story inspired by Mahatma Gandhi",
)
videos = [
    {"video_id": number, "prompt": text}
    for number, text in enumerate(_video_prompts, start=1)
]

In [None]:
def load_videos_from_db():
    """Embed every prompt in the module-level `videos` list.

    Returns:
        A tuple ``(videos, videos_norm)``: the catalogue itself and a float32
        tensor holding one L2-normalised embedding row per video.
    """
    prompts = [entry["prompt"] for entry in videos]
    raw_vectors = sentence_model.encode(prompts, convert_to_numpy=True)

    # Unit-length rows let a plain dot product act as cosine similarity.
    unit_vectors = tf.nn.l2_normalize(
        tf.convert_to_tensor(raw_vectors, dtype=tf.float32),
        axis=1,
    )
    return videos, unit_vectors

In [None]:
def recommend_videos(user_query, videos_norm, top_n=3):
    """Rank videos by cosine similarity between their embeddings and the query.

    Args:
        user_query: Free-text query to embed.
        videos_norm: Tensor of L2-normalised video embeddings, one row per video.
        top_n: How many of the best-scoring indices to return.

    Returns:
        A tuple ``(top_indices, scores)``: the indices of the best-matching
        rows, best first, and the similarity score of every row in original
        row order. Both are plain Python lists.
    """
    encoded_query = sentence_model.encode([user_query], convert_to_numpy=True)
    query_unit = tf.nn.l2_normalize(
        tf.convert_to_tensor(encoded_query, dtype=tf.float32),
        axis=1,
    )

    # (n_videos, dim) x (dim, 1) -> (n_videos, 1), then drop the trailing axis.
    scores = tf.squeeze(
        tf.matmul(videos_norm, query_unit, transpose_b=True),
        axis=1,
    )

    ranking = tf.argsort(scores, direction="DESCENDING")
    best = ranking[:top_n]

    return best.numpy().tolist(), scores.numpy().tolist()

In [None]:
# End-to-end demo: embed the catalogue, score it against one query, print the top 3.
user_query = "Indian freedom fighter story for kids with moral values"

videos_db, videos_norm = load_videos_from_db()
top_indices, similarity_scores = recommend_videos(user_query, videos_norm, top_n=3)

print("User Query:")
print(user_query)
print("\nTop Recommended Videos:\n")

separator = "-" * 50
for idx in top_indices:
    match = videos_db[idx]
    score = round(similarity_scores[idx], 4)
    print(f"Video ID: {match['video_id']}")
    print(f"Prompt : {match['prompt']}")
    print(f"Score  : {score}")
    print(separator)


User Query:
Indian freedom fighter story for kids with moral values

Top Recommended Videos:

Video ID: 1
Prompt : A brave Indian hero teaching honesty to kids
Score  : 0.7021
--------------------------------------------------
Video ID: 5
Prompt : Historical story inspired by Mahatma Gandhi
Score  : 0.4365
--------------------------------------------------
Video ID: 3
Prompt : Village story about kindness and sharing
Score  : 0.3589
--------------------------------------------------


### Toxic, Sexual, and Hate Speech Detection

In [None]:
!pip install scikit-learn pandas numpy joblib emoji nltk



In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import contractions
import re
import emoji
import nltk

In [None]:
# Load the labelled comments. The Python parser engine is used and rows that
# cannot be parsed are skipped instead of aborting the read.
# NOTE: this rebinds `df`, which the sections above use for the event log.
df = pd.read_csv(
    "/content/train.csv",
    engine="python",
    on_bad_lines="skip",
)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
31593,53e42e3f1d0701b1,"""\n\n POTD list \n\nHowcheng, non-admins are g...",0,0,0,0,0,0
31594,53e5688208449113,"""\n\n Moving Ahead \nI just found that I am un...",0,0,0,0,0,0
31595,53e56b77c61f909d,In answer to the question you post on my talk ...,0,0,0,0,0,0
31596,53e828922c3642d6,Mysteriously fixed now. Much better.,0,0,0,0,0,0


In [None]:
# Target columns of the training data; check_script() maps prediction
# positions back to these names, so the order matters.
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


In [None]:
import nltk
# Fetch NLTK's stop-word corpus and keep the English words as a set for
# fast membership tests in preprocess_script().
nltk.download("stopwords")
stop_words = set(nltk.corpus.stopwords.words("english"))

# TF-IDF over unigrams and bigrams, capped at 30,000 features, with
# scikit-learn's built-in English stop-word list.
tfidf_options = {
    "max_features": 30000,
    "ngram_range": (1, 2),
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_options)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocess_script(text: str) -> str:
    """Normalise raw text into a lowercase, space-separated string of words.

    Steps, in order: lowercase; expand contractions; strip URLs; strip
    HTML-like tags; remove emoji; delete every character that is not ``a-z``
    or whitespace; collapse whitespace; drop words found in the module-level
    ``stop_words`` set.

    Args:
        text: Raw input text. Non-string values (for example ``None`` or the
            NaN float pandas uses for an empty CSV cell) are treated as empty
            text instead of raising ``AttributeError``.

    Returns:
        The cleaned text, or ``""`` when nothing remains.
    """
    # Missing cells reach this function through DataFrame.apply; return empty
    # text rather than crashing on `.lower()`.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = emoji.replace_emoji(text, replace='')
    # Characters are deleted, not replaced by a space, so "a,b" becomes "ab".
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)


In [None]:
# Work on a copy: `df_new = df` would only alias the frame, so the raw
# comments in `df` would be overwritten and re-running this cell would
# preprocess already-cleaned text.
df_new = df.copy()
df_new["comment_text"] = df["comment_text"].apply(preprocess_script)

# One binary target column per label. Selecting with `labels` keeps the column
# order identical to the order check_script() uses to name predictions.
y = df[labels]

In [None]:
# Fit the TF-IDF vectorizer on the cleaned comments and transform them into
# the feature matrix used to train the classifier.
X_vec = vectorizer.fit_transform(df_new['comment_text'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# One independent binary logistic regression per label column of `y`.
base_classifier = LogisticRegression(solver="liblinear", max_iter=1000)
model = OneVsRestClassifier(base_classifier)

model.fit(X_vec, y)


In [None]:
def check_script(script: str) -> list:
    """Return the names of all labels the trained model predicts for `script`.

    The text goes through the same preprocessing and TF-IDF vectorizer used
    for training before it is classified.

    Args:
        script: Raw text to classify.

    Returns:
        A list of label names (entries of ``labels``) whose prediction is 1.
        The list is empty when nothing is flagged.
    """
    features = vectorizer.transform([preprocess_script(script)])
    # predict() returns one row per input; flatten the single row.
    flags = model.predict(features).ravel()
    return [label for label, flag in zip(labels, flags) if flag == 1]

# Quick smoke checks of the classifier on three short sentences.
samples = ["I love you", "I hate you", "I am a terrorist"]
for number, sample in enumerate(samples, start=1):
    print(f"check {number}", check_script(sample))


check 1 []
check 2 ['toxic']
check 3 []


### Audio generation code using OpenAI

### Prompt for story

In [None]:
def build_5_story_prompt(self,customer: CustomerDetails) -> str:
        """Build the prompt asking a model for five short stories as a JSON array.

        The prompt interpolates the customer's ``age``, ``hero_name``,
        ``interests``, ``world_setting`` and ``educational_goals`` and demands
        JSON-only output whose items carry ``title``, ``story`` and ``genre``.

        Args:
            customer: Source of the interpolated fields listed above.

        Returns:
            The complete prompt text, including the leading indentation of
            this source block.
        """
        # NOTE(review): the format example shows a single object although the
        # task asks for exactly 5 stories. Confirm the model reliably returns 5.
        # Doubled braces below are f-string escapes for literal "{" and "}".
        return f"""
        You are a STRICT JSON GENERATION ENGINE.
        TASK: Generate EXACTLY 5 short stories for video narration.
        AGE:Stories must be suitable for a {customer.age}-year-old child.
        STORY REQUIREMENTS:
        - ~2 minutes long when narrated
        - Simple, engaging language
        - Immersive and visual
        - Naturally teaches the educational goal
        - Child-safe content only
        STORY DETAILS:
        Hero name: {customer.hero_name}
        Interests: {customer.interests}
        World setting: {customer.world_setting}
        Educational goal: {customer.educational_goals}
        OUTPUT RULES (MANDATORY):
        - Return ONLY valid JSON
        - Do NOT include explanations
        - Do NOT include markdown
        - Do NOT include extra text
        - Output MUST start with '[' and end with ']'
        - Use double quotes for all strings
        OUTPUT FORMAT (EXACT – DO NOT CHANGE):
        [
        {{
            "title": "string",
            "story": "string",
            "genre": "string"}}
            ]
        """

### Used to generate stories and save them to the database using the SARSA algorithm.

In [None]:
 def generate_story(self, user_id: str, title: str, script: str, genre: str):
        """Narrate `script` to an MP3 file and store a matching Video record.

        The script is safety-checked first; flagged content is rejected before
        anything is written to disk or to the database.

        Args:
            user_id: Stored as ``created_by`` on the Video record.
            title: Story title; also the input for the generated video ID.
            script: Story text. It is narrated and stored as both
                ``description`` and ``prompt``.
            genre: Stored as both ``genre`` and ``tag``.

        Returns:
            On success: ``{"status": "success", "video_id": ..., "audio_path": ...}``.
            When flagged: ``{"error": "Inappropriate content detected", "flags": ...}``.
            On any exception during audio generation or saving:
            ``{"error": <message>}``.
        """
        # 1. Safety check. Done before any filesystem work so that rejected
        #    content does not leave an empty video directory behind.
        #    Presumably check_script returns the labels flagged for the text;
        #    verify against its definition.
        secure_results = self.check_script(script)
        if any(item in self.parent_check_list for item in secure_results):
            logger.warning(f"Safety Alert: Content flagged for {secure_results}")
            return {"error": "Inappropriate content detected", "flags": secure_results}

        # 2. Generate the ID that is used for the directory, the file name and
        #    the database record alike.
        vid_id = self._generate_clean_id(title)
        base_dir = self.base_dir / "videos" / vid_id
        base_dir.mkdir(parents=True, exist_ok=True)
        audio_path = base_dir / f"{vid_id}.mp3"

        try:
            # 3. Audio generation (synchronous gTTS call).
            gTTS(text=script, lang="en").save(str(audio_path))
            audio_clip = AudioFileClip(str(audio_path))
            try:
                duration = int(audio_clip.duration)
            finally:
                # Release the clip even if reading the duration fails.
                audio_clip.close()

            # 4. Save to the database inside an atomic transaction.
            with transaction.atomic():
                Video.objects.create(
                    video_id=vid_id,  # same ID as the directory and file name
                    title=title,
                    description=script,
                    genre=genre,
                    duration_sec=duration,
                    created_by=user_id,
                    is_active=True,
                    status="generated",
                    path=str(audio_path),
                    source_type=1,
                    language="en",
                    prompt=script,
                    tag=genre
                )
            # Logged only after the atomic block has exited without error.
            logger.info(f"Successfully saved Video: {vid_id}")

            return {"status": "success", "video_id": vid_id, "audio_path": str(audio_path)}

        except Exception as e:
            # Deliberate best-effort: callers receive an error dict, not an exception.
            logger.error(f"Error in generate_story: {str(e)}")
            return {"error": str(e)}
