In [4]:
import re
import json
import pdfplumber
from docx import Document
import pandas as pd

In [5]:
# Keyword lists per content category, consumed by analyze_scene().
# analyze_scene() lowercases the scene text and tests every entry with a plain
# substring check (`w in scene_lower`), so:
#   * entries must be lowercase to ever match;
#   * an entry also matches inside longer words, so short entries are prone to
#     false positives;
#   * the same entry may be listed under several categories (two entries are
#     shared by "violence" and "scary") and then counts toward each of them.
# NOTE(review): the literals below look like UTF-8 Cyrillic text that was
# decoded with the wrong codec (mojibake) — confirm the notebook's encoding,
# otherwise none of these entries can match real script text.
keywords = {
    "violence": ["—É–¥–∞—Ä–∏–ª", "–∫—Ä–æ–≤—å", "—É–±–∏–ª", "—Å—Ç—Ä–µ–ª—è–µ—Ç", "–ø–∏—Å—Ç–æ–ª–µ—Ç", "–¥—Ä–∞–∫–∞", "–Ω–æ–∂", "—Ç—Ä—É–ø"],
    "sexual": ["–ø–æ—Ü–µ–ª–æ–≤–∞–ª", "—Ä–∞–∑–¥–µ–ª–∞—Å—å", "–ø–æ—Å—Ç–µ–ª—å", "—ç—Ä–æ—Ç", "—Å–µ–∫—Å", "–∏–Ω—Ç–∏–º"],
    "profanity": ["—á–µ—Ä—Ç", "–±–ª–∏–Ω", "—Å—É–∫–∞", "–≥–∞–¥", "–¥–µ—Ä—å–º–æ", "–ø–∞–¥–ª–∞", "–µ–±", "—Ö–µ—Ä"],
    "alcohol_drugs": ["–≤–æ–¥–∫–∞", "–ø—å—è–Ω—ã–π", "–∞–ª–∫–æ–≥–æ–ª—å", "–Ω–∞—Ä–∫–æ—Ç–∏–∫", "–∫–æ—Å—è–∫", "–≤–∏—Å–∫–∏", "–∫—É—Ä–∏—Ç"],
    "scary": ["–∫—Ä–∏—á–∏—Ç", "—Ç—Ä—É–ø", "–º–æ–Ω—Å—Ç—Ä", "—Å—Ç—Ä–∞—à–Ω–æ", "–∫—Ä–∏–∫", "–∫—Ä–æ–≤—å", "—Ç–µ–Ω—å"]
}

In [6]:
def read_file(path):
    """Return the text content of a screenplay file.

    The reader is picked from the file extension, compared case-insensitively
    so that ``SCRIPT.PDF`` is handled like ``script.pdf``:

    * ``.pdf``  - text of every page extracted with pdfplumber, each page
      followed by a newline;
    * ``.docx`` - paragraph texts (python-docx) joined with newlines;
    * anything else - the file is read as UTF-8 text, silently dropping bytes
      that cannot be decoded (``errors="ignore"``).

    Args:
        path: Location of the file (string or path-like object).

    Returns:
        str: The extracted text.
    """
    lowered_name = str(path).lower()

    if lowered_name.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # Depending on the pdfplumber version, extract_text() may return
            # None for a page without extractable text (e.g. a scanned image).
            # Treat such a page as empty instead of failing on `None + str`.
            page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
        return "".join(page_texts)

    if lowered_name.endswith(".docx"):
        doc = Document(path)
        return "\n".join(p.text for p in doc.paragraphs)

    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

In [7]:
def split_scenes(text):
    """Split screenplay text into scenes at scene-heading markers.

    The text is cut wherever the heading pattern matches (case-insensitive).
    Each heading starts a new scene and is kept as the first token of that
    scene; text before the first heading forms a scene of its own. Every scene
    is stripped of surrounding whitespace, and scenes of three or fewer
    whitespace-separated words are discarded.

    Args:
        text: Full screenplay text.

    Returns:
        list[str]: Scene strings in document order.
    """
    # The Latin markers carry a leading word boundary so that ordinary words
    # ending in "int." / "ext." ("point.", "next.", "text.") are not mistaken
    # for scene headings.
    # NOTE(review): the non-ASCII alternatives are kept exactly as they were;
    # consider anchoring them the same way once the literal's encoding has
    # been confirmed.
    parts = re.split(
        r'(–°–¶–ï–ù–ê\s*\d*\.|\bINT\.|\bEXT\.|–ò–ù–¢\.|–≠–ö–°–¢\.)',
        text,
        flags=re.IGNORECASE,
    )

    scenes = []
    current = ""
    for index, part in enumerate(parts):
        # With a single capturing group re.split() returns
        # [text, heading, text, heading, ..., text]: headings sit at the odd
        # positions. Relying on the position (instead of re-matching the
        # chunk's prefix) keeps body text that merely starts with "INT"/"EXT"
        # from being treated as a new heading.
        if index % 2 == 1:
            if current.strip():
                scenes.append(current.strip())
            current = part
        else:
            current += " " + part
    if current.strip():
        scenes.append(current.strip())

    # Drop fragments too short to be a real scene (a bare heading, stray words).
    return [s for s in scenes if len(s.split()) > 3]

# ==== Analyze a single scene ====
def analyze_scene(scene):
    """Grade one scene against every category of the module-level ``keywords``.

    Matching is a plain substring test on the lowercased scene text. The count
    for a category is the number of *distinct* keywords of that category found
    in the scene, not the number of occurrences.

    Severity by count: 0 -> "None", 1 -> "Mild", 2-3 -> "Moderate",
    4 or more -> "Severe".

    Args:
        scene: Text of a single scene.

    Returns:
        dict: ``{category: {"count": int, "severity": str}}`` with one entry
        per category, in the order of ``keywords``.
    """
    lowered = scene.lower()

    def grade(hits):
        # Translate the number of matched keywords into a severity label.
        if hits == 0:
            return "None"
        if hits == 1:
            return "Mild"
        if hits <= 3:
            return "Moderate"
        return "Severe"

    summary = {}
    for category, words in keywords.items():
        hits = sum(1 for word in words if word in lowered)
        summary[category] = {"count": hits, "severity": grade(hits)}
    return summary

# ==== Aggregate the final age rating ====
def get_age_rating(scene_results):
    """Derive the overall age rating from per-scene category results.

    The rating is driven by the single worst severity found in any category
    of any scene: "None" -> "0+", "Mild" -> "12+", "Moderate" -> "16+",
    "Severe" -> "18+". With no scenes or no categories the result is "0+".

    Args:
        scene_results: Iterable of dicts shaped like the return value of
            ``analyze_scene`` (each value holds a ``"severity"`` key).

    Returns:
        str: One of "0+", "12+", "16+", "18+".

    Raises:
        KeyError: If a severity label is not one of the four known labels.
    """
    levels = {"None": 0, "Mild": 1, "Moderate": 2, "Severe": 3}
    ratings = ("0+", "12+", "16+", "18+")

    worst = max(
        (
            levels[info["severity"]]
            for per_scene in scene_results
            for info in per_scene.values()
        ),
        default=0,  # nothing to grade -> lowest rating
    )
    return ratings[worst]

# ==== Run the analysis ====
def analyze_script(path, csv_path="scene_analysis.csv", json_path="report.json"):
    """Analyze a screenplay file and write the per-scene report to disk.

    Pipeline: read the file, split it into scenes, grade every scene per
    category, derive the overall age rating, then save the results.

    Outputs:
        * ``csv_path``  - one row per scene: ``scene_id``, ``text`` and one
          severity column per category;
        * ``json_path`` - ``{"age_rating": ..., "scenes": [...]}`` holding the
          same per-scene records;
        * two status lines printed to stdout.

    Only the severity labels are stored; the per-category counts returned by
    ``analyze_scene`` are not part of the report.

    Args:
        path: Path of the screenplay file, passed to ``read_file``.
        csv_path: Destination of the CSV report. Defaults to the file name
            that was previously hard-coded.
        json_path: Destination of the JSON report. Defaults to the file name
            that was previously hard-coded.

    Returns:
        None
    """
    text = read_file(path)
    scenes = split_scenes(text)
    results = [analyze_scene(scene) for scene in scenes]
    age_rating = get_age_rating(results)

    # Assemble the per-scene report (1-based scene ids, severity labels only).
    report = []
    for scene_id, (scene, res) in enumerate(zip(scenes, results), start=1):
        entry = {"scene_id": scene_id, "text": scene}
        entry.update({k: v["severity"] for k, v in res.items()})
        report.append(entry)

    df = pd.DataFrame(report)
    df.to_csv(csv_path, index=False)
    with open(json_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII scene text readable in the file.
        json.dump({"age_rating": age_rating, "scenes": report}, f, ensure_ascii=False, indent=2)

    print(f"\n‚úÖ –ò—Ç–æ–≥–æ–≤—ã–π –≤–æ–∑—Ä–∞—Å—Ç–Ω–æ–π —Ä–µ–π—Ç–∏–Ω–≥: {age_rating}")
    # Report the files that were actually written (identical to the old
    # message when the default paths are used).
    print(f"üìä –°–æ—Ö—Ä–∞–Ω–µ–Ω—ã —Ñ–∞–π–ª—ã: {json_path}, {csv_path}")

# ==== Entry point ====
# Runs when the code is executed directly (script or notebook cell): prompts
# for the screenplay path on stdin and runs the full analysis. The guard keeps
# the prompt from firing if this code is ever imported as a module.
if __name__ == "__main__":
    # strip() drops whitespace accidentally typed or pasted around the path.
    path = input("–í–≤–µ–¥–∏—Ç–µ –ø—É—Ç—å –∫ —Å—Ü–µ–Ω–∞—Ä–∏—é (.txt/.pdf/.docx): ").strip()
    analyze_script(path)


‚úÖ –ò—Ç–æ–≥–æ–≤—ã–π –≤–æ–∑—Ä–∞—Å—Ç–Ω–æ–π —Ä–µ–π—Ç–∏–Ω–≥: 16+
üìä –°–æ—Ö—Ä–∞–Ω–µ–Ω—ã —Ñ–∞–π–ª—ã: report.json, scene_analysis.csv
