In [1]:
!pip install nltk scikit-learn joblib



In [4]:
import json
import os
import joblib
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Fetch the NLTK resources that preprocessing relies on.
nltk.download("stopwords")
nltk.download("punkt")
# Newer NLTK releases (3.9+) load the word tokenizer data from "punkt_tab"
# rather than the pickled "punkt" package; without it word_tokenize raises
# LookupError on a fresh install.
nltk.download("punkt_tab")

# English stopword set used to filter tokens during preprocessing.
STOPWORDS = set(stopwords.words("english"))

# Step 1: Load JSON Data
def load_json(json_file):
    """Open *json_file* as UTF-8 text and return its parsed JSON content."""
    with open(json_file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

# Step 2: Preprocess Text
def preprocess_text(text):
    """Normalize *text* into a single space-separated string of kept tokens.

    The text is lowercased, runs of digits and ASCII punctuation are removed,
    the remainder is tokenized with ``word_tokenize``, and tokens found in
    ``STOPWORDS`` are dropped.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    # Digits go first, then punctuation, both on the lowercased text.
    cleaned = re.sub(r"\d+", "", text.lower()).translate(punctuation_table)
    kept = filter(lambda token: token not in STOPWORDS, word_tokenize(cleaned))
    return " ".join(kept)

# Step 3: Extract Choices & Text
def extract_data(json_data):
    """Collect preprocessed texts and their chosen labels from annotated items.

    For every item, the raw text is read from ``item["data"]["text"]`` and
    each entry of each annotation's ``"result"`` list is inspected.  Whenever
    an entry's ``value`` holds a non-empty ``"choices"`` list, the item's
    preprocessed text is paired with the first choice.

    Args:
        json_data: Iterable of item dicts, each with a ``"data"`` dict and an
            optional ``"annotations"`` list.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists.  Both are empty
        when no result carries a choice.

    Raises:
        KeyError: If an item has no ``"data"`` key or a result has no
            ``"value"`` key.
    """
    texts = []
    labels = []

    for item in json_data:
        text = item["data"].get("text", "")
        # Preprocess lazily and at most once per item: the same text is
        # reused for every labelled result of that item.
        processed = None

        for annotation in item.get("annotations", []):
            for result in annotation.get("result", []):
                choices = result["value"].get("choices", [])
                if not choices:
                    continue
                if processed is None:
                    processed = preprocess_text(text)  # Preprocess before training
                texts.append(processed)
                labels.append(choices[0])  # Assuming single-choice selection

    # Debug sample.  Guarded so that datasets with fewer than 11 samples
    # (including empty ones) no longer crash with IndexError here.
    sample_index = 10
    if len(texts) > sample_index:
        print(texts[sample_index], labels[sample_index])
    return texts, labels

# Step 4: Train Model
def train_model(texts, labels, model_path="text_classifier.pkl"):
    """Fit a TF-IDF + multinomial Naive Bayes pipeline and save it to disk.

    Args:
        texts: Training documents, one string per sample.
        labels: Target label for each entry of *texts*.
        model_path: Where the fitted pipeline is written with ``joblib.dump``.
            Defaults to ``"text_classifier.pkl"``.

    Returns:
        The fitted pipeline, so callers can use it without reloading the file.
    """
    model = make_pipeline(TfidfVectorizer(), MultinomialNB())  # Text processing + ML model
    model.fit(texts, labels)  # Train on extracted data
    joblib.dump(model, model_path)  # Save trained model
    print(f"✅ Model trained and saved as {model_path}")
    return model

# Step 5: Predict for New Text File
def predict_choice(text_file, model_path="text_classifier.pkl"):
    """Predict the choice label for the contents of a text file.

    Args:
        text_file: Path of the UTF-8 text file to classify.
        model_path: Path of the saved model to load.  Defaults to
            ``"text_classifier.pkl"``.

    Returns:
        The predicted label, or ``None`` when no model file exists at
        *model_path* (a message is printed in that case).
    """
    if not os.path.exists(model_path):
        print("❌ Model not found. Train the model first.")
        return None

    with open(text_file, "r", encoding="utf-8") as file:
        new_text = file.read()

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that come from a trusted source.
    model = joblib.load(model_path)  # Load trained model
    processed_text = preprocess_text(new_text)  # Preprocess before prediction
    prediction = model.predict([processed_text])[0]
    print(f"🔹 Predicted Choice: {prediction}")
    return prediction

# ======= Execution Starts Here =======
if __name__ == "__main__":
    # Pipeline: load annotations -> extract samples -> train -> predict.
    json_data = load_json("annotations.json")  # Replace with your actual JSON file
    texts, labels = extract_data(json_data)

    if not texts or not labels:
        # Nothing usable was extracted, so there is nothing to train on.
        print("❌ No valid data found in JSON.")
    else:
        train_model(texts, labels)
        # Classify a new, unseen document with the freshly saved model.
        predict_choice("new_text.txt")  # Replace with the actual text file path

[nltk_data] Downloading package stopwords to C:\Users\Sanskruti
[nltk_data]     Jajoo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sanskruti
[nltk_data]     Jajoo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


film producer person oversees film production either employed production company working independently producers plan coordinate various aspects film production selecting script coordinating writing directing editing arranging financing producer responsible finding selecting promising material development unless film based existing script producer hires screenwriter oversees scripts development activities culminate pitch led producer secure financial backing enables production begin succeeds project greenlit producer supervises preproduction principal photography postproduction stages filmmaking producer hires director film well key crew members whereas director makes creative decisions production producer typically manages logistics business operations though directors also produce films producer must ensure film delivered time within budget later stages release oversee marketing distribution film producers always supervise production case primary producer executive producer may hire 