In [None]:
import pandas as pd
import re
from transformers import pipeline, DistilBertTokenizerFast

def parse_movies(file_path):
    """Load a ``:::``-delimited movie listing into a DataFrame.

    Each record is expected on a single line in the form
    ``<id> ::: <title> ::: <description>``. Text that does not contain that
    pattern is skipped. The title ends at the first ``" ::: "`` after the
    id; everything after it up to the end of the line is the description.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id`` (parsed with
        ``int``), ``title`` and ``description``, one row per matched
        record. The three columns are present even when nothing matches.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read()

    # Under re.MULTILINE, ``$`` matches at every line end *and* at the end of
    # the input, so the final record is kept even when the file does not end
    # with a newline.
    pattern = re.compile(r'(\d+) ::: (.*?) ::: (.*)$', re.MULTILINE)

    movies = [
        {'id': int(movie_id), 'title': title, 'description': description}
        for movie_id, title, description in pattern.findall(data)
    ]

    # Naming the columns explicitly keeps the schema stable for empty input,
    # so downstream column lookups do not fail with KeyError.
    return pd.DataFrame(movies, columns=['id', 'title', 'description'])


# Source listing; parse_movies reads it as
# "<id> ::: <title> ::: <description>" records.
file_path = 'test_data.txt'
movies_df = parse_movies(file_path)

# Text-classification pipeline built from a DistilBERT checkpoint whose name
# indicates SST-2 fine-tuning.
# NOTE(review): that looks like a sentiment model rather than a genre model -
# confirm it is the intended checkpoint.
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
# Fast tokenizer loaded from the base uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def classify_genre(description):
    """Assign a genre label to a movie description.

    The text is passed to the module-level ``classifier`` pipeline, which is
    asked to truncate it to at most 512 tokens. A ``'POSITIVE'`` prediction
    is mapped to ``'drama'``; any other label is mapped to ``'thriller'``.

    NOTE(review): the pipeline's checkpoint name suggests a sentiment model,
    so this mapping is presumably a stand-in for real genre prediction -
    confirm with the author.

    Args:
        description: Plot description text for a single movie.

    Returns:
        ``'drama'`` or ``'thriller'``.
    """
    # Let the pipeline's own tokenizer truncate the input. Tokenizing,
    # decoding back to text and tokenizing again does the work twice, alters
    # the text the model sees, and does not guarantee that the re-tokenized
    # text still fits within the 512-token limit.
    result = classifier(description, truncation=True, max_length=512)

    return 'drama' if result[0]['label'] == 'POSITIVE' else 'thriller'

# Label every movie by running classify_genre over its description.
movies_df['genre'] = movies_df['description'].map(classify_genre)

def format_output(row):
    """Render one movie record as ``id ::: title ::: genre ::: description``.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            the keys ``id``, ``title``, ``genre`` and ``description``.

    Returns:
        The four fields, in that order, joined with ``" ::: "``.
    """
    fields = (row['id'], row['title'], row['genre'], row['description'])
    return ' ::: '.join(format(value) for value in fields)

# Render each row as a single output line via format_output.
movies_df['formatted_output'] = movies_df.apply(format_output, axis=1)

# Write the rendered lines to disk as UTF-8, one movie per line.
output_file_path = 'classified_movies.txt'
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(line + '\n' for line in movies_df['formatted_output'])

print("Classification complete. Results saved to classified_movies.txt")
