# In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Download NLTK resources
#   'stopwords' - word lists read by stopwords.words('english')
#   'punkt'     - tokenizer models used by word_tokenize on older NLTK releases
#   'punkt_tab' - tokenizer data that newer NLTK releases load in word_tokenize
#                 instead of 'punkt'; without it word_tokenize raises
#                 LookupError on those versions
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)

# Read dataset files
def read_txt_file(filepath):
    """Return every line of the UTF-8 text file at ``filepath`` as a list.

    Each element is one line exactly as the text file object yields it, so
    trailing newline characters are kept.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        # Iterating a text file yields the same lines readlines() returns.
        return list(handle)

# Load data: each file is read into a list of raw lines.
TRAIN_PATH = '/mnt/data/train_data.txt'
TEST_PATH = '/mnt/data/test_data.txt'
TEST_SOLUTION_PATH = '/mnt/data/test_data_solution.txt'

train_data = read_txt_file(TRAIN_PATH)
# NOTE: test_data is not referenced again in the visible part of this file.
test_data = read_txt_file(TEST_PATH)
# Raw lines of the solution file; the name test_labels is rebound to a
# DataFrame of binary labels further down.
test_labels = read_txt_file(TEST_SOLUTION_PATH)

def parse_data(data):
    """Split raw dataset lines into plot texts and per-plot genre lists.

    Args:
        data: Iterable of text lines. Each line is stripped of surrounding
            whitespace and split on tab characters; the first field is taken
            as the plot and the second as a '|'-separated genre string.

    Returns:
        A ``(plots, genres)`` tuple of two parallel lists: the plot strings
        and, for each plot, its list of genre strings. Lines with fewer than
        two tab-separated fields are skipped; fields after the second are
        ignored.
    """
    records = []
    for raw_line in data:
        fields = raw_line.strip().split('\t')  # Adjust delimiter if needed
        if len(fields) < 2:
            # No genre field on this line, so there is nothing to pair up.
            continue
        records.append((fields[0], fields[1].split('|')))  # Assuming multi-label genres
    plots = [plot for plot, _ in records]
    genres = [labels for _, labels in records]
    return plots, genres

# Parse train and test data
train_plots, train_genres = parse_data(train_data)
# Test plots and their genres are both parsed from the solution-file lines
# (held in test_labels at this point; the name is rebound to a DataFrame below).
test_plots, test_genres = parse_data(test_labels)  # Actual labels

# Convert genres to binary labels
# The label vocabulary is a sorted list rather than a set. A set has no defined
# order, so the column order could change between runs, and recent pandas
# versions refuse a set as the `columns` argument. Sorting gives a stable,
# reproducible column layout shared by the train and test frames.
all_genres = sorted({genre for sublist in train_genres for genre in sublist})

# One row per plot, one 0/1 column per genre seen in the training data.
train_labels = pd.DataFrame(
    [[1 if genre in genres else 0 for genre in all_genres] for genres in train_genres],
    columns=all_genres,
)
# Genres that appear only in the test data have no column and are ignored here.
test_labels = pd.DataFrame(
    [[1 if genre in genres else 0 for genre in all_genres] for genres in test_genres],
    columns=all_genres,
)

# Preprocessing function
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalise a plot string for vectorisation.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, the result is tokenised with ``word_tokenize``, and
    tokens found in NLTK's English stopword list are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces (an empty string when no
        token survives).
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Fetch the stopword set once per call instead of calling
    # stopwords.words('english') for every token, and test membership against
    # a set rather than scanning a list.
    stop_words = _english_stopwords()
    return " ".join(token for token in word_tokenize(text) if token not in stop_words)

# Apply preprocessing: clean every plot string with preprocess_text.
train_plots = list(map(preprocess_text, train_plots))
test_plots = list(map(preprocess_text, test_plots))

# TF-IDF vectorization: the vectorizer (capped at 5000 features) is fitted on
# the training plots only and then reused to transform the test plots.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_plots)
X_test_tfidf = vectorizer.transform(test_plots)

# Train model: logistic regression wrapped in a one-vs-rest classifier,
# fitted against the binary genre label frame. fit() returns the estimator.
model = OneVsRestClassifier(LogisticRegression()).fit(X_train_tfidf, train_labels)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluation: per-genre report of the predictions against the test labels.
genre_names = list(all_genres)
print(classification_report(test_labels, y_pred, target_names=genre_names))