In [None]:
#train-data import
import pandas as pd

# Load the ':::'-delimited Kaggle text files (they have no header row, so the
# column names are supplied here).
# A separator longer than one character is treated as a regular expression and
# is only supported by the Python parser; requesting that engine explicitly
# avoids the ParserWarning pandas emits when it has to fall back on its own.
# NOTE(review): if each line carries more ':::'-separated fields than names are
# listed, pandas uses the extra leading field(s) as the index -- confirm
# against the raw files.
# NOTE(review): if the files pad the separator with spaces, the parsed values
# keep that padding -- verify, e.g. with train_data['genre'].unique().
train_data = pd.read_csv(
    '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt',
    sep=':::',
    names=['movie', 'genre', 'plot'],
    engine='python',
)
test_data = pd.read_csv(
    '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt',
    sep=':::',
    names=['movie', 'plot'],
    engine='python',
)
train_data.head()
#test_data.head()

In [None]:
#cleaning plot/summary data
import re

def clean_text(text):
    """Normalize a plot summary before vectorization.

    Lower-cases the text, drops every character that is not an ASCII letter,
    a digit or whitespace, then collapses whitespace runs into single spaces
    and trims both ends.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # A missing cell would otherwise crash on `.lower()`; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Remove special characters and punctuation. They are deleted, not replaced
    # by a space, so "well-known" becomes "wellknown".
    text = re.sub(r"[^a-z0-9\s]", "", text)
    # Collapse runs of whitespace (including newlines/tabs) and trim the ends
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Add a normalized copy of every plot to both frames.
train_data['clean_plot'] = train_data['plot'].apply(clean_text)
test_data['clean_plot'] = test_data['plot'].apply(clean_text)

# A notebook cell only renders its final bare expression, so the train preview
# has to be shown explicitly (`display` is provided by the Jupyter/IPython
# kernel); the test preview stays the cell's last expression.
display(train_data[['movie', 'genre', 'clean_plot']].head(10))
test_data[['movie', 'clean_plot']].head(10)


In [None]:
#TF-IDF vectorization of the cleaned plot text
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: English stop words are dropped and the vocabulary is capped
# at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vectorizer is fitted on the training plots; the test plots are only
# transformed with it.
X_train = vectorizer.fit_transform(train_data['clean_plot'])
X_test = vectorizer.transform(test_data['clean_plot'])


In [None]:
#encode genre labels as integers
from sklearn.preprocessing import LabelEncoder

# The encoder is kept so that predicted integer codes can later be mapped back
# to genre strings with `label_encoder.inverse_transform`.
label_encoder = LabelEncoder()
# Fits the encoder on the 'genre' column and stores the encoded result as y_train.
y_train = label_encoder.fit_transform(train_data['genre'])


In [None]:
#model training
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features.
# class_weight='balanced' re-weights classes inversely to their frequency;
# max_iter=1000 is passed explicitly for the solver's iteration limit.
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)
# Fit on the full labelled training set.
model.fit(X_train, y_train)



In [None]:
#prediction on test_data
# Encoded class predictions for the test feature matrix.
y_pred = model.predict(X_test)
# Map the integer codes back to the original genre strings.
predicted_genres = label_encoder.inverse_transform(y_pred)
# Store the predictions as a new 'predicted_genre' column on test_data.
test_data['predicted_genre'] = predicted_genres



In [None]:
# Preview the first five predictions next to their movie and plot.
test_data.head()[['movie', 'plot', 'predicted_genre']]


In [None]:
#train_data['genre'].value_counts()
# Write the movie, plot and predicted genre columns to 'predicted_genres.csv'
# in the working directory.
# NOTE(review): index=False omits the frame's index from the file; if the
# index holds a per-row ID taken from the input, it is not exported -- confirm
# that is intended.
test_data[['movie', 'plot', 'predicted_genre']].to_csv('predicted_genres.csv', index=False)


In [None]:
#testing accuracy on train_data by splitting
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the labelled rows, stratified by genre. The split is made on
# the cleaned text rather than on X_train so that the TF-IDF vocabulary and IDF
# weights used for validation are learned from the 80% portion only; otherwise
# the validation rows would already have influenced the features.
plots_fit, plots_val, y_train_split, y_val_split = train_test_split(
    train_data['clean_plot'], y_train, test_size=0.2, random_state=42, stratify=y_train
)

# clone() returns unfitted copies with the same hyperparameters. Working on
# copies keeps the `vectorizer` and `model` fitted on the full training set
# intact, so re-running the prediction cell after this one still uses them.
val_vectorizer = clone(vectorizer)
val_model = clone(model)

X_train_split = val_vectorizer.fit_transform(plots_fit)
X_val_split = val_vectorizer.transform(plots_val)

# Train on 80%
val_model.fit(X_train_split, y_train_split)

# Predict on 20% validation set
y_val_pred = val_model.predict(X_val_split)

# Accuracy and report
acc = accuracy_score(y_val_split, y_val_pred)
print(f"Validation Accuracy: {acc:.2f}")

print("\nClassification Report:")
# Passing `labels` explicitly keeps the report aligned with `target_names`
# even if some class is absent from both the validation labels and predictions.
print(classification_report(
    y_val_split,
    y_val_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
))
