# In [None]:
import os
import pandas as pd
import kagglehub

# Download dataset; the returned path is listed and walked as a directory below.
path = kagglehub.dataset_download("hijest/genre-classification-dataset-imdb")
print(" Dataset downloaded at:", path)

# Check contents of the main folder
print(" Main folder contents:", os.listdir(path))

# Walk the download directory (subfolders included) and take the first
# .csv/.txt file encountered; None when nothing matches.
csv_file = next(
    (
        os.path.join(root, f)
        for root, _dirs, files in os.walk(path)
        for f in files
        if f.endswith((".csv", ".txt"))
    ),
    None,
)

if not csv_file:
    raise FileNotFoundError("No CSV/TXT file found in the dataset folder or subfolders.")

print(" Loading dataset from:", csv_file)
# The records are split on ' ::: ' from the first column further down, so the
# file is loaded as raw lines into a single column. pd.read_csv with default
# settings would consume the first record as the header row and break records
# apart at every comma, so it is not used here.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the dataset.
with open(csv_file, encoding="utf-8") as fh:
    records = [line.rstrip("\r\n") for line in fh]
# Blank lines carry no record and are dropped.
df = pd.DataFrame({0: [record for record in records if record.strip()]})

print(" Dataset loaded successfully!")
print(df)
print("Shape:", df.shape)
print(df.head())


# Fetch the NLTK resources used for text cleaning, then build the shared
# stop-word set and lemmatizer that preprocess_text reads.
# NOTE(review): nltk, stopwords and WordNetLemmatizer are not imported in the
# visible part of this file — confirm they are imported before this point.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Return a cleaned, lemmatized version of ``text``.

    The text is split on whitespace. A token is kept only if it is purely
    alphabetic (so tokens with digits or attached punctuation are dropped)
    and its lowercase form is not in the module-level ``stop_words``. Kept
    tokens are lowercased, lemmatized with the module-level ``lemmatizer``
    and joined with single spaces.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = []
    for word in text.split():
        lowered = word.lower()
        if word.isalpha() and lowered not in stop_words:
            cleaned.append(lemmatizer.lemmatize(lowered))
    return " ".join(cleaned)

# Split the single raw column into its four fields. n=3 caps the number of
# splits so a description that itself contains ' ::: ' stays in one piece
# instead of spilling into extra columns.
df = df.iloc[:, 0].str.split(' ::: ', n=3, expand=True)
if df.shape[1] != 4:
    # Fail with a clear message rather than pandas' generic length mismatch.
    raise ValueError(
        f"Expected 4 ' ::: '-separated fields per record, got {df.shape[1]}."
    )
df.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

print("🧹 Cleaning text data...")
# preprocess_text returns "" for non-string cells, so rows with a missing
# description end up with an empty string rather than raising.
df["DESCRIPTION"] = df["DESCRIPTION"].apply(preprocess_text)

# 3. Remove Rare Genres
# Keep only genres that occur more than once.
genre_counts = df["GENRE"].value_counts()
frequent_genres = genre_counts[genre_counts > 1].index
# .copy() detaches the filtered frame from the original so the later in-place
# column assignment (label encoding) does not trigger SettingWithCopyWarning.
df = df[df["GENRE"].isin(frequent_genres)].copy()
print(" Rare genres removed.")
print("Remaining unique genres:", df["GENRE"].nunique())

# 4. Encode Genre Labels
# Genre strings are replaced by integer codes; the fitted encoder stays in
# `le` so predictions can be mapped back to genre names later.
# NOTE(review): LabelEncoder and train_test_split are not imported in the
# visible part of this file — confirm they are imported before this point.
le = LabelEncoder()
df["GENRE"] = le.fit_transform(df["GENRE"].astype(str))
print("Encoded genre count:", len(le.classes_))

# 5. Train-Test Split
# 80/20 split of descriptions (features) and encoded genres (labels) with a
# fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    df["DESCRIPTION"], df["GENRE"], random_state=76, test_size=0.2
)

# 6. TF-IDF Vectorization
# The vocabulary (unigrams and bigrams, capped at 5000 features) is learned
# from the training split only; the validation split is transformed with it.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    binary=True,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

print(" TF-IDF vectorization complete.")
print("Feature matrix shape:", X_train_tfidf.shape)


# 7. Train and Evaluate Models
# Candidate classifiers keyed by display name. The training loop below and the
# model-saving step both look models up in this dict, so it must exist before
# either runs.
# NOTE(review): BernoulliNB, LogisticRegression, LinearSVC, accuracy_score and
# classification_report are not imported in the visible part of this file —
# confirm they are imported before this point.
models = {
    "Naive Bayes": BernoulliNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": LinearSVC(),
}
for name, model in models.items():
    print(name, model)

# Validation accuracy per model name, filled in by the loop below.
results = {}


for name, model in models.items():
    print(f"\n🚀 Training {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_val_tfidf)
    acc = accuracy_score(y_val, y_pred)
    results[name] = acc
    print(f" {name} Accuracy: {acc:.4f}")
    print("Classification Report:")
    # zero_division=0 reports 0 instead of warning for classes that were
    # never predicted.
    print(classification_report(y_val, y_pred, zero_division=0))

# 8. Compare Models
# Print every model's validation accuracy, then pick the name with the
# highest score (the first one wins on ties).
print("\n Model Comparison:")
for model_name in results:
    acc = results[model_name]
    print(f"{model_name}: {acc:.4f}")

best_model_name = max(results, key=lambda candidate: results[candidate])
print(f"\n Best Model: {best_model_name} (Accuracy: {results[best_model_name]:.4f})")

# --------------------------------------------
# 9. Save Best Model, Vectorizer, Encoder
# --------------------------------------------
# The best model, the fitted vectorizer and the label encoder are each written
# to their own file in the current working directory.
# NOTE(review): joblib is not imported in the visible part of this file —
# confirm it is imported before this point.
best_model = models[best_model_name]
for artifact, filename in (
    (best_model, "best_genre_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
    (le, "label_encoder.pkl"),
):
    joblib.dump(artifact, filename)
print("\n Model, vectorizer, and label encoder saved successfully!")

# 10. Predict Genre for New Plot Summary
def predict_genre(plot_summary):
    """Return the genre label predicted for a single plot summary.

    The summary is cleaned with ``preprocess_text``, vectorized with the
    module-level ``vectorizer`` and classified by ``best_model``. The encoded
    prediction is then mapped back to its original label through ``le``.
    """
    cleaned = preprocess_text(plot_summary)
    features = vectorizer.transform([cleaned])
    encoded = best_model.predict(features)
    return le.inverse_transform(encoded)[0]

# Run the saved pipeline on one sample plot and show the result.
example_plot = (
    "A young boy discovers he is a wizard and attends a magical school to learn spells."
)
predicted_genre = predict_genre(example_plot)

print("\n Example Prediction:")
print(f"Plot: {example_plot}")
print(f"Predicted Genre: {predicted_genre}")