In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load dataset
def load_data(train_data.txt):
    df = pd.read_csv(train_data.txt, sep=' ::: ', header=None, engine='python')
    df.columns = ['ID', 'Title', 'Genre', 'Description']
    return df

# Preprocess data
def preprocess_data(df):
    X = df['Description']
    y = df['Genre']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
def train_model(X_train, y_train, model_type='naive_bayes'):
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    
    if model_type == 'naive_bayes':
        model = MultinomialNB()
    elif model_type == 'logistic_regression':
        model = LogisticRegression(max_iter=500)
    elif model_type == 'svm':
        model = SVC(kernel='linear', probability=True)
    else:
        raise ValueError("Invalid model type. Choose 'naive_bayes', 'logistic_regression', or 'svm'.")
    
    model.fit(X_train_tfidf, y_train)
    return vectorizer, model

# Evaluate model
def evaluate_model(vectorizer, model, X_test, y_test):
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = model.predict(X_test_tfidf)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Predict new genres from test data
def predict_from_file(file_path, vectorizer, model):
    test_data = load_data(file_path)
    X_test = test_data['Description']
    test_tfidf = vectorizer.transform(X_test)
    predictions = model.predict(test_tfidf)
    test_data['Predicted_Genre'] = predictions
    test_data.to_csv('test_data_with_predictions.csv', index=False)
    print("Predictions saved to test_data_with_predictions.csv")

if __name__ == "__main__":
    # Load and preprocess data
    data = load_data('train_data.txt')
    X_train, X_test, y_train, y_test = preprocess_data(data)
    
    # Train and evaluate the model
    vectorizer, model = train_model(X_train, y_train, model_type='naive_bayes')
    evaluate_model(vectorizer, model, X_test, y_test)
    
    # Save the trained model
    joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
    joblib.dump(model, 'genre_classifier.pkl')
    
    # Predict genres for test data
    predict_from_file('test_data.txt', vectorizer, model)


SyntaxError: invalid syntax (1237361084.py, line 12)