In [19]:
import pandas as pd
import re
import nltk

In [15]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [17]:
with open("/content/train_data.txt", "r") as file:
    text_data = file.read()
    matches = re.findall(r"\(\s*:::\s*(.*?)\s*:::\s*(.*?)\s*\)\s*", text_data)
    df = pd.DataFrame([{"description": desc, "genre": genre} for genre, desc in matches])
print(text_data)
nltk.download('punkt')
nltk.download('stopwords')

In [18]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    return ' '.join(tokens)

In [20]:
if not df.empty:
    df['description'] = df['description'].apply(preprocess_text)
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(df['description'])
    y = df['genre']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = MultinomialNB().fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    joblib.dump(clf, 'movie_genre_classifier.pkl')

    # Load test data from file
    with open("/content/test_data.txt", "r") as file:
        test_text_data = file.read()
        test_matches = re.findall(r"\(\s*:::\s*(.*?)\s*:::\s*(.*?)\s*\)\s*", test_text_data)
        test_df = pd.DataFrame([{"description": desc, "genre": genre} for genre, desc in test_matches])

    # Preprocess test data
    test_df['description'] = test_df['description'].apply(preprocess_text)
    test_X = vectorizer.transform(test_df['description'])
    test_y_pred = clf.predict(test_X)

    # Save test predictions to file
    with open("test_predictions.txt", "w") as file:
        for pred in test_y_pred:
            file.write(str(pred) + "\n")
else:
    print("No data found. Check the regex pattern and the content of 'train_data.txt'.")

No data found. Check the regex pattern and the content of 'train_data.txt'.
