In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import re
import string

In [9]:
# Text normalisation helper applied to every plot summary
def preprocess_text(text):
    """Normalise a raw string before it is handed to the vectorizer.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, delete runs of digits, then collapse each run
    of whitespace into a single space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Deleting via a translation table removes the same characters as a
    # regex character class built from string.punctuation.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_punctuation = lowered.translate(drop_punctuation)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    # split() with no argument discards leading/trailing whitespace and
    # treats any whitespace run as one separator.
    return ' '.join(without_digits.split())

In [10]:
# Load and preprocess training data
# Each record is one line of the form: index ::: movie_name ::: genre ::: plot
train_data = []
skipped_train_lines = 0
with open('train_data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # blank lines carry no record
        # maxsplit=3 keeps the plot in one piece even when the plot text itself
        # contains the ' ::: ' separator (such records used to be dropped silently).
        parts = line.split(' ::: ', 3)
        if len(parts) != 4:
            skipped_train_lines += 1
            continue
        index, movie_name, genre, plot = parts
        train_data.append({'index': index, 'movie_name': movie_name, 'genre': genre, 'plot': preprocess_text(plot)})

# Surface data-quality problems instead of hiding them.
if skipped_train_lines:
    print(f'Skipped {skipped_train_lines} malformed line(s) in train_data.txt')

# Explicit columns keep the frame's schema stable even if no record was parsed.
train_df = pd.DataFrame(train_data, columns=['index', 'movie_name', 'genre', 'plot'])

In [11]:
# Vectorize the plot summaries
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf_vectorizer.fit_transform(train_df['plot'])
y = train_df['genre']

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Candidate classifiers, keyed by the display name that is later used both in
# the printed report and as a column name in the results table.
models = dict()
models['Logistic Regression'] = LogisticRegression(max_iter=1000)
models['Naive Bayes'] = MultinomialNB()
# NOTE(review): the linear-kernel SVC is left disabled, as in the original cell;
# presumably because of training cost on this dataset — confirm before enabling.
# models['Support Vector Machine'] = SVC(kernel='linear')

In [14]:
# Fit each candidate on the training split and report its accuracy
# (as a percentage) on the held-out split.
for model_name in models:
    model = models[model_name]
    # fit() returns the fitted estimator, so training and prediction chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f'--- {model_name} ---')
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy*100:.4f}\n")

--- Logistic Regression ---
Accuracy: 57.6040

--- Naive Bayes ---
Accuracy: 52.2088



In [15]:
# Load and preprocess test data
# Each record is one line of the form: index ::: movie_name ::: plot
test_data = []
skipped_test_lines = 0
with open('test_data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # blank lines carry no record
        # maxsplit=2 keeps the plot in one piece even when the plot text itself
        # contains the ' ::: ' separator; otherwise that movie would silently
        # get no prediction.
        parts = line.split(' ::: ', 2)
        if len(parts) != 3:
            skipped_test_lines += 1
            continue
        index, movie_name, plot = parts
        test_data.append({'index': index, 'movie_name': movie_name, 'plot': preprocess_text(plot)})

# Surface data-quality problems instead of hiding them.
if skipped_test_lines:
    print(f'Skipped {skipped_test_lines} malformed line(s) in test_data.txt')

# Explicit columns keep the frame's schema stable even if no record was parsed.
test_df = pd.DataFrame(test_data, columns=['index', 'movie_name', 'plot'])

In [16]:
# Project the unseen plots into the already-fitted TF-IDF feature space
# (transform only: the vectorizer is not refitted here).
X_new = tfidf_vectorizer.transform(test_df.loc[:, 'plot'])

In [17]:
# Predicted genre labels for the unseen plots, one array per model name
predictions = {
    model_name: model.predict(X_new)
    for model_name, model in models.items()
}

In [18]:
# Build the results table: the identifying columns from the test set plus one
# prediction column per model, in the order the models were declared.
# assign() works on a copy, so test_df itself is left untouched.
results_df = test_df[['index', 'movie_name']].assign(
    **{model_name: predictions[model_name] for model_name in models}
)

In [20]:
# Write the predictions table to an Excel workbook, omitting the row index
output_path = 'genre_predictions.xlsx'
results_df.to_excel(output_path, index=False)

In [21]:
# Show the leading rows of the predictions table as plain text
preview_rows = results_df.head()
print(preview_rows)

  index                   movie_name Logistic Regression  Naive Bayes
0     1         Edgar's Lunch (1998)               short        drama
1     2     La guerra de papá (1977)               drama        drama
2     3  Off the Beaten Track (2010)         documentary  documentary
3     4       Meu Amigo Hindu (2015)               drama        drama
4     5            Er nu zhai (1955)               drama        drama
