In [1]:
!pip install tabulate
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from tabulate import tabulate  # Import tabulate for printing tables

# ---------------------------------------------------------------------------
# Load data and build TF-IDF features
# ---------------------------------------------------------------------------
# Both files are header-less and use ' ::: ' between fields. pandas treats a
# multi-character separator as a regular expression, which needs the python
# parser engine.
FIELD_SEPARATOR = ' ::: '

# Training rows carry the Genre label.
train_data = pd.read_csv(
    '/content/train_data.txt',
    delimiter=FIELD_SEPARATOR,
    engine='python',
    header=None,
    names=['ID', 'Title', 'Genre', 'Description'],
)

# Test rows have the same layout minus the Genre column.
test_data = pd.read_csv(
    '/content/test_data.txt',
    delimiter=FIELD_SEPARATOR,
    engine='python',
    header=None,
    names=['ID', 'Title', 'Description'],
)

# Combine title and description into one 'Text' column for feature extraction.
# Missing fields are replaced with '' first: string + NaN yields NaN, and
# TfidfVectorizer refuses NaN documents, so a single incomplete row would
# otherwise abort the whole run.
for frame in (train_data, test_data):
    title = frame['Title'].fillna('').astype(str)
    description = frame['Description'].fillna('').astype(str)
    frame['Text'] = title + ' ' + description

# Extract features and labels
X_train = train_data['Text']
y_train = train_data['Genre']
X_test = test_data['Text']

# TF-IDF over the combined text: English stop words removed, vocabulary capped
# at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit the vocabulary on the training text only, then reuse it for the test
# text so both matrices share the same columns.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ---------------------------------------------------------------------------
# Train the three candidate classifiers on the TF-IDF training matrix
# ---------------------------------------------------------------------------
# Fixed seed so the stochastic part of SVC (see below) is repeatable.
RANDOM_STATE = 42

# Naive Bayes (estimator.fit returns the fitted estimator itself).
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Logistic Regression with an explicit iteration cap of 1000.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Linear-kernel Support Vector Machine.
# random_state seeds the data shuffling SVC performs for its probability
# estimates; without it predict_proba output differs from run to run.
# NOTE(review): probability=True makes fit() run an extra internal
# cross-validation, which is slow on large data. Nothing in this cell calls
# predict_proba -- drop the flag if no later cell needs it.
svm_model = SVC(kernel='linear', probability=True, random_state=RANDOM_STATE)
svm_model.fit(X_train_tfidf, y_train)

# ---------------------------------------------------------------------------
# Evaluate each classifier on the data it was trained on
# ---------------------------------------------------------------------------
def report_training_fit(label, model):
    """Print training accuracy and a per-class report for a fitted model.

    Args:
        label: Human-readable model name used as the prefix of the accuracy line.
        model: Fitted classifier exposing ``predict``.

    Returns:
        The predictions ``model`` makes on ``X_train_tfidf``.
    """
    predictions = model.predict(X_train_tfidf)
    print(f"{label} Training Accuracy: ", accuracy_score(y_train, predictions))
    # zero_division=0 reports 0.0 for classes the model never predicts -- the
    # same value as the default -- without emitting a warning per class.
    print(classification_report(y_train, predictions, zero_division=0))
    return predictions


# NOTE(review): these scores are measured on the training rows themselves, so
# they are optimistic; a held-out split is needed to compare models fairly.
nb_pred_train = report_training_fit("Naive Bayes", nb_model)
lr_pred_train = report_training_fit("Logistic Regression", lr_model)
svm_pred_train = report_training_fit("SVM", svm_model)

# ---------------------------------------------------------------------------
# Predict genres for the test set and display them
# ---------------------------------------------------------------------------
# Logistic Regression is used for the final predictions.
# NOTE(review): the only metrics computed in this cell are training-set
# scores, so "performed best" is an assumption -- confirm on held-out data.
test_pred = lr_model.predict(X_test_tfidf)

# Store the predicted genre alongside each test row.
test_data['Predicted_Genre'] = test_pred

# Columns shown in the printed table; the same names serve as its headers.
report_columns = ['ID', 'Title', 'Predicted_Genre']

# tabulate takes a list of rows, so convert the selected columns to nested lists.
table = test_data.loc[:, report_columns].values.tolist()

# Render as a grid table. This prints every test row.
rendered_table = tabulate(table, headers=report_columns, tablefmt='grid')
print(rendered_table)



Naive Bayes Training Accuracy:  0.5339882744414515
              precision    recall  f1-score   support

      action       0.00      0.00      0.00       149
       adult       0.00      0.00      0.00        77
   adventure       0.00      0.00      0.00        87
   animation       0.00      0.00      0.00        57
   biography       0.00      0.00      0.00        38
      comedy       0.76      0.48      0.59       869
       crime       0.00      0.00      0.00        51
 documentary       0.59      0.94      0.73      1530
       drama       0.44      0.93      0.60      1557
      family       0.00      0.00      0.00        88
     fantasy       0.00      0.00      0.00        40
   game-show       0.00      0.00      0.00        26
     history       0.00      0.00      0.00        27
      horror       1.00      0.06      0.11       248
       music       0.00      0.00      0.00        77
     musical       0.00      0.00      0.00        33
     mystery       0.00      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.95      0.13      0.24       149
       adult       0.00      0.00      0.00        77
   adventure       1.00      0.10      0.19        87
   animation       0.00      0.00      0.00        57
   biography       0.00      0.00      0.00        38
      comedy       0.70      0.78      0.74       869
       crime       0.00      0.00      0.00        51
 documentary       0.67      0.96      0.79      1530
       drama       0.59      0.94      0.73      1557
      family       1.00      0.01      0.02        88
     fantasy       0.00      0.00      0.00        40
   game-show       1.00      0.15      0.27        26
     history       0.00      0.00      0.00        27
      horror       0.91      0.56      0.69       248
       music       1.00      0.18      0.31        77
     musical       0.00      0.00      0.00        33
     mystery       0.00      0.00      0.00        40
        news       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


+------+-------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|   ID | Title                                                                                                                               | Predicted_Genre   |
|    1 | Edgar's Lunch (1998)                                                                                                                | drama             |
+------+-------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|    2 | La guerra de papá (1977)                                                                                                            | drama             |
+------+-------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|    3 | Off the Beate