In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
def parse_train_dataset(train_file_path):
    """Parse the ':::'-delimited training file into a DataFrame.

    Each usable line has the form ``Index ::: Title ::: Genre ::: Plot``.
    Lines with fewer than four fields are skipped silently.

    Parameters
    ----------
    train_file_path : str or path-like
        Location of the UTF-8 encoded training file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Index``, ``Title``, ``Genre``, ``Plot`` (all stripped
        strings). The columns are present even when no line could be parsed.
    """
    columns = ['Index', 'Title', 'Genre', 'Plot']
    data = []
    with open(train_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Split on the first three separators only, so a ':::' that occurs
            # inside the plot text stays part of the plot instead of causing
            # the whole row to be discarded.
            parts = line.strip().split(':::', len(columns) - 1)
            if len(parts) == len(columns):
                data.append({name: value.strip() for name, value in zip(columns, parts)})
    # Passing the column list keeps the schema stable for an empty result, so
    # later column lookups do not fail with a KeyError.
    return pd.DataFrame(data, columns=columns)

# Location of the raw training file
train_file_path = '/content/train_data.txt'

# Load the training file into a DataFrame
df_train = parse_train_dataset(train_file_path)

# Preview: header line followed by the first five rows
print("Train Dataset:", df_train.head(), sep="\n")

Train Dataset:
  Index                             Title     Genre  \
0     1      Oscar et la dame rose (2009)     drama   
1     2                      Cupid (1997)  thriller   
2     3  Young, Wild and Wonderful (1980)     adult   
3     4             The Secret Sin (1915)     drama   
4     5            The Unrecovered (2007)     drama   

                                                Plot  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  


In [None]:
def parse_test_dataset(test_file_path):
    """Parse the ':::'-delimited test file into a DataFrame.

    Each usable line has the form ``Index ::: Title ::: Plot``. Lines with
    fewer than three fields are skipped silently. Everything after the second
    separator is treated as the plot.

    Parameters
    ----------
    test_file_path : str or path-like
        Location of the UTF-8 encoded test file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Index``, ``Title``, ``Plot`` (stripped strings) plus an
        empty-string ``Genre`` placeholder. The columns are present even when
        no line could be parsed.
    """
    fields = ['Index', 'Title', 'Plot']
    data = []
    with open(test_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Split on the first two separators only, so a ':::' that occurs
            # inside the plot text does not cause the row to be discarded.
            parts = line.strip().split(':::', len(fields) - 1)
            if len(parts) == len(fields):
                row = {name: value.strip() for name, value in zip(fields, parts)}
                row['Genre'] = ''  # Empty column for Genre in test data
                data.append(row)
    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=fields + ['Genre'])

# Location of the raw test file
test_file_path = '/content/test_data.txt'

# Load the test file into a DataFrame
df_test = parse_test_dataset(test_file_path)

# Preview: header line (preceded by a blank line) and the first five rows
print("\nTest Dataset:", df_test.head(), sep="\n")


Test Dataset:
  Index                        Title  \
0     1         Edgar's Lunch (1998)   
1     2     La guerra de papá (1977)   
2     3  Off the Beaten Track (2010)   
3     4       Meu Amigo Hindu (2015)   
4     5            Er nu zhai (1955)   

                                                Plot Genre  
0  L.R. Brane loves his life - his car, his apart...        
1  Spain, March 1964: Quico is a very naughty chi...        
2  One year in the life of Albin and his family o...        
3  His father has died, he hasn't spoken with his...        
4  Before he was known internationally as a marti...        


In [None]:
 df_train = df_train.dropna(subset=['Title', 'Genre', 'Plot'])
df_test = df_test.dropna(subset=['Index', 'Plot'])

In [None]:
# Normalise plot text to lower case in both frames (in place).
for _frame in (df_train, df_test):
    _frame['Plot'] = _frame['Plot'].str.lower()

In [None]:
# Hold out 20% of the labelled plots for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    df_train['Plot'],
    df_train['Genre'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Naive Bayes model: TF-IDF features (vocabulary capped at 5000 terms)
# feeding a multinomial Naive Bayes classifier.
pipeline_nb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', MultinomialNB()),
    ]
)

In [None]:
# Logistic Regression model: TF-IDF features (vocabulary capped at 5000 terms)
# feeding a logistic regression classifier with a raised iteration limit.
pipeline_lr = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('clf', LogisticRegression(max_iter=1000)),
    ]
)

In [None]:
# Fit both models on the training split only. X_train / y_train come from the
# train_test_split cell; rebinding them to the full df_train here would put the
# validation rows into the training data and inflate the validation scores.
pipeline_nb.fit(X_train, y_train)
pipeline_lr.fit(X_train, y_train)

In [None]:
# Predict genres for the validation plots with each fitted model
# (Naive Bayes first, then Logistic Regression).
y_pred_nb, y_pred_lr = (
    model.predict(X_val) for model in (pipeline_nb, pipeline_lr)
)

In [None]:
# Some genres receive no predictions at all, which makes their precision
# undefined. zero_division=0 reports those scores as 0 without emitting an
# UndefinedMetricWarning for every affected average.
print("Naive Bayes Classification Report:")
print(classification_report(y_val, y_pred_nb, zero_division=0))

print("\nLogistic Regression Classification Report:")
print(classification_report(y_val, y_pred_lr, zero_division=0))

Naive Bayes Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.69      0.12      0.20       263
       adult       0.78      0.06      0.12       112
   adventure       0.56      0.04      0.07       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.55      0.46      0.50      1443
       crime       0.00      0.00      0.00       107
 documentary       0.58      0.90      0.70      2659
       drama       0.47      0.84      0.60      2697
      family       1.00      0.01      0.01       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.07      0.14        40
     history       0.00      0.00      0.00        45
      horror       0.78      0.38      0.51       431
       music       0.92      0.16      0.27       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0.00        56
        news       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Predict genres for the unlabelled test plots with each fitted model
# (Naive Bayes first, then Logistic Regression).
final_predictions_nb, final_predictions_lr = (
    model.predict(df_test['Plot']) for model in (pipeline_nb, pipeline_lr)
)


In [None]:
# Store each model's predictions next to the test rows they belong to.
df_test['Predicted_Genre_NB'], df_test['Predicted_Genre_LR'] = (
    final_predictions_nb,
    final_predictions_lr,
)

In [None]:
# Preview the Naive Bayes predictions for the first five test rows.
nb_preview_columns = ['Index', 'Plot', 'Predicted_Genre_NB']
print(
    "\nPredictions for Test Set using Naive Bayes:",
    df_test[nb_preview_columns].head(),
    sep="\n",
)



Predictions for Test Set using Naive Bayes:
  Index                                               Plot Predicted_Genre_NB
0     1  l.r. brane loves his life - his car, his apart...              drama
1     2  spain, march 1964: quico is a very naughty chi...              drama
2     3  one year in the life of albin and his family o...        documentary
3     4  his father has died, he hasn't spoken with his...              drama
4     5  before he was known internationally as a marti...              drama


In [None]:
# Preview the Logistic Regression predictions for the first five test rows.
lr_preview_columns = ['Index', 'Plot', 'Predicted_Genre_LR']
print(
    "\nPredictions for Test Set using Logistic Regression:",
    df_test[lr_preview_columns].head(),
    sep="\n",
)


Predictions for Test Set using Logistic Regression:
  Index                                               Plot Predicted_Genre_LR
0     1  l.r. brane loves his life - his car, his apart...              drama
1     2  spain, march 1964: quico is a very naughty chi...              drama
2     3  one year in the life of albin and his family o...        documentary
3     4  his father has died, he hasn't spoken with his...              drama
4     5  before he was known internationally as a marti...              drama
