In [1]:
##STEP 1: Upload Dataset
from google.colab import files
uploaded = files.upload()


Saving archive (1) (1).zip to archive (1) (1).zip


In [3]:
import zipfile
import io

zip_ref = zipfile.ZipFile(io.BytesIO(uploaded['archive (1) (1).zip']), 'r')
zip_ref.extractall('/content/dataset')
zip_ref.close()

In [11]:
import os

os.listdir('/content')


['.config', 'dataset', 'archive (1) (1).zip', 'sample_data']

In [12]:
os.listdir('/content/dataset')


['Genre Classification Dataset']

In [14]:
import zipfile

# The original error was FileNotFoundError because the path was incorrect.
# The uploaded file is named 'archive (1) (1).zip', not 'archive (1).zip'.
zip_path = "/content/archive (1) (1).zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    print(zip_ref.namelist())

['Genre Classification Dataset/description.txt', 'Genre Classification Dataset/test_data.txt', 'Genre Classification Dataset/test_data_solution.txt', 'Genre Classification Dataset/train_data.txt']


In [15]:
import zipfile

zip_path = "/content/archive (1) (1).zip"
extract_path = "/content/dataset"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extraction Complete")


Extraction Complete


In [16]:
##STEP 2: Load Dataset
import os
os.listdir('/content/dataset')


['Genre Classification Dataset']

In [22]:
import pandas as pd

# The error 'IsADirectoryError: [Errno 21] Is a directory' means that you are trying to read a directory as if it were a file.
# '/content/dataset' is a directory, not a CSV file.
# You need to specify the path to an actual data file *inside* that directory.

# Based on the content of the zip file, 'train_data.txt' is a likely candidate for the main training data.
# Genre classification datasets often have tab-separated values without a header.
# You might need to adjust 'train_data.txt' to the actual data file name (e.g., 'test_data.txt', 'description.txt')
# and also adjust the 'sep' parameter if it's not tab-separated (e.g., sep=',').

df = pd.read_csv("/content/dataset/Genre Classification Dataset/train_data.txt", sep='\t', header=None, names=['text', 'label'])
df.head()

Unnamed: 0,text,label
0,1 ::: Oscar et la dame rose (2009) ::: drama :...,
1,2 ::: Cupid (1997) ::: thriller ::: A brother ...,
2,"3 ::: Young, Wild and Wonderful (1980) ::: adu...",
3,4 ::: The Secret Sin (1915) ::: drama ::: To h...,
4,5 ::: The Unrecovered (2007) ::: drama ::: The...,


In [23]:
df.columns


Index(['text', 'label'], dtype='object')

In [25]:
##STEP 3: Clean Text
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

# The original error was KeyError: 'plot' because there is no 'plot' column in the DataFrame.
# Based on the previous DataFrame structure, the text content is in the 'text' column.
df['clean_plot'] = df['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
##STEP 4: Convert Text to Numbers (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(df['clean_plot'])

# The original error was KeyError: 'genre' because there is no 'genre' column in the DataFrame.
# Looking at the 'text' column, the genre seems to be embedded in the format 'ID ::: Title (Year) ::: Genre ::: Description'.
# We need to extract the genre from the 'text' column first.
df['genre'] = df['text'].apply(lambda x: x.split(' ::: ')[2])

y = df['genre']

In [29]:
##STEP 5: Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [30]:
##STEP 6: Train Model (Naive Bayes)
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)


In [31]:
##STEP 7: Evaluate Model
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.5977128101079037
              precision    recall  f1-score   support

      action       0.75      0.17      0.28       263
       adult       1.00      0.11      0.19       112
   adventure       0.67      0.09      0.15       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.64      0.62      0.63      1443
       crime       0.00      0.00      0.00       107
 documentary       0.62      0.92      0.74      2659
       drama       0.52      0.88      0.65      2697
      family       1.00      0.01      0.01       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.15      0.26        40
     history       0.00      0.00      0.00        45
      horror       0.82      0.55      0.66       431
       music       0.89      0.17      0.28       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0.00        5

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
##STEP 8: Try Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_lr))


Accuracy: 0.9352577699898552


In [33]:
##STEP 9: Test with Custom Movie
new_movie = ["A group of astronauts travel through a wormhole in space"]

cleaned = clean_text(new_movie[0])
vectorized = vectorizer.transform([cleaned])

prediction = model.predict(vectorized)
print("Predicted Genre:", prediction[0])


Predicted Genre: documentary
