<a href="https://colab.research.google.com/github/Roja0230/MachineLearning-CodSoft/blob/main/MOVIE_GENRE_CLASSIFICATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell
!pip install -q kaggle nltk joblib


In [None]:
from google.colab import files
import zipfile, os

print("Upload archive.zip now (choose file from your PC)...")
uploaded = files.upload()  # select archive.zip

# The upload may be stored under a different name than "archive.zip"
# (e.g. "archive (1).zip"), so use whatever name Colab reports back.
if not uploaded:
    # Nothing was selected / the dialog was cancelled: fail with a clear message
    # instead of an IndexError on the empty mapping.
    raise FileNotFoundError("No file was uploaded; re-run this cell and select archive.zip.")
zip_name = next(iter(uploaded))
with zipfile.ZipFile(zip_name, 'r') as z:
    z.extractall('/content/data')

# list extracted files/folders
# The loop variables are deliberately NOT named `files`: that would rebind the
# `google.colab.files` module imported above to a plain list.
for root, _dirs, filenames in os.walk('/content/data'):
    print(root)
    for f in filenames:
        print("   ", f)

Upload archive.zip now (choose file from your PC)...


Saving archive (1).zip to archive (1).zip
/content/data
/content/data/Genre Classification Dataset
    description.txt
    train_data.txt
    test_data_solution.txt
    test_data.txt


In [None]:
!file "archive (1).zip"
!ls -lh "archive (1).zip"


archive (1).zip: Zip archive data, at least v4.5 to extract, compression method=deflate
-rw-r--r-- 1 root root 42M Aug 24 05:24 'archive (1).zip'


In [None]:
import glob

# Look for the two dataset files anywhere beneath the extraction directory.
train_paths = glob.glob('/content/data/**/train_data.txt', recursive=True)
test_paths  = glob.glob('/content/data/**/test_data.txt', recursive=True)

# Stop immediately if either file is missing; otherwise take the first match of each.
if not (train_paths and test_paths):
    raise FileNotFoundError("train_data.txt or test_data.txt not found inside uploaded files.")

train_path, test_path = train_paths[0], test_paths[0]
print("Train file:", train_path)
print("Test file:", test_path)

Train file: /content/data/Genre Classification Dataset/train_data.txt
Test file: /content/data/Genre Classification Dataset/test_data.txt


In [None]:
import pandas as pd

# Load train and test into a single 'raw' column each (tab is passed as the
# separator; NOTE(review): this assumes the lines contain no tab characters).
train_df = pd.read_csv(train_path, sep='\t', header=None, names=['raw'])
test_df  = pd.read_csv(test_path,  sep='\t', header=None, names=['raw'])

# Split into ID, Title, Genre, Plot.
# n=3 caps the split at four fields, so a " ::: " that occurs inside the plot
# text stays in the plot instead of creating a fifth column (which would make
# the column assignment below fail).
train_fields = train_df['raw'].str.split(" ::: ", n=3, expand=True)
train_fields.columns = ['ID', 'Title', 'Genre', 'Plot']
# Rebind to the parsed frame; this discards 'raw' without an in-place drop.
train_df = train_fields

print("Train shape:", train_df.shape)
train_df.head()

Train shape: (54214, 4)


Unnamed: 0,ID,Title,Genre,Plot
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [None]:
# Fold closely related genre labels into a shared combined label.
merge_map = dict.fromkeys(('fantasy', 'adventure'), 'fantasy/adventure')
merge_map.update(dict.fromkeys(('musical', 'music'), 'music/musical'))

# Labels that are not keys of merge_map are left as they are.
train_df['Genre'] = train_df['Genre'].replace(merge_map)


In [None]:
# Keep only the genres that have at least `min_samples` rows.
min_samples = 500
genre_counts = train_df['Genre'].value_counts()
valid_genres = genre_counts[genre_counts >= min_samples].index

keep_mask = train_df['Genre'].isin(valid_genres)
train_df = train_df[keep_mask].reset_index(drop=True)

print("Remaining genres:", train_df['Genre'].nunique())
print(train_df['Genre'].value_counts())


Remaining genres: 16
Genre
drama                13613
documentary          13096
comedy                7447
short                 5073
horror                2204
thriller              1591
action                1315
fantasy/adventure     1098
western               1032
music/musical         1008
reality-tv             884
family                 784
romance                672
sci-fi                 647
adult                  590
crime                  505
Name: count, dtype: int64


In [None]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared resources used by clean_text below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(s):
    """Normalise a plot description before vectorisation.

    The value is converted to ``str`` and lower-cased, every character other
    than ``a-z``, ``0-9`` and whitespace is replaced by a space, and the result
    is split on whitespace. Tokens present in ``stop_words`` are dropped and
    each remaining token is passed through ``lemmatizer.lemmatize``.

    Returns the surviving tokens joined by single spaces.
    """
    normalised = re.sub(r'[^a-z0-9\s]', ' ', str(s).lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in normalised.split()
        if word not in stop_words
    )

# Add the cleaned text as a new column next to the original plot.
train_df['Clean_Plot'] = train_df['Plot'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#TF-IDF + train/test split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

y = train_df['Genre']

# Split the cleaned text BEFORE vectorising. Fitting TF-IDF on the whole corpus
# lets the held-out documents contribute to the vocabulary and IDF weights,
# i.e. information from the evaluation split leaks into the features.
text_train, text_test, y_train, y_test = train_test_split(
    train_df['Clean_Plot'], y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train = tfidf.fit_transform(text_train)  # vocabulary and IDF learned from the training split only
X_test = tfidf.transform(text_test)        # held-out split reuses the fitted vocabulary

# Full-corpus matrix, kept so the name `X` is still defined for any cell that expects it.
X = tfidf.transform(train_df['Clean_Plot'])

In [None]:
#Train Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Build the class-weighted classifier and fit it on the training split
# (`fit` returns the estimator itself).
model = LogisticRegression(max_iter=2000, class_weight='balanced').fit(X_train, y_train)

# Score the predictions for the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)
print("Accuracy:", accuracy)
print(report)

Accuracy: 0.5554693560899923
                   precision    recall  f1-score   support

           action       0.33      0.54      0.41       263
            adult       0.45      0.71      0.55       118
           comedy       0.61      0.54      0.57      1489
            crime       0.16      0.30      0.21       101
      documentary       0.81      0.70      0.75      2619
            drama       0.70      0.46      0.56      2723
           family       0.21      0.41      0.28       157
fantasy/adventure       0.24      0.36      0.29       220
           horror       0.59      0.70      0.64       441
    music/musical       0.43      0.72      0.54       202
       reality-tv       0.32      0.66      0.43       177
          romance       0.13      0.36      0.19       134
           sci-fi       0.33      0.52      0.40       129
            short       0.44      0.45      0.44      1015
         thriller       0.29      0.38      0.32       318
          western       0.

In [None]:
#  Prediction function

def predict_genre(plot_text):
    """Return the genre `model` predicts for one raw plot description.

    The text is cleaned with `clean_text`, vectorised with the fitted `tfidf`
    and handed to `model.predict`; the single resulting label is returned.
    """
    features = tfidf.transform([clean_text(plot_text)])
    predictions = model.predict(features)
    return predictions[0]

# Example
print(predict_genre("A young wizard attends a magical school and faces a dark enemy."))


fantasy/adventure


In [None]:
#Compare Logistic Regression, Naive Bayes, and SVM


from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Candidate classifiers, keyed by the label printed in each report header.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=2000, class_weight='balanced'),
    MultinomialNB=MultinomialNB(),
    LinearSVC=LinearSVC(max_iter=20000, class_weight='balanced'),
)

# Fit every candidate on the training split and report on the held-out split.
for name in models:
    clf = models[name].fit(X_train, y_train)  # fit returns the estimator itself
    preds = clf.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"\n=== {name} ===")
    print("Accuracy:", acc)
    print(classification_report(y_test, preds, zero_division=0))


=== LogisticRegression ===
Accuracy: 0.5554693560899923
                   precision    recall  f1-score   support

           action       0.33      0.54      0.41       263
            adult       0.45      0.71      0.55       118
           comedy       0.61      0.54      0.57      1489
            crime       0.16      0.30      0.21       101
      documentary       0.81      0.70      0.75      2619
            drama       0.70      0.46      0.56      2723
           family       0.21      0.41      0.28       157
fantasy/adventure       0.24      0.36      0.29       220
           horror       0.59      0.70      0.64       441
    music/musical       0.43      0.72      0.54       202
       reality-tv       0.32      0.66      0.43       177
          romance       0.13      0.36      0.19       134
           sci-fi       0.33      0.52      0.40       129
            short       0.44      0.45      0.44      1015
         thriller       0.29      0.38      0.32       31

In [None]:
# Choose best model for prediction

# Select the fitted candidate with the highest accuracy on the held-out split
# instead of hard-coding one, so the choice follows the comparison results.
# On a tie, `max` keeps the earliest entry of `models`.
best_name = max(models, key=lambda name: accuracy_score(y_test, models[name].predict(X_test)))
best_model = models[best_name]
print("Best model:", best_name)

def predict_genre_best(plot_text):
    """Return the genre `best_model` predicts for one raw plot description.

    The text is cleaned with `clean_text` and vectorised with the fitted
    `tfidf` before being passed to `best_model.predict`.
    """
    plot_text = clean_text(plot_text)
    vec = tfidf.transform([plot_text])
    return best_model.predict(vec)[0]

# Example
print(predict_genre_best("A group of explorers discover a lost city and face many dangers."))


fantasy/adventure


In [None]:
#  Save the model & vectorizer

import joblib
from google.colab import files

# Objects to persist, paired with the file each one is written to.
artifacts = (
    (best_model, "genre_model.joblib"),
    (tfidf, "tfidf_vectorizer.joblib"),
)

# Save best model and TF-IDF vectorizer
for obj, filename in artifacts:
    joblib.dump(obj, filename)

# Download them to your computer (only after both files have been written)
for _, filename in artifacts:
    files.download(filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#  Load model later

import joblib

# Restore the persisted classifier and TF-IDF vectorizer from the working directory.
model = joblib.load("genre_model.joblib")
tfidf = joblib.load("tfidf_vectorizer.joblib")

# Prediction function
def predict_genre(plot_text):
    """Return the genre the reloaded `model` predicts for one raw plot description.

    Relies on the `clean_text` function defined earlier in the notebook so the
    text is cleaned the same way before it is vectorised with `tfidf`.
    """
    cleaned = clean_text(plot_text)
    return model.predict(tfidf.transform([cleaned]))[0]

# Test
print(predict_genre("A superhero must save the city from a powerful villain."))


action


In [None]:
import joblib
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load model and vectorizer
model = joblib.load("/content/genre_model.joblib")
tfidf = joblib.load("/content/tfidf_vectorizer.joblib")

# Text cleaner (must match training preprocessing)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(s):
    """Normalise a plot description in the same way as the training cell.

    Converts the value to ``str`` and lower-cases it, replaces every character
    other than ``a-z``, ``0-9`` and whitespace with a space, splits on
    whitespace, drops tokens found in ``stop_words`` and lemmatizes the rest.

    Returns the remaining tokens joined by single spaces.
    """
    normalised = re.sub(r'[^a-z0-9\s]', ' ', str(s).lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in normalised.split()
        if word not in stop_words
    )

# Prediction function
def predict_genre(plot_text):
    """Return the genre the loaded `model` predicts for one raw plot description."""
    features = tfidf.transform([clean_text(plot_text)])
    predictions = model.predict(features)
    return predictions[0]

# Test
print(predict_genre("A young wizard discovers his magical heritage and battles a dark enemy at a magic school."))


fantasy/adventure


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
