In [5]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopword')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Error loading stopword: Package 'stopword' not found in
[nltk_data]     index
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
file_path="train_data.txt"

# Load dataset


In [7]:
df = pd.read_csv(file_path, sep=":::", engine="python", header=None)
df.columns = ["id", "title", "genre", "plot"]

In [8]:
print("Shape:", df.shape)

Shape: (54214, 4)


In [9]:
print("head:",df.head)

head: <bound method NDFrame.head of           id                                         title          genre  \
0          1                 Oscar et la dame rose (2009)          drama    
1          2                                 Cupid (1997)       thriller    
2          3             Young, Wild and Wonderful (1980)          adult    
3          4                        The Secret Sin (1915)          drama    
4          5                       The Unrecovered (2007)          drama    
...      ...                                           ...            ...   
54209  54210                              "Bonino" (1953)         comedy    
54210  54211                  Dead Girls Don't Cry (????)         horror    
54211  54212    Ronald Goedemondt: Ze bestaan echt (2008)    documentary    
54212  54213                     Make Your Own Bed (1944)         comedy    
54213  54214   Nature's Fury: Storm of the Century (2006)        history    

                                       

In [10]:
print("\nTotal genres:", df["genre"].nunique())


Total genres: 27


In [11]:
print("\nTop 10 genres:\n")
print(df["genre"].value_counts().head(10))


Top 10 genres:

genre
drama           13613
documentary     13096
comedy           7447
short            5073
horror           2204
thriller         1591
action           1315
western          1032
reality-tv        884
family            784
Name: count, dtype: int64


In [12]:
print("number of unique genre:",df.genre)

number of unique genre: 0               drama 
1            thriller 
2               adult 
3               drama 
4               drama 
             ...      
54209          comedy 
54210          horror 
54211     documentary 
54212          comedy 
54213         history 
Name: genre, Length: 54214, dtype: object


# STEP 2: Clean the Data

In [13]:
# Remove null values
df=df.dropna()

In [14]:
# Remove duplicates
df=df.drop_duplicates()

In [15]:
# Strip extra spaces
df["plot"] = df["plot"].str.strip()
df["genre"] = df["genre"].str.strip()

print("After cleaning shape:", df.shape)

After cleaning shape: (54214, 4)


# STEP 3: Text Preprocessing

In [16]:

stop_words=set(stopwords.words('english'))
print(list(stop_words)[:10])




['yourselves', 'its', 'how', "i'll", "she's", 'being', "don't", 'am', 'shan', 'needn']


In [17]:
lemmatizer=WordNetLemmatizer()
def clean_text(text):
    text=text.lower()
    text=re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

#Apply cleaning
df["clean_plot"]=df["plot"].apply(clean_text)
print(df[["plot","clean_plot"]].head())

                                                plot  \
0  Listening in to a conversation between his doc...   
1  A brother and sister with a past incestuous re...   
2  As the bus empties the students for their fiel...   
3  To help their unemployed father make ends meet...   
4  The film's title refers not only to the un-rec...   

                                          clean_plot  
0  listening conversation doctor parent yearold o...  
1  brother sister past incestuous relationship cu...  
2  bus empty student field trip museum natural hi...  
3  help unemployed father make end meet edith twi...  
4  film title refers unrecovered body ground zero...  


# Train-Test Split

In [18]:

X=df["clean_plot"]
y=df["genre"]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
print("Training samples:",X_train.shape)
print("Training samples:",X_test.shape)

Training samples: (43371,)
Training samples: (10843,)


# TF-IDF VECTORIZATION:Term Frequency-inverse document frequency

In [19]:

tfidf=TfidfVectorizer(max_features=5000,ngram_range=(1,2))

X_train_tfidf=tfidf.fit_transform(X_train)
X_test_tfidf=tfidf.transform(X_test)
print("TF-IDF shape:",X_train_tfidf.shape)

TF-IDF shape: (43371, 5000)


# Train Models


In [24]:
#Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nb_model=MultinomialNB()
nb_model.fit(X_train_tfidf,y_train)
nb_preds=nb_model.predict(X_test_tfidf)

In [25]:
#logistic regression
from sklearn.linear_model import LogisticRegression
lr_model=LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf,y_train)

lr_preds=lr_model.predict(X_test_tfidf)

# Evaluation

In [26]:
from sklearn.metrics import classification_report, accuracy_score

print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_preds))
print(classification_report(y_test, nb_preds))

print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_preds))
print(classification_report(y_test, lr_preds))


Naive Bayes Accuracy: 0.5207046020474039
              precision    recall  f1-score   support

      action       0.66      0.10      0.17       263
       adult       0.69      0.09      0.16       118
   adventure       0.64      0.06      0.11       155
   animation       0.00      0.00      0.00       100
   biography       0.00      0.00      0.00        53
      comedy       0.51      0.44      0.47      1490
       crime       0.00      0.00      0.00       101
 documentary       0.57      0.88      0.69      2619
       drama       0.46      0.81      0.58      2723
      family       0.00      0.00      0.00       157
     fantasy       0.00      0.00      0.00        65
   game-show       1.00      0.18      0.30        39
     history       0.00      0.00      0.00        49
      horror       0.77      0.34      0.47       441
       music       0.71      0.08      0.15       146
     musical       0.00      0.00      0.00        55
     mystery       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

      action       0.48      0.26      0.34       263
       adult       0.72      0.29      0.41       118
   adventure       0.55      0.12      0.19       155
   animation       0.50      0.08      0.14       100
   biography       0.00      0.00      0.00        53
      comedy       0.53      0.59      0.56      1490
       crime       0.14      0.02      0.03       101
 documentary       0.67      0.85      0.75      2619
       drama       0.54      0.77      0.63      2723
      family       0.45      0.09      0.15       157
     fantasy       0.00      0.00      0.00        65
   game-show       0.86      0.49      0.62        39
     history       0.00      0.00      0.00        49
      horror       0.64      0.58      0.61       441
       music       0.66      0.42      0.51       146
     musical       0.67      0.04      0.07        55
     mystery       0.33      0.02      0.03        64
        news       0.67    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Test on Your Own Text

In [27]:
sample = ["A young boy discovers he has magical powers and attends a wizard school"]

sample_clean = clean_text(sample[0])
sample_vec = tfidf.transform([sample_clean])

print("Naive Bayes Prediction:", nb_model.predict(sample_vec)[0])
print("Logistic Regression Prediction:", lr_model.predict(sample_vec)[0])


Naive Bayes Prediction: drama
Logistic Regression Prediction: short


# Save Model

In [28]:
import joblib

joblib.dump(lr_model, "movie_genre_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']