In [14]:
#Step 1: Import Required Libraries

# Data handling
import pandas as pd

# Machine learning utilities
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Model evaluation
from sklearn.metrics import accuracy_score, classification_report

# Save model
import joblib


In [15]:
#pandas = loads and manages tabular data
#TfidfVectorizer = converts text into numeric TF-IDF features
#LinearSVC = linear support vector machine used for classification
#Pipeline = chains the vectorizer and the classifier into a single estimator

In [16]:
#Step 2: Load the Dataset

# File location
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
file_path = r"C:\Users\DeLL\Downloads\archive (1)\Genre Classification Dataset\train_data.txt"

# Load text file into DataFrame.
# A multi-character separator is interpreted as a regular expression by the
# python engine. The pattern also consumes the whitespace around ":::" so that
# parsed values come back as "drama" rather than " drama " (the padded form
# leaks into the genre labels and into every prediction the model makes).
data = pd.read_csv(
    file_path,
    sep=r"\s*:::\s*",
    engine="python",
    header=None,  # the file has no header row; column names are supplied below
    names=["movie_id", "movie_title", "category", "summary"]
)

# Display dataset info
print(data.shape)
data.head()


(54214, 4)


Unnamed: 0,movie_id,movie_title,category,summary
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [17]:
#The dataset is a plain-text (.txt) file
#Fields on each line are delimited by ":::"
#The file has no header row, so the column names are supplied via names=

In [18]:
#Step 3: Clean the Data

# Drop rows lacking a summary or a category, then shuffle the remaining rows
# with a fixed seed and renumber the index from 0.
data = (
    data
    .dropna(subset=["summary", "category"])
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)

print("Cleaned data size:", data.shape)


Cleaned data size: (54214, 4)


In [19]:
#Rows with a missing summary or category cannot be used for training, so they are dropped
#Shuffling removes any ordering present in the source file

In [20]:
#Step 4: Separate Features and Target

# text_data holds the movie summaries (model input);
# labels holds the genre of each summary (prediction target).
text_data, labels = data["summary"], data["category"]


In [21]:
#Step 5: Split Data into Train & Test

# Hold out 25% of the rows for testing. Stratifying on the labels and fixing
# the seed keep the split class-proportional and repeatable.
split_options = dict(test_size=0.25, random_state=10, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(text_data, labels, **split_options)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 40660
Test size: 13554


In [22]:
#Training data teaches the model
#Testing data checks accuracy on examples the model has not seen
#stratify=labels keeps each genre's proportion the same in the train and test sets

In [23]:
#Step 6: Create ML Pipeline
# The pipeline chains TF-IDF vectorization and a linear SVM classifier so the
# same text transformation is applied at fit time and at predict time.
genre_model = Pipeline([
    ("vectorizer", TfidfVectorizer(
        stop_words="english",   # drop common English function words
        max_df=0.9,             # ignore terms found in more than 90% of summaries
        min_df=5,               # ignore terms found in fewer than 5 summaries
        ngram_range=(1, 2)      # use single words and two-word phrases
    )),
    # Fixed seed, in line with the seeded shuffle and split, so repeated runs
    # fit the same model.
    ("classifier", LinearSVC(random_state=42))
])


In [24]:
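#TF-IDF converts text to numeric feature vectors
#A linear SVM classifies the genres
#The Pipeline applies the same vectorization at training and prediction time, which avoids manual mistakes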

In [25]:
#Step 7: Train the Model

# Fit the vectorizer and the classifier on the training split only
genre_model.fit(X_train, y_train)
# The status message previously contained a mis-decoded (mojibake) check mark.
print("Model training completed ✅")


Model training completed ✅


In [26]:
#Model learns patterns from movie descriptions

In [27]:
#Step 8: Evaluate Model Performance

# Predict a genre for every held-out summary
predictions = genre_model.predict(X_test)

# Overall fraction of correct predictions on the test split
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", test_accuracy)

# Per-genre precision, recall, F1 and support
print("\nDetailed Report:\n")
per_genre_report = classification_report(y_test, predictions)
print(per_genre_report)


Accuracy: 0.5838866755201416

Detailed Report:

               precision    recall  f1-score   support

      action        0.39      0.28      0.33       329
       adult        0.70      0.45      0.55       148
   adventure        0.42      0.20      0.27       194
   animation        0.39      0.15      0.21       124
   biography        0.00      0.00      0.00        66
      comedy        0.53      0.60      0.56      1862
       crime        0.33      0.05      0.08       126
 documentary        0.69      0.83      0.75      3274
       drama        0.56      0.72      0.63      3403
      family        0.37      0.13      0.19       196
     fantasy        0.17      0.02      0.04        81
   game-show        0.88      0.62      0.73        48
     history        0.18      0.03      0.06        61
      horror        0.63      0.65      0.64       551
       music        0.68      0.57      0.62       183
     musical        0.67      0.09      0.15        69
     mystery    

In [28]:
#Accuracy = fraction of test summaries whose genre is predicted correctly
#Precision, Recall, F1 = per-genre performance

In [29]:
#Step 9: Predict Genre for New Movie

# New movie description (predict expects a collection of documents, hence the list)
new_plot = [
    "A fearless officer fights a powerful criminal gang to protect his city and family"
]

# Predict genre
predicted_output = genre_model.predict(new_plot)
# Labels parsed from the ":::"-delimited file can carry padding spaces
# (e.g. " action "); strip them so the printed genre is clean.
print("Predicted Genre:", str(predicted_output[0]).strip())


Predicted Genre:  action 


In [30]:
#model predicts genre from unseen text

In [31]:
#Step 10: Save the Model

# Persist the fitted pipeline (vectorizer + classifier) to the working directory
joblib.dump(genre_model, "movie_genre_pipeline.pkl")
# The status message previously contained a mis-decoded (mojibake) floppy-disk emoji.
print("Model saved successfully 💾")


Model saved successfully 💾


In [32]:
#Saved model can be reused without retraining