## Implementing Movie genre classification model based on description

### Importing the datasets

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB
import re

In [3]:
train_path = "Genre Classification Dataset/train_data.txt"
train_data = pd.read_csv(train_path, sep=':::', names=['Title', 'Genre', 'Description'], engine='python')

In [22]:
test_path = "Genre Classification Dataset/test_data_solution.txt"
test_data = pd.read_csv(test_path, sep=':::', names=['Id', 'Title', 'Description'], engine='python')

## Understanding the data

In [None]:
train_data.describe()

In [None]:
test_data["Title"]

In [None]:
# checking for missing values
print(train_data.isnull().sum())
test_data.isnull().sum()

In [None]:
# checking frequency of each genre
class_distribution = train_data['Genre'].value_counts()
plt.figure(figsize=(8, 6))
class_distribution.plot(kind='bar', color='skyblue')
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.xticks(rotation=65)
plt.show()

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [10]:
# preprocessing the text data
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuations
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

train_data['Cleaned_Description'] = train_data['Description'].apply(preprocess_text)

In [None]:
train_data.head()

In [12]:
# Using TF-IDF to convert text data to numerical data
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(train_data['Cleaned_Description'])
y = train_data['Genre']

In [13]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Training a Logistic Regression model
model_1 = LogisticRegression(max_iter=1000)
model_1.fit(X_train, y_train)

y_pred_1 = model_1.predict(X_test)
accuracy_1 = accuracy_score(y_test, y_pred_1)
print("Accuracy of Logistic Regression model: ", accuracy_1)

print("Precision, Recall and F1 Score of Logistic Regression model: ")
print(classification_report(y_test, y_pred_1))

In [None]:
# Now creating a model using Support Vector Machine
from sklearn.svm import SVC
# model_2 = SVC()
# model_2.fit(X_train, y_train)
# accuracy_2 = model_2.score(X_test, y_test)
# print("Accuracy of Support Vector Machine model: ", accuracy_2)
# print("Precision, Recall and F1 Score of SVC model: ")
# print(classification_report(y_test, model_2.predict(X_test)))

In [41]:
# seeing if scaling data helps in faster training of SVC model

from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.fit_transform(X_test)
model_2 = SVC()
model_2.fit(X_train_scaled, y_train)
accuracy_2 = model_2.score(X_test_scaled, y_test)

In [None]:
model_3 = MultinomialNB()
model_3.fit(X_train, y_train)
accuracy_3 = model_3.score(X_test, y_test)
print("Accuracy of Multinomial Naive Bayes model: ", accuracy_3)
print("Precision, Recall and F1 Score of Multinomial Naive Bayes model: ")
print(classification_report(y_test, model_3.predict(X_test)))

In [36]:
from sklearn.ensemble import RandomForestClassifier, Gradient
model_4=RandomForestClassifier()
model_4.fit(X_train, y_train)
y_pred_4 = model_4.predict(X_test)
accuracy_4 = accuracy_score(y_test, y_pred_4)
print("Accuracy of Random Forest model: ", accuracy_4)
print("Precision, Recall and F1 Score of Random Forest model: ")
print(classification_report(y_test, y_pred_4))

Accuracy of Random Forest model:  0.5008761412893111
Precision, Recall and F1 Score of Random Forest model: 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

      action        0.43      0.01      0.02       263
       adult        0.92      0.10      0.18       112
   adventure        0.48      0.10      0.17       139
   animation        0.00      0.00      0.00       104
   biography        0.00      0.00      0.00        61
      comedy        0.47      0.31      0.37      1443
       crime        1.00      0.01      0.02       107
 documentary        0.59      0.87      0.70      2659
       drama        0.43      0.84      0.56      2697
      family        1.00      0.03      0.06       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.88      0.53      0.66        40
     history        0.00      0.00      0.00        45
      horror        0.59      0.19      0.29       431
       music        0.61      0.19      0.29       144
     musical        0.00      0.00      0.00        50
     mystery        0.00      0.00      0.00        56
        n

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# now predicting genre of test data
test_data['Cleaned_Description'] = test_data['Description'].apply(preprocess_text)
X_test_data = tfidf.transform(test_data['Cleaned_Description'])
model_1_preds = model_1.predict(X_test_data)

print("Accuracy on test data: ", accuracy_score(test_data['Title'], model_1_preds))

Accuracy on test data:  0.6287453874538745


In [34]:
test_data.head()

Unnamed: 0,Id,Title,Description,Cleaned_Description,Predicted_Genre
1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar...",lr brane loves life car apartment job especial...,drama
2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch...",spain march 1964 quico naughty child three bel...,drama
3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...,one year life albin family shepherds north tra...,documentary
4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi...",father died hasnt spoken brother 10 years seri...,drama
5,Er nu zhai (1955),drama,Before he was known internationally as a mart...,known internationally martial arts superstar b...,drama


In [35]:
# now predicting genre of test data
model_4_preds = model_4.predict(X_test_data)

print("Accuracy on test data: ", accuracy_score(test_data['Title'], model_4_preds))

Accuracy on test data:  0.47440959409594097


In [32]:
# now predicting genre of test data
model_3_preds = model_3.predict(X_test_data)

print("Accuracy on test data: ", accuracy_score(test_data['Title'], model_3_preds))

Accuracy on test data:  0.5186715867158671


In [None]:
test_data.head()

In [None]:
import joblib
# saving the models
joblib.dump(model_1, 'logistic.pkl') 
joblib.dump(model_2, 'SVC.pkl')
joblib.dump(model_3, 'Naive_Bayes.pkl')
joblib.dump(model_4, 'Random_Forest.pkl')

['Random_Forest.pkl']

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import LeakyReLU

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define the model
model_5 = Sequential()
model_5.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))
model_5.add(Dropout(0.5))
model_5.add(Dense(256, activation='relu'))
model_5.add(LeakyReLU(alpha=0.1))
model_5.add(Dropout(0.5))
model_5.add(Dense(len(train_data['Genre'].unique()), activation='softmax'))

model_5.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model_5.fit(X_train, y_train_encoded, 
            epochs=10, batch_size=64, validation_data=(X_test, y_test_encoded))

model_5.save('deep_learning_model.h5')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 38ms/step - accuracy: 0.4161 - loss: 2.1141 - val_accuracy: 0.5645 - val_loss: 1.4815
Epoch 2/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 38ms/step - accuracy: 0.6156 - loss: 1.3343 - val_accuracy: 0.5827 - val_loss: 1.4112
Epoch 3/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 41ms/step - accuracy: 0.6793 - loss: 1.0759 - val_accuracy: 0.5799 - val_loss: 1.4226
Epoch 4/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 37ms/step - accuracy: 0.7417 - loss: 0.8414 - val_accuracy: 0.5739 - val_loss: 1.5124
Epoch 5/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 34ms/step - accuracy: 0.8101 - loss: 0.6244 - val_accuracy: 0.5673 - val_loss: 1.6448
Epoch 6/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 35ms/step - accuracy: 0.8634 - loss: 0.4437 - val_accuracy: 0.5598 - val_loss: 1.8150
Epoch 7/10
[1m6



In [56]:
# accuracy of deep learning model
_, accuracy = model_5.evaluate(X_test, y_test_encoded)
print(accuracy)

[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.5511 - loss: 2.5787
0.5547357797622681
