In [1]:
import re
import pandas as pd

train_data_path = 'Genre Classification Dataset/train_data.txt'
test_data_path = 'Genre Classification Dataset/test_data.txt'
test_data_soln_path = 'Genre Classification Dataset/test_data_solution.txt'

train = pd.read_csv(train_data_path, sep=':::', engine='python', names=['index', 'movie_title', 'genres', 'plot_summary'])
test = pd.read_csv(test_data_path, sep=':::', engine='python', names=['index', 'movie_title', 'plot_summary'])
test_solution = pd.read_csv(test_data_soln_path, sep=':::', engine='python', names=['index', 'movie_title', 'genres', 'plot_summary'])

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra whitespaces
    text = ' '.join(text.split())
    
    return text

train['clean_plot_summary'] = train['plot_summary'].apply(clean_text)
test['clean_plot_summary'] = test['plot_summary'].apply(clean_text)

In [2]:
genre_mapping = {
    'drama': 1,
    'documentary': 2,
    'comedy': 3,
    'short': 4,
    'horror': 5,
    'thriller': 6,
    'action': 7,
    'western': 8,
    'reality-tv': 9,
    'family': 10,
    'adventure': 11,
    'music': 12,
    'romance': 13,
    'sci-fi': 14,
    'adult': 15,
    'crime': 16,
    'animation': 17,
    'sport': 18,
    'talk-show': 19,
    'fantasy': 20,
    'mystery': 21,
    'musical': 22,
    'biography': 23,
    'history': 24,
    'game-show': 25,
    'news': 26,
    'war': 27
}

train['genres'] = train['genres'].str.strip()
train.dropna(subset=['genres'], inplace=True)
train['genres_encoded'] = train.genres.map(genre_mapping)
train.drop(['movie_title', 'index', 'plot_summary', 'genres'], axis=1, inplace=True)
train.head()

Unnamed: 0,clean_plot_summary,genres_encoded
0,listening in to a conversation between his doc...,1
1,a brother and sister with a past incestuous re...,6
2,as the bus empties the students for their fiel...,15
3,to help their unemployed father make ends meet...,1
4,the films title refers not only to the unrecov...,1


In [3]:
test.drop(['movie_title', 'index', 'plot_summary'], axis=1, inplace=True)
test.head()

Unnamed: 0,clean_plot_summary
0,lr brane loves his life his car his apartment ...
1,spain march quico is a very naughty child of t...
2,one year in the life of albin and his family o...
3,his father has died he hasnt spoken with his b...
4,before he was known internationally as a marti...


In [4]:
test_solution['genres'] = test_solution['genres'].str.strip()
test_solution.dropna(subset=['genres'], inplace=True)
test_solution['genres_encoded'] = test_solution.genres.map(genre_mapping)
test_solution.drop(['movie_title', 'index', 'plot_summary', 'genres'], axis=1, inplace=True)
test_solution.head()

Unnamed: 0,genres_encoded
0,6
1,3
2,2
3,1
4,1


In [5]:
train.genres_encoded.value_counts()

genres_encoded
1     13613
2     13096
3      7447
4      5073
5      2204
6      1591
7      1315
8      1032
9       884
10      784
11      775
12      731
13      672
14      647
15      590
16      505
17      498
18      432
19      391
20      323
21      319
22      277
23      265
24      243
25      194
26      181
27      132
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(train['clean_plot_summary'], train['genres_encoded'], test_size=0.25, random_state=47, stratify=train.genres_encoded) #25% of data to be used as the test set

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization of Training set and Testing set
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
from sklearn.naive_bayes import MultinomialNB

# Using Naive Bayes classifier
naiveBayes_model = MultinomialNB()
naiveBayes_model.fit(X_train_tfidf, Y_train)

In [9]:
from sklearn.metrics import accuracy_score, classification_report

y_predicted = naiveBayes_model.predict(X_test_tfidf)
accuracy = accuracy_score(Y_test, y_predicted)
print(f'Accuracy: {accuracy:.2f}')

classification_rep = classification_report(Y_test, y_predicted, zero_division=1)
print('Classification Report:\n', classification_rep)

Accuracy: 0.44
Classification Report:
               precision    recall  f1-score   support

           1       0.38      0.88      0.53      3403
           2       0.53      0.90      0.66      3274
           3       0.71      0.04      0.07      1862
           4       1.00      0.00      0.00      1268
           5       1.00      0.00      0.00       551
           6       1.00      0.00      0.00       398
           7       1.00      0.00      0.00       329
           8       1.00      0.00      0.00       258
           9       1.00      0.00      0.00       221
          10       1.00      0.00      0.00       196
          11       1.00      0.00      0.00       194
          12       1.00      0.00      0.00       183
          13       1.00      0.00      0.00       168
          14       1.00      0.00      0.00       162
          15       1.00      0.00      0.00       148
          16       1.00      0.00      0.00       126
          17       1.00      0.00      0.0

In [10]:
# Testing Naive Bayes model on test data set
test_tfidf = tfidf_vectorizer.transform(test['clean_plot_summary'])
predicted_result = naiveBayes_model.predict(test_tfidf)
test_solution['genres_encoded'] = test_solution['genres_encoded'].astype('int64')

accuracy = accuracy_score(test_solution['genres_encoded'], predicted_result)
print(f'Accuracy of Tested solution: {accuracy:.2f}')
classification_rep = classification_report(test_solution['genres_encoded'], predicted_result, zero_division=1)
print('Classification Report:\n', classification_rep)

Accuracy of Tested solution: 0.44
Classification Report:
               precision    recall  f1-score   support

           1       0.38      0.88      0.53     13612
           2       0.53      0.90      0.66     13096
           3       0.67      0.04      0.07      7446
           4       1.00      0.00      0.01      5072
           5       1.00      0.00      0.00      2204
           6       1.00      0.00      0.00      1590
           7       1.00      0.00      0.00      1314
           8       1.00      0.00      0.00      1032
           9       1.00      0.00      0.00       883
          10       1.00      0.00      0.00       783
          11       1.00      0.00      0.00       775
          12       1.00      0.00      0.00       731
          13       1.00      0.00      0.00       672
          14       1.00      0.00      0.00       646
          15       1.00      0.00      0.00       590
          16       1.00      0.00      0.00       505
          17       1.00

In [11]:
from sklearn.linear_model import LogisticRegression

# Using Logistic Regression classifier
logistic_regression_model = LogisticRegression(max_iter=400)
logistic_regression_model.fit(X_train_tfidf, Y_train)

In [12]:
from sklearn.metrics import accuracy_score, classification_report
y_predicted = logistic_regression_model.predict(X_test_tfidf)
accuracy = accuracy_score(Y_test, y_predicted)
print(f'Accuracy: {accuracy:.2f}')

classification_rep = classification_report(Y_test, y_predicted, zero_division=1)
print('Classification Report:\n', classification_rep)

Accuracy: 0.57
Classification Report:
               precision    recall  f1-score   support

           1       0.53      0.80      0.64      3403
           2       0.64      0.88      0.74      3274
           3       0.53      0.58      0.55      1862
           4       0.48      0.24      0.32      1268
           5       0.66      0.50      0.57       551
           6       0.33      0.09      0.14       398
           7       0.52      0.18      0.26       329
           8       0.94      0.66      0.77       258
           9       0.49      0.10      0.17       221
          10       0.37      0.04      0.07       196
          11       0.79      0.13      0.23       194
          12       0.75      0.37      0.49       183
          13       0.00      0.00      1.00       168
          14       0.64      0.18      0.28       162
          15       0.84      0.21      0.34       148
          16       0.50      0.01      0.02       126
          17       0.83      0.04      0.0

In [13]:
# Testing Logistic Regression model on test data set
test_tfidf = tfidf_vectorizer.transform(test['clean_plot_summary'])
predicted_result = logistic_regression_model.predict(test_tfidf)

accuracy = accuracy_score(test_solution['genres_encoded'], predicted_result)
print(f'Accuracy of Tested solution: {accuracy:.2f}')
classification_rep = classification_report(test_solution['genres_encoded'], predicted_result, zero_division=1)
print('Classification Report:\n', classification_rep)

Accuracy of Tested solution: 0.58
Classification Report:
               precision    recall  f1-score   support

           1       0.52      0.80      0.63     13612
           2       0.64      0.88      0.74     13096
           3       0.53      0.57      0.55      7446
           4       0.52      0.29      0.37      5072
           5       0.66      0.54      0.59      2204
           6       0.38      0.09      0.14      1590
           7       0.56      0.21      0.31      1314
           8       0.93      0.66      0.77      1032
           9       0.49      0.09      0.16       883
          10       0.60      0.04      0.08       783
          11       0.75      0.12      0.21       775
          12       0.71      0.36      0.48       731
          13       0.30      0.00      0.01       672
          14       0.63      0.17      0.27       646
          15       0.59      0.15      0.24       590
          16       0.40      0.01      0.02       505
          17       0.30

In [14]:
from sklearn.svm import SVC

# Using support vector classifier
SVM_model = SVC(random_state=1)
SVM_model.fit(X_train_tfidf, Y_train)

In [15]:
from sklearn.metrics import accuracy_score, classification_report
y_predicted = SVM_model.predict(X_test_tfidf)
accuracy = accuracy_score(Y_test, y_predicted)
print(f'Accuracy: {accuracy:.2f}')

classification_rep = classification_report(Y_test, y_predicted, zero_division=1)
print('Classification Report:\n', classification_rep)

Accuracy: 0.56
Classification Report:
               precision    recall  f1-score   support

           1       0.49      0.84      0.62      3403
           2       0.62      0.88      0.73      3274
           3       0.54      0.52      0.53      1862
           4       0.58      0.19      0.28      1268
           5       0.73      0.48      0.58       551
           6       0.29      0.04      0.07       398
           7       0.61      0.09      0.16       329
           8       0.95      0.62      0.75       258
           9       0.58      0.03      0.06       221
          10       0.44      0.02      0.04       196
          11       0.75      0.09      0.17       194
          12       0.87      0.30      0.44       183
          13       1.00      0.00      0.00       168
          14       0.64      0.11      0.19       162
          15       0.80      0.16      0.27       148
          16       1.00      0.00      0.00       126
          17       1.00      0.01      0.0

In [16]:
# Testing SVM model on test data set
test_tfidf = tfidf_vectorizer.transform(test['clean_plot_summary'])
predicted_result = SVM_model.predict(test_tfidf)

accuracy = accuracy_score(test_solution['genres_encoded'], predicted_result)
print(f'Accuracy of Tested solution: {accuracy:.2f}')
classification_rep = classification_report(test_solution['genres_encoded'], predicted_result, zero_division=1)
print('Classification Report:\n', classification_rep)

Accuracy of Tested solution: 0.56
Classification Report:
               precision    recall  f1-score   support

           1       0.49      0.84      0.62     13612
           2       0.63      0.88      0.73     13096
           3       0.55      0.52      0.53      7446
           4       0.63      0.22      0.33      5072
           5       0.71      0.49      0.58      2204
           6       0.42      0.04      0.08      1590
           7       0.71      0.11      0.20      1314
           8       0.93      0.64      0.76      1032
           9       0.58      0.03      0.06       883
          10       0.63      0.03      0.06       783
          11       0.78      0.13      0.22       775
          12       0.82      0.24      0.38       731
          13       1.00      0.00      0.00       672
          14       0.55      0.09      0.15       646
          15       0.69      0.13      0.21       590
          16       1.00      0.01      0.01       505
          17       0.25