In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
train_data = pd.read_csv("Genre Classification Dataset/train_data.txt", sep=':::', names=["title", "genre", "description"], engine='python')
test_data = pd.read_csv("Genre Classification Dataset/test_data.txt", sep=':::', names=["title", "description"], engine='python')

In [3]:
train_data.head()

Unnamed: 0,title,genre,description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [4]:
test_data.head()

Unnamed: 0,title,description
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),Before he was known internationally as a mart...


In [5]:
X_train = train_data['description'].astype(str).values
y_train = train_data['genre'].astype(str).values


X_test = test_data['description'].astype(str).values

In [6]:

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Logistic Regression

In [7]:

logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train_split, y_train_split)


val_predictions = logreg_model.predict(X_val_split)

accuracy = accuracy_score(y_val_split, val_predictions)
print(f"Validation Accuracy: {accuracy:.2f}")

test_predictions = logreg_model.predict(X_test_tfidf)

test_data['predicted_genre'] = test_predictions
test_data.to_csv('predicted_genres_lr.csv', index=False)

Validation Accuracy: 0.59


In [8]:

solution_data = pd.read_csv("Genre Classification Dataset/test_data_solution.txt", sep=':::', names=["ID", "TITLE", "ACTUAL_GENRE", "DESCRIPTION"], engine='python')


merged_data = pd.merge(test_data, solution_data, left_index=True, right_index=True)

accuracy_solution = accuracy_score(merged_data['ACTUAL_GENRE'], merged_data['predicted_genre'])
print(f"Accuracy compared to solution data: {accuracy_solution:.2f}")

merged_data.to_csv('merged_data_with_predictions_lr.csv', index=False)


Accuracy compared to solution data: 0.19


# Naive Bayes

In [9]:

naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_split, y_train_split)

val_predictions_nb = naive_bayes_model.predict(X_val_split)

accuracy_nb = accuracy_score(y_val_split, val_predictions_nb)
print(f"Validation Accuracy (Naive Bayes): {accuracy_nb:.2f}")

test_predictions_nb = naive_bayes_model.predict(X_test_tfidf)


test_data['predicted_genre_nb'] = test_predictions_nb
test_data.to_csv('predicted_genres_nb.csv', index=False)

Validation Accuracy (Naive Bayes): 0.51


In [10]:

solution_data = pd.read_csv("Genre Classification Dataset/test_data_solution.txt", sep=':::', names=["ID", "TITLE", "ACTUAL_GENRE", "DESCRIPTION"], engine='python')


merged_data = pd.merge(test_data, solution_data, left_index=True, right_index=True)


accuracy_solution = accuracy_score(merged_data['ACTUAL_GENRE'], merged_data['predicted_genre_nb'])
print(f"Accuracy compared to solution data: {accuracy_solution:.2f}")


merged_data.to_csv('merged_data_with_predictions_nb.csv', index=False)


Accuracy compared to solution data: 0.23


# Linear Support Vector Machine

In [11]:

svm_model = LinearSVC()
svm_model.fit(X_train_split, y_train_split)


val_predictions_svm = svm_model.predict(X_val_split)


accuracy_svm = accuracy_score(y_val_split, val_predictions_svm)
print(f"Validation Accuracy (SVM): {accuracy_svm:.2f}")


test_predictions_svm = svm_model.predict(X_test_tfidf)


test_data['predicted_genre_svm'] = test_predictions_svm
test_data.to_csv('predicted_genres_svm.csv', index=False)



Validation Accuracy (SVM): 0.58


In [12]:

solution_data = pd.read_csv("Genre Classification Dataset/test_data_solution.txt", sep=':::', names=["ID", "TITLE", "ACTUAL_GENRE", "DESCRIPTION"], engine='python')


merged_data = pd.merge(test_data, solution_data, left_index=True, right_index=True)


accuracy_solution = accuracy_score(merged_data['ACTUAL_GENRE'], merged_data['predicted_genre_svm'])
print(f"Accuracy compared to solution data: {accuracy_solution:.2f}")


merged_data.to_csv('merged_data_with_predictions_svm.csv', index=False)


Accuracy compared to solution data: 0.18
