# Movie Genre Classification 

### Vonteri Varshith Reddy

### Imports

In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB

### Loading the Dataset

In [2]:
delimiter = ":::"
train_file_path = "./train_data.txt"
test_file_path  = "./test_data.txt"
test_data_solution_path = "./test_data_solution.txt"

In [3]:
# Load train dataset
train_data = pd.read_csv(train_file_path,delimiter = delimiter,header=None, names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

train_data.head()


  train_data = pd.read_csv(train_file_path,delimiter = delimiter,header=None, names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])


Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [4]:
test_data = pd.read_csv(test_data_solution_path,delimiter = delimiter,header=None, names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

test_data.head()

  test_data = pd.read_csv(test_data_solution_path,delimiter = delimiter,header=None, names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])


Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


### Lemmetize

In [5]:
lemmatizer = WordNetLemmatizer()

# Function to preprocess the movie description 
# It converts the description to lowercase, removes punctuation and then removes stopwords
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text) 
    stop_words = set(stopwords.words('english')) 
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words] 
    return words

def word_averaging(words, model):
    word_vector = []
    for word in words:
        if word in model.wv:
            word_vector.append(model.wv[word])
    return np.mean(word_vector, axis=0) if len(word_vector) > 0 else np.zeros(model.vector_size)


### Extracting Preprocessed Description

In [6]:
train_data['Processed_Description'] = train_data['DESCRIPTION'].apply(lambda x: preprocess_text(str(x)))
test_data['Processed_Description'] = test_data['DESCRIPTION'].apply(lambda x: preprocess_text(str(x)))

train_data['Year'] = train_data['TITLE'].str.extract(r'\((\d{4})\)', expand=False)
train_data['Year'] = pd.to_numeric(train_data['Year'], errors='coerce')
test_data['Year'] = test_data['TITLE'].str.extract(r'\((\d{4})\)', expand=False)
test_data['Year'] = pd.to_numeric(test_data['Year'], errors='coerce')



### Vectorizing the data using Word2Vec Vectorizer

In [7]:
word2vec_model = Word2Vec(sentences=train_data['Processed_Description'], vector_size=100, window=5, min_count=1, workers=4)

In [10]:
train_data['Description_Vector'] = train_data['Processed_Description'].apply(lambda x: word_averaging(x, word2vec_model))
test_data['Description_Vector'] = test_data['Processed_Description'].apply(lambda x: word_averaging(x, word2vec_model))


X_train = pd.DataFrame(list(train_data['Description_Vector']))
y_train = train_data['GENRE']
X_train.dropna(inplace=True)

# Substituting inf values with NaNs(Not A Number's) and then drop NaN values from X_train
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.dropna(inplace=True)

# Fitting the Data to Logistic Regression Model
LR = LogisticRegression(max_iter=10000)
LR.fit(X_train, y_train)


X_test = pd.DataFrame(list(test_data['Description_Vector']))

# Drop rows with missing values in X_test
X_test.dropna(inplace=True)

# Substituting inf values with NaNs(Not A Number's) and then drop NaN values from X_test
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
indices_to_drop = X_test.index[X_test.isnull().any(axis=1)].tolist()
X_test.dropna(inplace=True)
y_test = test_data['GENRE']

# Predict genres for test data
y_pred = LR.predict(X_test)

# Remove corresponding rows from y_test
y_test_filtered = y_test.drop(indices_to_drop)



### Test Accuracy

In [11]:
# Calculate accuracy
accuracy = accuracy_score(y_test_filtered, y_pred)

accuracy = accuracy* 100
print(f"Accuracy: {accuracy}")

Accuracy: 53.6660516605166
