# Importing Libraries

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
from nltk.tokenize import word_tokenize
import gensim.downloader as api
from gensim.models import Word2Vec


In [101]:
train_data = pd.read_csv('/content/train_data.txt', sep= ':::', engine= 'python', names= ['ID','TITLE', 'GENRE', 'DESCRIPTION'])
train_data.head()
train_data.shape

(54214, 4)

#Using word embedding technique

In [22]:
# Combine TITLE and DESCRIPTION into one text field
train_data['TEXT'] = train_data['TITLE'] + ' ' + train_data['DESCRIPTION']

# Define a function to clean the text
def clean_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Apply text cleaning
train_data['TEXT'] = train_data['TEXT'].apply(clean_text)

import nltk
nltk.download('punkt')

# Tokenize the text
train_data['TOKENIZED_TEXT'] = train_data['TEXT'].apply(word_tokenize)

# Separate features and labels
X = train_data['TOKENIZED_TEXT']
y = train_data['GENRE']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
word2vec_model = api.load('word2vec-google-news-300')
def document_vector(word2vec_model, doc):
    # Remove out-of-vocabulary words and get word vectors
    word_vectors = []
    for word in doc:
        if word in word2vec_model:
            word_vectors.append(word2vec_model[word])
    # If there are no words with embeddings, return zeros
    if not word_vectors:
        return np.zeros(word2vec_model.vector_size)
    # Compute the average word vector
    return np.mean(word_vectors, axis=0)

# Convert training and validation data to document vectors
X_train_w2v = np.array([document_vector(word2vec_model, doc) for doc in X_train])
X_val_w2v = np.array([document_vector(word2vec_model, doc) for doc in X_val])



In [44]:
# Train a Logistic Regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_w2v, y_train)

# Make predictions on the validation set
y_pred = classifier.predict(X_val_w2v)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_val, y_pred))

Accuracy: 0.5707


  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

      action        0.46      0.28      0.35       263
       adult        0.60      0.24      0.34       112
   adventure        0.33      0.10      0.15       139
   animation        0.38      0.05      0.09       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.59      0.55      1443
       crime        0.50      0.04      0.07       107
 documentary        0.65      0.83      0.73      2659
       drama        0.53      0.77      0.63      2697
      family        0.55      0.11      0.19       150
     fantasy        0.00      0.00      0.00        74
   game-show        1.00      0.33      0.49        40
     history        0.00      0.00      0.00        45
      horror        0.61      0.55      0.58       431
       music        0.58      0.47      0.52       144
     musical        1.00      0.02      0.04        50
     mystery        0.00      0.00      0.00        56
        n

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Using TF-IDF technique

In [11]:
train_data['TEXT'] = train_data['TITLE'] + ' ' + train_data['DESCRIPTION']

# Define a function to clean the text
def clean_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Apply text cleaning
train_data['TEXT'] = train_data['TEXT'].apply(clean_text)


X = train_data['TEXT']
y = train_data['GENRE']


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.7)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)


# Using Naive Bayes classifier

In [12]:
# Create a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the model on the training data
nb_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
y_pred = nb_classifier.predict(X_val_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_val, y_pred))


Accuracy: 0.5207
               precision    recall  f1-score   support

      action        0.88      0.08      0.15       177
       adult        0.67      0.03      0.05        77
   adventure        0.55      0.05      0.10       112
   animation        0.00      0.00      0.00        65
   biography        0.00      0.00      0.00        44
      comedy        0.53      0.42      0.47      1075
       crime        0.00      0.00      0.00        77
 documentary        0.56      0.89      0.69      1855
       drama        0.46      0.85      0.59      1922
      family        0.00      0.00      0.00        99
     fantasy        0.00      0.00      0.00        57
   game-show        1.00      0.07      0.13        28
     history        0.00      0.00      0.00        28
      horror        0.73      0.28      0.41       308
       music        1.00      0.03      0.07       115
     musical        0.00      0.00      0.00        23
     mystery        0.00      0.00      0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Using LR

In [63]:
# Initialize Logistic Regression classifier
clf = LogisticRegression(max_iter=1000)

# Train the classifier
clf.fit(X_train_tfidf, y_train)

# Predict on validation set
y_pred = clf.predict(X_val_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Print classification report
print(classification_report(y_val, y_pred))


Accuracy: 0.5847


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

      action        0.50      0.25      0.33       263
       adult        0.72      0.23      0.35       112
   adventure        0.46      0.15      0.23       139
   animation        0.64      0.09      0.15       104
   biography        0.00      0.00      0.00        61
      comedy        0.52      0.58      0.55      1443
       crime        0.43      0.03      0.05       107
 documentary        0.67      0.85      0.75      2659
       drama        0.54      0.78      0.64      2697
      family        0.40      0.08      0.13       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.95      0.45      0.61        40
     history        0.00      0.00      0.00        45
      horror        0.67      0.60      0.63       431
       music        0.61      0.48      0.54       144
     musical        0.67      0.04      0.08        50
     mystery        0.00      0.00      0.00        56
        n

  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
train_accuracy = clf.score(X_train_tfidf, y_train)

# Calculate validation accuracy
val_accuracy = clf.score(X_val_tfidf, y_val)

print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')

Training Accuracy: 0.6748
Validation Accuracy: 0.5847


# Apply on test data set

In [65]:
df_test = pd.read_csv('/content/test_data.txt', sep= ':::', engine= 'python', names= ['ID','TITLE', 'DESCRIPTION'])
df_test.head()

Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [66]:
df_test.shape

(54200, 3)

In [67]:
df_test['TEXT'] = df_test['TITLE'] + " " + df_test['DESCRIPTION']

# Transform test data using the already fitted TF-IDF vectorizer
X_test_tfidf = tfidf_vectorizer.transform(df_test['TEXT'])

# Predict using the trained Logistic Regression model
y_pred_test = clf.predict(X_test_tfidf)

df_test['PREDICTED_GENRE'] = y_pred_test


In [68]:
df_test.shape

(54200, 5)

In [69]:
df_test.head(10)

Unnamed: 0,ID,TITLE,DESCRIPTION,TEXT,PREDICTED_GENRE
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar...",Edgar's Lunch (1998) L.R. Brane loves his l...,short
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch...","La guerra de papá (1977) Spain, March 1964:...",drama
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...,Off the Beaten Track (2010) One year in the...,documentary
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi...","Meu Amigo Hindu (2015) His father has died,...",drama
4,5,Er nu zhai (1955),Before he was known internationally as a mart...,Er nu zhai (1955) Before he was known inter...,drama
5,6,Riddle Room (2016),Emily Burns is being held captive in a room w...,Riddle Room (2016) Emily Burns is being hel...,short
6,7,L'amica (1969),The beautiful but neglected wife of a brillia...,L'amica (1969) The beautiful but neglected ...,drama
7,8,Ina Mina Dika (1989),Vasu Inamdar (Ina) suffers from a disorder wh...,Ina Mina Dika (1989) Vasu Inamdar (Ina) suf...,comedy
8,9,Equinox Special: Britain's Tornados (2005),An insight into the tornados that hit Kensal ...,Equinox Special: Britain's Tornados (2005) ...,documentary
9,10,Press (2011),Press is a story of young people overwhelmed ...,Press (2011) Press is a story of young peop...,drama


In [91]:
solution_data_path = '/content/test_data_solution.txt'
df_solution = pd.read_csv(solution_data_path, delimiter=' ::: ', engine='python', header=None, names=['ID','TITLE','GENRE','DESCRIPTION'])
df_solution.head(10)

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...
5,6,Riddle Room (2016),horror,Emily Burns is being held captive in a room wi...
6,7,L'amica (1969),drama,The beautiful but neglected wife of a brillian...
7,8,Ina Mina Dika (1989),comedy,Vasu Inamdar (Ina) suffers from a disorder whe...
8,9,Equinox Special: Britain's Tornados (2005),documentary,An insight into the tornados that hit Kensal R...
9,10,Press (2011),drama,Press is a story of young people overwhelmed b...


# Comparison with test data solution

In [117]:

df_test['ID'] = df_test['ID'].astype(str)
df_solution['ID'] = df_solution['ID'].astype(str)

# Merge on 'ID' and compare 'GENRE' columns
df_merged = pd.merge(df_test, df_solution, on='ID', suffixes=('_TEST', '_SOLUTION'))

# Compare genres
genre_comparison = df_merged[['ID', 'PREDICTED_GENRE', 'GENRE']]


In [118]:
df_test['PREDICTED_GENRE'] = df_test['PREDICTED_GENRE'].str.lower().str.strip()
df_solution['GENRE'] = df_solution['GENRE'].str.lower().str.strip()

# Calculate accuracy
accuracy = (df_test['PREDICTED_GENRE'] == df_solution['GENRE']).mean()
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5830
