**Load** **Data**

In [None]:
import pandas as pd

# Every file uses the same ' ::: ' field delimiter and has no header row, so the
# parser options are shared; only the column names differ per file.
read_options = dict(sep=' ::: ', engine='python', header=None)
labelled_columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

train_data = pd.read_csv('/content/train_data.txt', names=labelled_columns, **read_options)
# The test file is read without a GENRE column.
test_data = pd.read_csv('/content/test_data.txt', names=['ID', 'TITLE', 'DESCRIPTION'], **read_options)
test_data_solution = pd.read_csv('/content/test_data_solution.txt', names=labelled_columns, **read_options)

# Preview the first rows of each frame.
print('Train Data', train_data.head())

print('Test Data', test_data.head())

print('Solution', test_data_solution.head())

Train Data    ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
Test Data    ID                        TITLE  \
0   1         Edgar's Lunch (1998)   
1   2     La guerra de papá (1977)   
2   3  Off the Beaten Track (2010)   
3   4       Meu Amigo Hindu (2015)   
4   5            Er nu zhai (1955)   

                                         DESCRIPTION  
0  L.R. Brane loves his life - his car, his ap

**Preprocess**

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import nltk
# Fetch the NLTK resources that preprocess_text relies on.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt'
# when tokenizing — confirm against the installed version.
nltk.download('punkt') # tokenizer models needed by word_tokenize
nltk.download('stopwords') # stopword lists, including the English one used below

_ENGLISH_STOP_WORDS = None  # populated on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loaded lazily so the corpus is only touched when text is actually processed.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise a raw description into a space-joined string of stemmed tokens.

    Steps, in order:
      1. replace every non-word character with a space,
      2. collapse runs of whitespace into a single space,
      3. lower-case the text,
      4. tokenize with word_tokenize,
      5. drop English stopwords,
      6. reduce each remaining token with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)

    # Look the stopwords up once per call (and load them once per session) rather
    # than calling stopwords.words('english') for every token; set membership is
    # also constant-time, unlike scanning a list.
    stop_words = _english_stop_words()
    ps = PorterStemmer()
    return ' '.join(ps.stem(word) for word in tokens if word not in stop_words)

# Add a cleaned version of DESCRIPTION to both frames; the raw column is kept.
for frame in (train_data, test_data):
    frame['clean_description'] = frame['DESCRIPTION'].apply(preprocess_text)


print(train_data.head())



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  \
0  Listening in to a conversation between his doc...   
1  A brother and sister with a past incestuous re...   
2  As the bus empties the students for their fiel...   
3  To help their unemployed father make ends meet...   
4  The film's title refers not only to the un-rec...   

                                   clean_description  
0  listen convers doctor parent 10 year old oscar...  
1  brother sister past incestu relationship curre...  
2  bu empti student field trip museum natur histo...  
3  help unemploy father make end meet edith twin ...  
4  film titl refer un recov bodi ground zero also...  


**Feature** **Extraction**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training text only.
tfidf = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf.fit_transform(train_data['clean_description'])
X_train = train_tfidf.toarray()  # dense NumPy copy of the sparse TF-IDF matrix
y_train = train_data['GENRE']

# Reuse the already-fitted vectorizer for the test text; it is not refitted here.
test_tfidf = tfidf.transform(test_data['clean_description'])
X_test = test_tfidf.toarray()

print(X_train.shape, X_test.shape)


(4762, 5000) (3260, 5000)


**Model** **Training**

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Persist the classifier together with the fitted vectorizer so both can be
# reloaded later to score new text.
import joblib
joblib.dump(model, 'movie_genre_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [None]:
from sklearn.svm import SVC

# Build a linear-kernel SVM (with probability estimates enabled) and train it in one step.
svm_model = SVC(kernel='linear', probability=True).fit(X_train, y_train)

# Write the trained SVM to disk.
joblib.dump(svm_model, 'movie_genre_svm_model.pkl')


['movie_genre_svm_model.pkl']

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build a multinomial Naive Bayes classifier with default settings and train it in one step.
nb_model = MultinomialNB().fit(X_train, y_train)

# Write the trained Naive Bayes model to disk.
joblib.dump(nb_model, 'movie_genre_nb_model.pkl')


['movie_genre_nb_model.pkl']

**Model Evaluation**

In [None]:
# Report the row counts of the test features and the solution labels side by side;
# they need to line up before the models can be scored.
print(
    f'test_data size: {len(test_data)}',
    f'test_data_solution size: {len(test_data_solution)}',
    sep='\n',
)


test_data size: 3260
test_data_solution size: 9623


**Align the Datasets**

In [None]:
# Inner-join the test rows with their true labels on ID so that every feature row
# is paired with its genre. validate='one_to_one' makes pandas raise a MergeError
# if an ID repeats on either side, instead of silently duplicating rows and
# skewing the evaluation metrics.
merged_test_data = pd.merge(
    test_data,
    test_data_solution[['ID', 'GENRE']],
    on='ID',
    how='inner',
    validate='one_to_one',
)

# Extract the aligned test data and labels (features come from the already-fitted vectorizer)
X_test_aligned = tfidf.transform(merged_test_data['clean_description']).toarray()
y_test_aligned = merged_test_data['GENRE']

# Verify the sizes after alignment
print(f'Aligned X_test size: {len(X_test_aligned)}')
print(f'Aligned y_test size: {len(y_test_aligned)}')


Aligned X_test size: 3260
Aligned y_test size: 3260


**Model Prediction**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_test, y_test):
    """Print accuracy and weighted precision, recall and F1 for a fitted classifier.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Feature matrix passed to ``model.predict``.
    y_test
        True labels, compared against the predictions.

    Returns
    -------
    None
        The four scores are printed, not returned.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Some labels may never be predicted (or never occur), which makes their
    # per-class score undefined. zero_division=0 keeps the same numeric result as
    # the default behaviour (the undefined score counts as 0) but states the
    # choice explicitly and suppresses the undefined-metric warning.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

# Score every trained classifier on the same aligned test set, in a fixed order.
classifiers = (
    ('Logistic Regression', model),
    ('SVM', svm_model),
    ('Naive Bayes', nb_model),
)
for label, classifier in classifiers:
    print(label)
    evaluate_model(classifier, X_test_aligned, y_test_aligned)


Logistic Regression


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.49662576687116566
Precision: 0.44580968888016137
Recall: 0.49662576687116566
F1 Score: 0.4109525038481308
SVM


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5174846625766871
Precision: 0.49678647569012213
Recall: 0.5174846625766871
F1 Score: 0.4614694027079877
Naive Bayes
Accuracy: 0.44355828220858895
Precision: 0.406917609414082
Recall: 0.44355828220858895
F1 Score: 0.31672143031083205


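SVM appears to be the best choice among the three models on this test set, as it has the highest accuracy (0.517), weighted precision (0.497), weighted recall (0.517), and weighted F1 score (0.461). This suggests that SVM provides the best balance between correctly identifying the relevant genres (recall) and avoiding incorrect classifications (precision). Two caveats apply: weighted-average recall is always equal to accuracy, so it is not independent evidence, and even the best model labels only about half of the test descriptions correctly.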

In [None]:
def predict_genre(plot_summary, model, vectorizer):
    """Predict the genre label for a single raw plot summary.

    The summary is cleaned with preprocess_text, turned into a one-row dense
    feature matrix by ``vectorizer.transform``, and passed to ``model.predict``.

    Returns the first (and only) element of the prediction.
    """
    cleaned = preprocess_text(plot_summary)
    feature_row = vectorizer.transform([cleaned]).toarray()
    return model.predict(feature_row)[0]

**Example**

In [None]:
# Classify an unseen synopsis with the trained SVM and the fitted TF-IDF vectorizer.
new_plot = "A young boy discovers he has magical powers and attends a school for wizards."
predicted_genre = predict_genre(plot_summary=new_plot, model=svm_model, vectorizer=tfidf)
print(f'Predicted Genre: {predicted_genre}')


Predicted Genre: drama
