In [60]:
import pandas as pd
# Fields in the raw files are delimited by ':::' surrounded by spaces. A plain
# ':::' separator leaves that padding inside every value (genre labels come out
# as ' drama '), so the regex separator also swallows the adjacent whitespace.
# Multi-character separators are only handled by the python parser engine;
# naming it explicitly avoids the ParserWarning about falling back from C.
# NOTE(review): the files appear to carry one more field than `names` (a leading
# row id), which pandas then uses as the index - confirm against the raw data.
FIELD_SEP = r'\s*:::\s*'
train_data = pd.read_csv('train_data.txt', sep=FIELD_SEP, engine='python',
                         names=['movie', 'genre', 'plot'])
test_data = pd.read_csv('test_data.txt', sep=FIELD_SEP, engine='python',
                        names=['movie', 'plot'])

In [61]:
# Preview the first rows of the training frame (movie, genre, plot).
train_data.head()

Unnamed: 0,movie,genre,plot
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [62]:
# Preview the first rows of the test frame (movie, plot; no genre column).
test_data.head()

Unnamed: 0,movie,plot
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),Before he was known internationally as a mart...


In [63]:
import re
import unicodedata
def clean_text(text):
    """Normalise a plot description for bag-of-words vectorisation.

    Steps: fold accented letters to their base letter, lower-case, drop every
    character that is not an ASCII letter, digit or whitespace, then collapse
    runs of whitespace into single spaces and trim the ends.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the ``NaN`` pandas
        uses for a missing field) is treated as missing.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing or empty input.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on .lower().
        return ""
    # Decompose accented characters (e.g. 'á' -> 'a' + combining accent) and
    # drop the combining marks, so "papá" becomes "papa" instead of "pap".
    decomposed = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r"[^a-z0-9\s]", "", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text
# Add a normalised copy of the plot text to both frames.
for frame in (train_data, test_data):
    frame['clean_plot'] = frame['plot'].map(clean_text)

# Only the last expression of a cell is rendered, so the training preview is
# evaluated but not shown; the test preview below is the cell's output.
train_data[['movie', 'genre', 'clean_plot']].head(10)
test_data[['movie', 'clean_plot']].head(10)

Unnamed: 0,movie,clean_plot
1,Edgar's Lunch (1998),lr brane loves his life his car his apartment ...
2,La guerra de papá (1977),spain march 1964 quico is a very naughty child...
3,Off the Beaten Track (2010),one year in the life of albin and his family o...
4,Meu Amigo Hindu (2015),his father has died he hasnt spoken with his b...
5,Er nu zhai (1955),before he was known internationally as a marti...
6,Riddle Room (2016),emily burns is being held captive in a room wi...
7,L'amica (1969),the beautiful but neglected wife of a brillian...
8,Ina Mina Dika (1989),vasu inamdar ina suffers from a disorder where...
9,Equinox Special: Britain's Tornados (2005),an insight into the tornados that hit kensal r...
10,Press (2011),press is a story of young people overwhelmed b...


In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features: the vocabulary and IDF weights are learned from the training
# plots only, and the same fitted vectorizer is then applied to the test plots.
vectorizer = TfidfVectorizer(
    max_features=5000,      # cap the vocabulary size
    stop_words='english',   # drop scikit-learn's built-in English stop words
)
train_plots = train_data['clean_plot']
test_plots = test_data['clean_plot']
X_train = vectorizer.fit_transform(train_plots)
X_test = vectorizer.transform(test_plots)

In [65]:
from sklearn.preprocessing import LabelEncoder
# Map genre strings to integer class ids. The fitted encoder is kept so that
# predicted ids can be translated back to genre names.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['genre'])
y_train = label_encoder.transform(train_data['genre'])

In [66]:
from sklearn.linear_model import LogisticRegression
# class_weight='balanced' weights classes inversely to their frequency in
# y_train; max_iter=1000 raises the solver's iteration limit.
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)
# fit() returns the estimator, so the cell output shows its parameters.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [68]:
# Predict encoded class ids for the test plots, decode them back to genre
# names with the fitted label encoder, and attach them to the test frame.
y_pred = model.predict(X_test)
predicted_genres = label_encoder.inverse_transform(y_pred)
test_data['predicted_genre'] = predicted_genres

In [69]:
# Spot-check the first few predictions next to their plots.
test_data[['movie','plot','predicted_genre']].head()

Unnamed: 0,movie,plot,predicted_genre
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar...",short
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch...",drama
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...,documentary
4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi...",drama
5,Er nu zhai (1955),Before he was known internationally as a mart...,biography


In [70]:
# Write the title, original plot and predicted genre of every test row to disk;
# index=False leaves the DataFrame index out of the file.
output_columns = ['movie', 'plot', 'predicted_genre']
test_data.to_csv('predicted_genres.csv', columns=output_columns, index=False)

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Hold out 20% of the training rows, stratified by genre, to estimate accuracy.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Fit a separate estimator with the same hyper-parameters instead of re-fitting
# `model`: calling model.fit() here would overwrite the classifier trained on
# the full training set that produced the test predictions.
val_model = LogisticRegression(**model.get_params())
val_model.fit(X_train_split, y_train_split)
y_val_pred = val_model.predict(X_val_split)

# NOTE(review): X_train was vectorised with TF-IDF statistics fitted on all
# training rows, including the held-out ones, so this score may be slightly
# optimistic.
acc = accuracy_score(y_val_split, y_val_pred)
print(f"Validation Accuracy: {acc:.2f}")
print("\nClassification Report:")
print(classification_report(y_val_split, y_val_pred, target_names=label_encoder.classes_))

Validation Accuracy: 0.44

Classification Report:
               precision    recall  f1-score   support

      action        0.30      0.43      0.35       263
       adult        0.32      0.72      0.45       118
   adventure        0.20      0.35      0.26       155
   animation        0.17      0.35      0.23       100
   biography        0.02      0.09      0.04        53
      comedy        0.60      0.39      0.47      1490
       crime        0.14      0.39      0.21       101
 documentary        0.79      0.52      0.63      2619
       drama        0.71      0.34      0.46      2723
      family        0.14      0.32      0.20       157
     fantasy        0.08      0.22      0.11        65
   game-show        0.62      0.72      0.67        39
     history        0.06      0.27      0.10        49
      horror        0.56      0.65      0.60       441
       music        0.37      0.77      0.50       146
     musical        0.09      0.22      0.13        55
     mystery  