In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def load_dataset(file_path, is_train=True):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            if is_train:
                if len(parts) == 4:
                    data.append({'ID': parts[0], 'Title': parts[1], 'Genre': parts[2], 'Description': parts[3]})
            else:
                if len(parts) == 3:
                    data.append({'ID': parts[0], 'Title': parts[1], 'Description': parts[2]})
    return pd.DataFrame(data)


train_df = load_dataset(r'C:\Users\prath\Downloads\archive (3)\Genre Classification Dataset\train_data.txt', is_train=True)
test_df = load_dataset(r'C:\Users\prath\Downloads\archive (3)\Genre Classification Dataset\test_data.txt', is_train=False)


print("Train Dataset Head:")
print(train_df.head())


def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

train_df['cleaned_description'] = train_df['Description'].apply(clean_text)

test_df['cleaned_description'] = test_df['Description'].apply(clean_text)


from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
train_df['Genre'] = train_df['Genre'].apply(lambda x: x.split(','))
y = mlb.fit_transform(train_df['Genre'])


vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train_df['cleaned_description'])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


classifier = OneVsRestClassifier(LogisticRegression())
classifier.fit(X_train, y_train)


y_pred = classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Train Dataset Head:
  ID                             Title     Genre  \
0  1      Oscar et la dame rose (2009)     drama   
1  2                      Cupid (1997)  thriller   
2  3  Young, Wild and Wonderful (1980)     adult   
3  4             The Secret Sin (1915)     drama   
4  5            The Unrecovered (2007)     drama   

                                         Description  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
Accuracy: 0.3487964585446832
Classification Report:
              precision    recall  f1-score   support

      action       0.76      0.05      0.09       263
       adult       0.90      0.08      0.15       112
   adventure       0.57      0.03      0.05       139
   animation       0.00      0.00      0.00       104
   biography  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
