In [18]:
import pandas as pd

# Define a function to load the dataset
def load_train_data(file_path):
    data = pd.read_csv(file_path, delimiter=':::', engine='python', header=None, names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
    return data

def load_test_data(file_path):
    data = pd.read_csv(file_path, delimiter=':::', engine='python', header=None, names=['ID', 'TITLE','DESCRIPTION'])
    return data

# Load train and test data
train_data = load_train_data('train_data.txt')  # Update the path to your actual train data file
test_data = load_test_data('test_data.txt')    # Update the path to your actual test data file

# Display the first few rows
print(train_data.head())
print(test_data.head())

   ID                               TITLE       GENRE  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         DESCRIPTION  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  
   ID                          TITLE  \
0   1          Edgar's Lunch (1998)    
1   2      La guerra de papá (1977)    
2   3   Off the Beaten Track (2010)    
3   4        Meu Amigo Hindu (2015)    
4   5             Er nu zhai (1955)    

                                         DESCRIPTION  
0   L.R. Brane loves his life -

In [19]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply cleaning to descriptions
train_data['DESCRIPTION'] = train_data['DESCRIPTION'].apply(clean_text)
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].apply(clean_text)

In [20]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the train data description
X_train = tfidf_vectorizer.fit_transform(train_data['DESCRIPTION'])

# Transform the test data description
X_test = tfidf_vectorizer.transform(test_data['DESCRIPTION'])

In [21]:
# Extract labels
y_train = train_data['GENRE']

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Initialize models
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine': SVC()
}

# Train models
for name, model in models.items():
    model.fit(X_train, y_train_encoded)
    print(f'{name} trained')

Naive Bayes trained
Logistic Regression trained
Support Vector Machine trained


In [23]:
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train_encoded, cv=5, scoring='accuracy')
    print(f'{name} Accuracy: {scores.mean():.4f}')

Naive Bayes Accuracy: 0.5208
Logistic Regression Accuracy: 0.5774
