In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

# Read the movie records from the CSV file in the working directory
df = pd.read_csv("movies.csv")

# Preview the top of the table as loaded (before any rows are removed)
print("First few rows of the dataset:")
print(df.head())

# Report the number of null entries per column
print("\nMissing values in each column:")
print(df.isna().sum())

# Rows without a genre label cannot serve as training targets, so discard them.
# Only the target column is checked; nulls in feature columns are left in place.
df = df.dropna(subset=['genre'])

# Predictor columns (one categorical, six numeric) and the label column
X = df[[
    'duration',
    'language',
    'average_rating',
    'number_of_reviews',
    'year',
    'budget',
    'revenue',
]]
y = df['genre']

# Column groups: numeric columns are standardized, categorical columns are one-hot encoded
numeric_features = ['duration', 'average_rating', 'number_of_reviews', 'year', 'budget', 'revenue']
categorical_features = ['language']

# Route each column group through its own transformer.
# handle_unknown='ignore' makes the encoder emit an all-zero vector for a category
# that was not present when the encoder was fitted, instead of raising ValueError
# at transform/predict time (the default is handle_unknown='error').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Chain preprocessing and the classifier into one estimator, so the scaler and
# encoder are fitted only on the data passed to clf.fit and then reused as-is
# by clf.predict.
# class_weight='balanced' weights classes inversely to their frequency;
# random_state=42 pins the forest's randomness for repeatable runs.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

# Hold out 30% of the rows for evaluation; stratify=y keeps the label
# proportions the same in the training and held-out parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Fit the full pipeline on the training part, then label the held-out rows
# (Pipeline.fit returns the pipeline itself, so the calls can be chained)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1; zero_division=0 scores a metric as 0
# when its denominator is zero
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

First few rows of the dataset:
   duration language  average_rating  number_of_reviews  year    budget  \
0       120  English             7.5               1200  2020  10000000   
1       150  Spanish             6.8                800  2019   8000000   
2        90  English             8.0               1500  2021  12000000   
3       110   French             7.2                900  2018   9000000   
4       140  Spanish             6.9                600  2017   7000000   

    revenue   genre  
0  50000000  Action  
1  30000000  Comedy  
2  60000000  Action  
3  40000000   Drama  
4  25000000  Comedy  

Missing values in each column:
duration             0
language             0
average_rating       0
number_of_reviews    0
year                 0
budget               0
revenue              0
genre                0
dtype: int64

Classification Report:
              precision    recall  f1-score   support

      Action       1.00      1.00      1.00         1
      Comedy       1.00 