#### SVC Model

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

# Load the data
netflix_df = pd.read_csv('titles.csv')
netflix_movies_df = netflix_df[netflix_df['type'] == 'MOVIE'].copy()
christmas_df = pd.read_csv('christmas_movies.csv')

# Get the list of Christmas movie titles
christmas_movie_titles = christmas_df['title'].tolist()

# Add a column to indicate if the movie is a Christmas movie
netflix_movies_df['is_christmas'] = netflix_movies_df['title'].apply(
    lambda x: 1 if x in christmas_movie_titles else 0)

# Drop rows with missing values in 'title' or 'description'
netflix_movies_df = netflix_movies_df.dropna(subset=['description', 'title'])

# Use TF-IDF to transform both 'title' and 'description' into numerical features
text_transformer = TfidfVectorizer(stop_words='english')

# Prepare the features (X) and target (y)
X = netflix_movies_df[['title', 'description']]
y = netflix_movies_df['is_christmas']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model pipeline
model = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('title', text_transformer, 'title'),  # Apply TF-IDF on the title column
            ('description', text_transformer, 'description')  # Apply TF-IDF on the description column
        ]
    ),
    SVC(kernel='linear', random_state=42) )

# Train the model
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Classification report for precision, recall, f1-score, and support
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 99.87%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       744
           1       0.75      1.00      0.86         3

    accuracy                           1.00       747
   macro avg       0.88      1.00      0.93       747
weighted avg       1.00      1.00      1.00       747



#### Random Forest Model

In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

# Load the data
netflix_df = pd.read_csv('titles.csv')
netflix_movies_df = netflix_df[netflix_df['type'] == 'MOVIE'].copy()
christmas_df = pd.read_csv('christmas_movies.csv')

# Get the list of Christmas movie titles
christmas_movie_titles = christmas_df['title'].tolist()

# Add a column to indicate if the movie is a Christmas movie
netflix_movies_df['is_christmas'] = netflix_movies_df['title'].apply(
    lambda x: 1 if x in christmas_movie_titles else 0
)

# Drop rows with missing values in 'title' or 'description'
netflix_movies_df = netflix_movies_df.dropna(subset=['description', 'title'])

# Feature Engineering (Title and Description using TF-IDF)
text_transformer = TfidfVectorizer(stop_words='english')

# Prepare the features (X) and target (y)
X = netflix_movies_df[['title', 'description']]  # Using only title and description as features
y = netflix_movies_df['is_christmas']

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Build the Random Forest model pipeline
model_rf = make_pipeline(
    ColumnTransformer(
        transformers=[ 
            ('title', text_transformer, 'title'),  # Apply TF-IDF on the title column
            ('description', text_transformer, 'description')  # Apply TF-IDF on the description column
        ]
    ),
    RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)  # Random Forest with balanced class weights
)

# Step 4: Train the model
model_rf.fit(X_train, y_train)

# Step 5: Make predictions and evaluate the model
y_pred_rf = model_rf.predict(X_test)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf * 100:.2f}%')

# Classification report for precision, recall, f1-score, and support
print('Random Forest Classification Report:')
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 99.60%
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       744
           1       0.00      0.00      0.00         3

    accuracy                           1.00       747
   macro avg       0.50      0.50      0.50       747
weighted avg       0.99      1.00      0.99       747



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
