In [None]:
import pandas as pd

# Location of the raw Netflix catalogue, relative to the notebook's working directory
NETFLIX_CSV_PATH = 'netflix_titles.csv'

# Read the whole CSV into the DataFrame used by every later cell
df = pd.read_csv(NETFLIX_CSV_PATH)

# Preview the first 5 rows to sanity-check the load
display(df.head(5))

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


Now, let's look at the general information about the dataset, including data types and non-null values.

In [None]:
# Print a summary of the DataFrame: column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in display() only renders a stray "None" afterwards.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


None

# Task
Build a classification model to predict whether a Netflix title is a "Movie" or a "TV Show" using features such as title, description, cast, director, country, and listed_in from the `netflix_titles.csv` dataset, and evaluate its performance.

## Handle Missing Values

### Subtask:
Address missing values in columns like 'director', 'cast', and 'country' by filling them with a suitable placeholder, as these columns will be used for features.


**Reasoning**:
The subtask requires addressing missing values in the 'director', 'cast', and 'country' columns. I will fill these missing values with the placeholder string 'Unknown' and then verify that no missing values remain in these columns.



In [None]:
columns_to_fill = ['director', 'cast', 'country']

# Fill missing values with 'Unknown'.
# The filled column is assigned back to the frame rather than calling
# df[col].fillna(..., inplace=True): that chained form operates on an
# intermediate object, emits a FutureWarning, and no longer updates df
# in pandas 3.0.
for col in columns_to_fill:
    df[col] = df[col].fillna('Unknown')

# Verify that missing values have been handled (every count should be 0)
print("Missing values after filling:")
display(df[columns_to_fill].isnull().sum())

Missing values after filling:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


Unnamed: 0,0
director,0
cast,0
country,0


**Reasoning**:
The previous code produced a FutureWarning regarding the use of `inplace=True` with chained assignment. To address this and follow best practices, I will modify the code to directly assign the result of `fillna()` back to the DataFrame column, which achieves the same outcome without the warning.



In [None]:
columns_to_fill = ['director', 'cast', 'country']

# Replace NaN in all three feature columns at once and write the result back
# to the same columns (plain assignment, so no chained-inplace warning).
placeholder = 'Unknown'
df[columns_to_fill] = df[columns_to_fill].fillna(placeholder)

# Count the nulls left per column; each entry is expected to be 0
remaining_nulls = df[columns_to_fill].isnull().sum()
print("Missing values after filling:")
display(remaining_nulls)

Missing values after filling:


Unnamed: 0,0
director,0
cast,0
country,0


## Feature Engineering and Preprocessing

### Subtask:
Combine text-based features such as 'title', 'description', 'cast', 'director', and 'listed_in' into a single feature, and apply TF-IDF vectorization. Also, prepare the 'country' column by applying TF-IDF vectorization, then combine all vectorized features.


**Reasoning**:
First, I need to combine the specified text-based features ('title', 'description', 'cast', 'director', and 'listed_in') into a single new column named `combined_text_features` in the DataFrame `df`. I will ensure all values are treated as strings before concatenation, separated by a space.



In [None]:
# Columns merged, in this order, into one text document per title.
# NOTE(review): the 'listed_in' values previewed earlier (e.g. "TV Dramas",
# "International TV Shows") appear to name the target class directly, so this
# feature may make the Movie / TV Show prediction trivial - confirm that is intended.
text_columns = ['title', 'description', 'cast', 'director', 'listed_in']

# Cast each column to str, then join the pieces row-wise with single spaces
first_part, *other_parts = (df[name].astype(str) for name in text_columns)
df['combined_text_features'] = first_part.str.cat(other_parts, sep=' ')

print("Combined text features created. Displaying the first few entries:")
display(df[text_columns + ['combined_text_features']].head())

Combined text features created. Displaying the first few entries:


Unnamed: 0,title,description,cast,director,listed_in,combined_text_features
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...",Unknown,Kirsten Johnson,Documentaries,Dick Johnson Is Dead As her father nears the e...
1,Blood & Water,"After crossing paths at a party, a Cape Town t...","Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",Unknown,"International TV Shows, TV Dramas, TV Mysteries","Blood & Water After crossing paths at a party,..."
2,Ganglands,To protect his family from a powerful drug lor...,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Julien Leclercq,"Crime TV Shows, International TV Shows, TV Act...",Ganglands To protect his family from a powerfu...
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...",Unknown,Unknown,"Docuseries, Reality TV","Jailbirds New Orleans Feuds, flirtations and t..."
4,Kota Factory,In a city of coaching centers known to train I...,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",Unknown,"International TV Shows, Romantic TV Shows, TV ...",Kota Factory In a city of coaching centers kno...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack


def split_countries(country_field):
    """Split a comma-separated country string into one token per country.

    Whitespace around each name is stripped and empty pieces (e.g. produced
    by a stray leading or trailing comma) are dropped.
    """
    return [name.strip() for name in country_field.split(',') if name.strip()]


# TF-IDF over the combined free-text document of each title
tfidf_vectorizer_text = TfidfVectorizer()
tfidf_matrix_text = tfidf_vectorizer_text.fit_transform(df['combined_text_features'])

# TF-IDF over 'country', treating every whole country name as a single token.
# The default word tokenizer would break "United States" into "united" and
# "states", so unrelated countries (e.g. "South Africa" / "South Korea") would
# share features. token_pattern=None silences the "token_pattern is unused"
# warning that scikit-learn raises when a custom tokenizer is supplied.
tfidf_vectorizer_country = TfidfVectorizer(tokenizer=split_countries, token_pattern=None)
tfidf_matrix_country = tfidf_vectorizer_country.fit_transform(df['country'])

# Horizontally stack the two TF-IDF matrices; CSR is requested explicitly so
# the result supports the row indexing done when the data is split later.
# NOTE(review): both vectorizers are fitted on every row, before the train/test
# split performed in a later cell, so vocabulary and IDF weights also reflect
# the test rows. Consider fitting on the training rows only.
X = hstack([tfidf_matrix_text, tfidf_matrix_country], format='csr')

print("TF-IDF vectorization complete and features combined into matrix X.")
print(f"Shape of combined feature matrix X: {X.shape}")

TF-IDF vectorization complete and features combined into matrix X.
Shape of combined feature matrix X: (8807, 53351)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'type' labels as integers; the result is the target vector y
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['type'])

# Show how many rows fall into each encoded class
print("Value counts for the encoded 'type' column (y):")
encoded_counts = pd.Series(y).value_counts()
print(encoded_counts)

# classes_ is ordered by encoded value, so the position is the integer code
print("\nOriginal labels to encoded values mapping:")
for code, label in enumerate(label_encoder.classes_):
    print(f"{label}: {code}")

Value counts for the encoded 'type' column (y):
0    6131
1    2676
Name: count, dtype: int64

Original labels to encoded values mapping:
Movie: 0
TV Show: 1


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of each of the four resulting pieces
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}: {split_data.shape}")

Shape of X_train: (7045, 53351)
Shape of X_test: (1762, 53351)
Shape of y_train: (7045,)
Shape of y_test: (1762,)


In [None]:
from sklearn.linear_model import LogisticRegression

# Build the classifier (liblinear solver, fixed seed) and fit it on the
# training split in a single chained call; fit() returns the estimator itself.
model = LogisticRegression(random_state=42, solver='liblinear').fit(X_train, y_train)

print("Logistic Regression model trained successfully.")

Logistic Regression model trained successfully.


In [None]:
# Predict the encoded class label for every row of the held-out test split;
# y_pred is compared against y_test in the evaluation cell.
y_pred = model.predict(X_test)

print("Predictions made on the test set.")

Predictions made on the test set.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Headline metrics on the test split. The scorers are called with their
# defaults, so precision/recall/F1 are reported for the class encoded as 1.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print each metric to four decimal places, in a fixed order
metric_labels = ("Accuracy", "Precision", "Recall", "F1-Score")
for metric_label, metric_value in zip(metric_labels, (accuracy, precision, recall, f1)):
    print(f"{metric_label}: {metric_value:.4f}")

# Per-class precision / recall / F1 breakdown
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

# Confusion matrix of true labels (rows) against predictions (columns)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9841
Precision: 0.9980
Recall: 0.9495
F1-Score: 0.9732

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1227
           1       1.00      0.95      0.97       535

    accuracy                           0.98      1762
   macro avg       0.99      0.97      0.98      1762
weighted avg       0.98      0.98      0.98      1762


Confusion Matrix:
[[1226    1]
 [  27  508]]
