# K-drama recommendation system

In [3]:
import pandas as pd 
import numpy as np

df = pd.read_csv("k-drama-data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          100 non-null    object 
 1   Year          100 non-null    int64  
 2   Rating        100 non-null    float64
 3   Watchers      100 non-null    object 
 4   Episodes      100 non-null    int64  
 5   Ep. Duration  100 non-null    object 
 6   Aired On      100 non-null    object 
 7   Genres        100 non-null    object 
 8   Tags          100 non-null    object 
 9   Main Actors   100 non-null    object 
dtypes: float64(1), int64(2), object(7)
memory usage: 7.9+ KB


In [4]:
# Data Loading and Initial Exploration

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval

# load the dataset
df.columns

Index(['Name', 'Year', 'Rating', 'Watchers', 'Episodes', 'Ep. Duration',
       'Aired On', 'Genres', 'Tags', 'Main Actors'],
      dtype='object')

In [5]:
# Data Cleaning and Preprocessing

from ast import literal_eval
import pandas as pd

# clean numerical columns
df['Watchers'] = df['Watchers'].astype(str).str.replace(',', '').replace('nan', '0')
df['Watchers'] = pd.to_numeric(df['Watchers'], errors='coerce')

df['Ep. Duration'] = df['Ep. Duration'].astype(str).str.replace(' min', '').replace('nan', '0')
df['Ep. Duration'] = pd.to_numeric(df['Ep. Duration'], errors='coerce')

# Safe function to handle literal_eval errors
def safe_literal_eval(val):
    if pd.isna(val) or val == '' or str(val).lower() == 'nan':
        return []
    try:
         
        if isinstance(val, str) and val.startswith('[') and val.endswith(']'):
            return literal_eval(val)
        else:
            return []
    except (ValueError, SyntaxError):
        return []

# convert string lists to actual lists safely
df['Genres'] = df['Genres'].apply(safe_literal_eval)
df['Tags'] = df['Tags'].apply(safe_literal_eval)
df['Main Actors'] = df['Main Actors'].apply(safe_literal_eval)

# categorical features
df['Aired On'] = df['Aired On'].astype('category')

# check for missing values
print(df.isnull().sum())

Name              0
Year              0
Rating            0
Watchers          0
Episodes          0
Ep. Duration    100
Aired On          0
Genres            0
Tags              0
Main Actors       0
dtype: int64


In [6]:
# Feature Engineering

# Create a combined text feature for content-based filtering
df['Combined_Features'] = df.apply(lambda row: 
    ' '.join(row['Genres']) + ' ' + 
    ' '.join(row['Tags']) + ' ' + 
    ' '.join(row['Main Actors']) + ' ' + 
    row['Aired On'], axis=1)

# Create popularity score based on rating and watchers
df['Popularity_Score'] = (df['Rating'] * 0.7) + (df['Watchers'] * 0.3)

In [7]:
# Vectorization and Similarity Matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df['Combined_Features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a reverse mapping of indices and drama titles
indices = pd.Series(df.index, index=df['Name']).drop_duplicates()

In [8]:
# Building the Recommendation Function

def get_recommendations(title, cosine_sim=cosine_sim, df=df, indices=indices):
    # get the index of the drama that matches the title
    idx = indices[title]
    
    # get the pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
                      
    # sort the dramas based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    
    # get the scores of the 10 most similar dramas
    sim_scores = sim_scores[1:11]
    
    # get the drama indices 
    drama_indices = [i[0] for i in sim_scores]
    
    # return the top 10 most similar dramas
    return df[['Name', 'Genres', 'Rating', 'Popularity_Score']].iloc[drama_indices]

# test the function
print(get_recommendations('Crash Landing on You'))

                       Name Genres  Rating  Popularity_Score
6      Crash Landing on You     []     9.0          62680.20
8            Hotel del Luna     []     8.6          55085.42
11                 Vincenzo     []     8.9          51525.23
29     Hometown Cha-Cha-Cha     []     8.7          39627.39
31         Alchemy of Souls     []     9.1          38712.37
38  Romance Is a Bonus Book     []     8.1          35726.97
46   Twenty-Five Twenty-One     []     8.8          33850.36
48                 Start-Up     []     8.1          33564.57
52      The Uncanny Counter     []     8.8          32388.16
56            Secret Garden     []     8.2          31884.34


In [9]:
# Hybrid Recommendation System(Content and Popularity)

def hybrid_recommendations(title, cosine_sim=cosine_sim, df=df, indices=indices):
    # Get content_based recommendations
    content_recs = get_recommendations(title, cosine_sim, df, indices)
    
    # Get popularity scores for these recommendations
    content_recs = content_recs.merge(
        df[['Name', 'Popularity_Score']],
        on='Name',
        how='left'
    )
    
    # Sort by a combination of similarity and popularity
    content_recs['Hybrid_Score'] = content_recs['Popularity_Score_y'] * 0.3 + content_recs['Rating'] * 0.7
    content_recs = content_recs.sort_values('Hybrid_Score', ascending=False)
    
    return content_recs.head(10)

# Test the hybrid function
print(hybrid_recommendations('Itaewon Class'))

                              Name Genres  Rating  Popularity_Score_x  \
0        Strong Woman Do Bong Soon     []     8.7            71937.99   
1                    Itaewon Class     []     8.4            45301.98   
2          My ID is Gangnam Beauty     []     7.8            43178.16   
3                       Reply 1988     []     9.1            37040.17   
4                           The K2     []     8.2            36201.34   
5  Cinderella and the Four Knights     []     7.8            35631.06   
6        The King: Eternal Monarch     []     8.1            34069.17   
7                      Oh My Ghost     []     8.3            30868.91   
8                         My Demon     []     8.2            30139.54   
9                         Tomorrow     []     8.8            29892.76   

   Popularity_Score_y  Hybrid_Score  
0            71937.99     21587.487  
1            45301.98     13596.474  
2            43178.16     12958.908  
3            37040.17     11118.421  
4     

In [10]:
# Evaluation Metrics

from sklearn.model_selection import train_test_split

# Evaluation based on genre matching
def evaluate_recommendations(title, recommendations):
    # Get genres of the input drama
    input_genres = set(df[df['Name'] == title]['Genres'].iloc[0])
    
    if not input_genres:
        return 0.0
    
    # Calculate genre overlap for recommendations
    genre_overlap = []
    for _, row in recommendations.iterrows():
        rec_genres = set(row['Genres'])
        overlap = len(input_genres.intersection(rec_genres)) / len(input_genres.union(rec_genres))
        genre_overlap.append(genre_overlap)
        
        return np.mean(genre_overlap)

# Test evaluation
recs = hybrid_recommendations('Crash Landing on You')
print(f"Average genre overlap: {evaluate_recommendations('Crash Landing on You', recs):.2f}")

Average genre overlap: 0.00


In [11]:
# Test with debug mode on
print("Testing with 'Crash Landing on You':")
recs = hybrid_recommendations('Crash Landing on You')
overlap_score = evaluate_recommendations('Crash Landing on You', recs)
print(f"\nFinal Average Genre Overlap: {overlap_score:.2f}")

# Test edge cases
print("\nTesting edge cases:")
empty_recs = pd.DataFrame([{'Name': 'Test', 'Genres': []}])
print(f"Empty genres: {evaluate_recommendations('Crash Landing on You', empty_recs):.2f}")

Testing with 'Crash Landing on You':

Final Average Genre Overlap: 0.00

Testing edge cases:
Empty genres: 0.00


In [12]:
# Save the model to pkl

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Save all necessary components to a dictionary
model_components = {
    'cosine_sim': cosine_sim,   
    'df': df,                   
    'indices': indices,         
    'tfidf': tfidf  
}

# Save to pickle file
with open('kdrama_recommender.pkl', 'wb') as f:
    pickle.dump(model_components, f)

print("Model saved successfully to kdrama_recommender.pkl")

Model saved successfully to kdrama_recommender.pkl
