In [56]:
# import libraries
import numpy as np
import pandas as pd

In [57]:
# import data
data = pd.read_csv('kdrama_list.csv')
data.head(1)

Unnamed: 0.1,Unnamed: 0,Name,Year,Genre,Main Cast,Sinopsis,Score,Content Rating,Tags,Network,img url,Episode
0,0,Move to Heaven,2021,"Life, Drama","Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...",Han Geu Roo is an autistic 20-year-old. He wor...,9.2,18+ Restricted (violence & profanity),"Uncle-Nephew Relationship,, Autism,, Death,, S...",Netflix,https://i.mydramalist.com/Rle36_4c.jpg?v=1,10 episodes


In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1647 entries, 0 to 1646
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      1647 non-null   int64  
 1   Name            1647 non-null   object 
 2   Year            1647 non-null   int64  
 3   Genre           1647 non-null   object 
 4   Main Cast       1647 non-null   object 
 5   Sinopsis        1642 non-null   object 
 6   Score           1647 non-null   float64
 7   Content Rating  1647 non-null   object 
 8   Tags            1628 non-null   object 
 9   Network         1647 non-null   object 
 10  img url         1647 non-null   object 
 11  Episode         1647 non-null   object 
dtypes: float64(1), int64(2), object(9)
memory usage: 154.5+ KB


In [59]:
# check missing values
data.isnull().sum()

Unnamed: 0         0
Name               0
Year               0
Genre              0
Main Cast          0
Sinopsis           5
Score              0
Content Rating     0
Tags              19
Network            0
img url            0
Episode            0
dtype: int64

In [60]:
# Drop missing values
data = data.dropna(subset=['Tags'])
#data = data.dropna(subset=['Tags','Sinopsis'])

In [61]:
# write a function on which we will select 1st two genre in each Genre convert it into a string
def genre(x):
    return x.split(',')[0:2]
# apply the function on Genre column
data['Genre'] = data['Genre'].apply(genre)



In [64]:
data.head(1)

Unnamed: 0.1,Unnamed: 0,Name,Year,Genre,Main Cast,Sinopsis,Score,Content Rating,Tags,Network,img url,Episode
0,0,Move to Heaven,2021,"[Life, Drama]","Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...",Han Geu Roo is an autistic 20-year-old. He wor...,9.2,18+ Restricted (violence & profanity),"Uncle-Nephew Relationship,, Autism,, Death,, S...",Netflix,https://i.mydramalist.com/Rle36_4c.jpg?v=1,10 episodes


In [65]:
# convert Genre into string
data['Genre'] = data['Genre'].apply(lambda x: ', '.join(x))

In [66]:
data['Genre'][1]

'Action,  Youth'

In [67]:
# 2. Combine the `Genre` and `Tags` and 'Snopsis columns into a single feature for similarity
# We will also lower the text to avoid case sensitivity
#data['combined_features'] = data['Genre'].str.lower() + ' ' + data['Tags'].str.lower() + ' ' + data['Sinopsis'].str.lower()
data['combined_features'] = data['Genre'].str.lower() + ' ' + data['Tags'].str.lower()
data.head(2)

Unnamed: 0.1,Unnamed: 0,Name,Year,Genre,Main Cast,Sinopsis,Score,Content Rating,Tags,Network,img url,Episode,combined_features
0,0,Move to Heaven,2021,"Life, Drama","Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...",Han Geu Roo is an autistic 20-year-old. He wor...,9.2,18+ Restricted (violence & profanity),"Uncle-Nephew Relationship,, Autism,, Death,, S...",Netflix,https://i.mydramalist.com/Rle36_4c.jpg?v=1,10 episodes,"life, drama uncle-nephew relationship,, autis..."
1,1,Weak Hero Class 1,2022,"Action, Youth","Park Ji Hoon, Choi Hyun Wook, Hong Kyung, Kim ...",Yeon Shi Eun is a model student who ranks at t...,9.1,18+ Restricted (violence & profanity),"Smart Male Lead,, Bromance,, School Bullying,,...","Wavve, iQIYI, Viki",https://i.mydramalist.com/pq2lr_4c.jpg?v=1,8 episodes,"action, youth smart male lead,, bromance,, sc..."


In [30]:
data['combined_features'][0]

'life uncle-nephew relationship,, autism,, death,, savant syndrome,, mourning,, tearjerker,, life lesson,, cleaning and organizing,, autism spectrum disorder,, murder'

In [31]:
# 3. Convert the combined features into a matrix of token counts
# We'll use CountVectorizer to create a 'bag-of-words' model
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))  # split by commas and spaces
features_matrix = vectorizer.fit_transform(data['combined_features'])



In [32]:
# Preview the combined features and shape of the matrix
print("Combined Features Example:\n", data['combined_features'].head())
print("\nShape of the Features Matrix:", features_matrix.shape)

Combined Features Example:
 0    life uncle-nephew relationship,, autism,, deat...
1    action smart male lead,, bromance,, school bul...
2    romance multiple mains,, band,, music,, strong...
3    thriller deception,, family secret,, mystery,,...
4    romance nice male lead,, multiple mains,, slow...
Name: combined_features, dtype: object

Shape of the Features Matrix: (1628, 3225)


In [33]:
from sklearn.metrics.pairwise import cosine_similarity

# 1. Compute the cosine similarity matrix based on the features matrix
cosine_sim = cosine_similarity(features_matrix)

In [34]:
# 2. Create a function to get top 5 similar dramas based on input
def get_recommendations(drama_name, cosine_sim=cosine_sim):
    # Get the index of the drama that matches the title
    idx = data[data['Name'].str.lower() == drama_name.lower()].index[0]
    
    # Get the similarity scores for this drama with all others
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the dramas based on similarity scores (in descending order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the 5 most similar dramas (excluding itself)
    sim_scores = sim_scores[1:6]  # Skip the first one because it's the same drama
    
    # Get the drama indices
    drama_indices = [i[0] for i in sim_scores]
    
    # Return the top 5 most similar dramas
    return data['Name'].iloc[drama_indices]


In [35]:

# Example usage
drama_name = 'Move to Heaven'  # You can change this to any drama name from your dataset
recommended_dramas = get_recommendations(drama_name)

# Print the recommendations
print(f"Top 5 recommendations for '{drama_name}':")
for i, rec in enumerate(recommended_dramas, 1):
    print(f"{i}. {rec}")


Top 5 recommendations for 'Move to Heaven':
1. Woman with a Suitcase
2. Bad Prosecutor
3. The Red Sleeve
4. Goblin
5. Moon Lovers: Scarlet Heart Ryeo
