<a href="https://colab.research.google.com/github/Sokolik-data/ML/blob/main/rec_netf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
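# Optional (Google Colab only): uncomment the two lines below to upload
# netflix_titles.csv into the session before running the rest of the notebook.
#from google.colab import files
#files.upload()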

In [2]:
import pandas as pd
# Load the Netflix titles catalogue from a CSV file in the working directory.
df = pd.read_csv('netflix_titles.csv')
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [7]:
# Remove exact duplicate rows, then renumber the index from 0.
df = df.drop_duplicates()
df = df.reset_index(drop=True)

# Columns kept for the recommender (the loaded columns minus `date_added`).
keep_cols = [
    'show_id', 'type', 'title', 'director', 'cast', 'country',
    'release_year', 'rating', 'duration', 'listed_in', 'description',
]
# Only select the columns that are actually present in the frame.
df = df[[name for name in keep_cols if name in df.columns]]
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [8]:
# Replace NaN with '' in the optional text columns, skipping any column that is absent.
fill_values = {
    name: ''
    for name in ('director', 'cast', 'country', 'rating', 'duration')
    if name in df.columns
}
df = df.fillna(fill_values)

print("Remaining columns:", df.columns.tolist())
df.head(2)

Remaining columns: ['show_id', 'type', 'title', 'director', 'cast', 'country', 'release_year', 'rating', 'duration', 'listed_in', 'description']


Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [11]:
import ast, re
def extract_names(x):
    """Flatten a raw metadata cell into a plain, space-separated string.

    The value is first tried as a Python literal via ``ast.literal_eval``:

    * a list contributes the ``'name'`` entry of every dict item and every
      plain-string item (empty/None names are skipped);
    * a dict contributes its ``str``/``int`` values.

    Anything else (for example free text such as
    ``"Ama Qamata, Khosi Ngema"``, which is not a valid literal) is returned
    with brackets, braces and quote characters replaced by spaces.

    Parameters
    ----------
    x : str or missing value
        Raw cell content.

    Returns
    -------
    str
        The flattened text; ``''`` for a missing or empty cell.
    """
    if pd.isna(x) or x == '':
        return ''
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: fall through to the character clean-up below.
        parsed = None
    if isinstance(parsed, list):
        names = []
        for item in parsed:
            if isinstance(item, dict):
                name = item.get('name')
            elif isinstance(item, str):
                # Plain-string items used to be dropped, which turned a
                # list such as ['Dramas', 'Comedies'] into ''.
                name = item
            else:
                name = None
            if name:
                names.append(str(name))
        return ' '.join(names)
    if isinstance(parsed, dict):
        return ' '.join(str(v) for v in parsed.values() if isinstance(v, (str, int)))
    return re.sub(r'[\[\]\{\}\"\']', ' ', str(x))

# Add a `<column>_clean` text version of each of these columns that is present.
for source_col in ('listed_in', 'cast'):
    if source_col not in df.columns:
        continue
    df[f'{source_col}_clean'] = df[source_col].apply(extract_names)

# Columns, in order, whose text is concatenated into each title's "soup";
# columns missing from the frame are left out.
components = [
    name
    for name in ('description', 'listed_in_clean', 'rating', 'cast_clean', 'director', 'type')
    if name in df.columns
]
def create_soup(row):
    """Join the row's values for every column in `components` into one string.

    Each value is converted with ``str`` and the pieces are separated by a
    single space; a column missing from `row` contributes an empty string.
    """
    pieces = []
    for column in components:
        pieces.append(str(row.get(column, '')))
    return ' '.join(pieces)

# Build the combined text field for every row (axis=1 passes each row to create_soup).
df['soup'] = df.apply(create_soup, axis=1)

# Show the first few titles next to their generated soup.
df[['title','soup']].head(3)


Unnamed: 0,title,soup
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm..."
1,Blood & Water,"After crossing paths at a party, a Cape Town t..."
2,Ganglands,To protect his family from a powerful drug lor...


In [12]:
import sklearn.preprocessing
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorise the soup text with TF-IDF, dropping English stop words and
# capping the vocabulary at 50,000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=50000)
# Fit on the soup column and transform it in one step.
tfidf_matrix = tfidf.fit_transform(df['soup'])
# Sanity check on the matrix dimensions.
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (8807, 49950)


In [13]:
def recommend(movie_index, tfidf_matrix, top_n=10):
    """Return the row positions of the `top_n` items most similar to one item.

    Similarity is the dot product (``linear_kernel``) between the query row
    and every row of `tfidf_matrix`.

    Parameters
    ----------
    movie_index : int
        Row position of the query item in `tfidf_matrix` (negative values
        count from the end).
    tfidf_matrix : matrix with one row per item
        Feature matrix accepted by ``linear_kernel``.
    top_n : int, default 10
        Maximum number of positions to return; values below 0 are treated as 0.

    Returns
    -------
    numpy.ndarray
        Row positions ordered from most to least similar. The query row is
        never included.
    """
    sim = linear_kernel(tfidf_matrix[movie_index], tfidf_matrix).flatten()
    ranked = sim.argsort()[::-1]
    # Remove the query row by position instead of assuming it sorts first:
    # an exact duplicate (tied similarity) or an all-zero row can push the
    # query away from the top, which would otherwise return the query itself.
    query_pos = movie_index % sim.shape[0]
    ranked = ranked[ranked != query_pos]
    return ranked[:max(top_n, 0)]

In [14]:
# Try the recommender on the first row of the TF-IDF matrix.
movie_index = 0
indices = recommend(movie_index, tfidf_matrix, top_n=10)
print("Top recommended indices:", indices)

Top recommended indices: [4877 7454 7015 8042 5233 2350 3927 4114 5797 7481]


In [17]:
def show_recommendations(movie_index, tfidf_matrix, top_n=10):
    """Return the title and description of the rows recommended for one item.

    `movie_index`, `tfidf_matrix` and `top_n` are passed straight to
    ``recommend``; the positions it returns are looked up in `df` by position.
    """
    positions = recommend(movie_index, tfidf_matrix, top_n)
    recommended = df.iloc[positions]
    return recommended[['title', 'description']]

# Show the recommendations for the title at row 0.
show_recommendations(0, tfidf_matrix, top_n=10)

Unnamed: 0,title,description
4877,End Game,"Facing an inevitable outcome, terminally ill p..."
7454,Midnight Special,"When his son exhibits supernatural powers, a f..."
7015,How to Be a Player,Dray lives life one woman at a time and is the...
8042,Small Soldiers,"When the Commando Elite, a group of toy action..."
5233,The Death and Life of Marsha P. Johnson,As she fights the tide of violence against tra...
2350,Woodshock,"Shattered after her mother's death, a woman fl..."
3927,New Girl,"Still rebounding from a breakup, Jessica Day m..."
4114,Barbie Dreamhouse Adventures,Get to know Barbie and her BFFs – including ne...
5797,Extremis,Witness the wrenching emotions that accompany ...
7481,Mona Lisa Smile,"In 1953, the women of Wellesley College are me..."


In [18]:
def get_index_from_title(title):
    """Return the row position in `df` of the first title equal to `title`.

    The comparison ignores case. A row position (not an index label) is
    returned because callers use the result to index the TF-IDF matrix and
    with ``.iloc``; the two coincide only while `df` has a default 0..n-1 index.

    Raises
    ------
    IndexError
        If no title matches.
    """
    matches = (df['title'].str.lower() == title.lower()).to_numpy().nonzero()[0]
    if len(matches) == 0:
        # Same exception type as before, but with a message naming the title.
        raise IndexError(f"Title {title!r} not found in df['title'].")
    return matches[0]

# Look a title up by name, then show its recommendations.
movie_index = get_index_from_title("New Girl")
show_recommendations(movie_index, tfidf_matrix, top_n=10)

Unnamed: 0,title,description
8199,The Bachelor,A single man searches for his soulmate through...
795,Happy Endings,"After his fiancée, Alex, dumps him at the alta..."
321,Two Fathers,When two single guys learn a woman they both s...
7955,Scary Movie,The Wayans brothers spoof some of Hollywood's ...
2294,Desperados,"After drunkenly sending a cringeworthy email, ..."
1144,Yes Man,"After a bitter divorce, a loan officer falls u..."
829,Dog Gone Trouble,The privileged life of a pampered dog named Tr...
2052,"Love, Guaranteed",Sparks fly when a crusading but cash-strapped ...
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm..."
3119,Twice Upon A Time,"Months after a crushing breakup, a man receive..."


In [21]:
def recommend_movies(title, n=10):
    """Recommend up to `n` titles similar to the one named `title`.

    Parameters
    ----------
    title : str
        Title to look up in ``df['title']``; the match is exact but ignores case.
    n : int, default 10
        Number of recommendations requested from ``recommend``.

    Returns
    -------
    pandas.DataFrame or str
        The ``title``, ``release_year`` and ``description`` columns of the
        recommended rows, or the message ``"Movie '<title>' not found."``
        when no title matches.
    """
    # Use row positions rather than index labels: `recommend` indexes the
    # TF-IDF matrix by position and the result is looked up with `.iloc`.
    positions = (df['title'].str.lower() == title.lower()).to_numpy().nonzero()[0]
    if len(positions) == 0:
        return f"Movie '{title}' not found."

    # If several rows share the title, the first one is used.
    movie_indices = recommend(positions[0], tfidf_matrix, top_n=n)
    return df[['title','release_year','description']].iloc[movie_indices]

# Example: request five recommendations for one title by name.
recommend_movies("How to Be a Player", 5)


Unnamed: 0,title,release_year,description
0,Dick Johnson Is Dead,2020,"As her father nears the end of his life, filmm..."
359,The Original Kings of Comedy,2000,"Comedians Steve Harvey, Cedric the Entertainer..."
67,Saved by the Bell,1994,"From middle school to college, best friends Za..."
146,House Party 3,1994,"After Kid gets engaged, Play plans to throw th..."
4529,Dancing Queen,2018,Snatching trophies. Getting gorgeous. Turning ...
