# Recommendation system using NLP

[Data Source - Kaggle](https://www.kaggle.com/shivamb/netflix-shows)

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
df = pd.read_csv("data/netflix_titles.csv.zip",compression='zip', sep= ',')
print(df.shape)
df.head()

(8807, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [5]:
df.type.unique()

array(['Movie', 'TV Show'], dtype=object)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [15]:
df.fillna("NA", inplace=True)

In [43]:
movies = df.loc[df.type == "Movie"]
movies.reset_index(drop=True, inplace=True)
movies.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


In [44]:
movies['target'] = movies['title'] +' '+ movies['director'] + ' ' + movies['description']  + ' ' + movies['listed_in']+ ' ' + movies['cast']
movies.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,target
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Dick Johnson Is Dead Kirsten Johnson As her fa...


In [45]:
movie_vectorizer = TfidfVectorizer(max_features=1000)
movie_matrix = movie_vectorizer.fit_transform(movies.target.values)
movie_matrix.shape

(6131, 1000)

In [46]:
similarity_matrix = cosine_similarity(movie_matrix)
print(similarity_matrix.shape)
print(type(similarity_matrix))
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df.shape

(6131, 6131)
<class 'numpy.ndarray'>


(6131, 6131)

In [64]:
movies.title #.sample(2)

0                   Dick Johnson Is Dead
1       My Little Pony: A New Generation
2                                Sankofa
3                           The Starling
4                           Je Suis Karl
                      ...               
6126                             Zinzana
6127                              Zodiac
6128                          Zombieland
6129                                Zoom
6130                              Zubaan
Name: title, Length: 6131, dtype: object

In [65]:
tgt_movie = "Zombieland"
tgt_idx = movies[movies.title == tgt_movie].index[0]
tgt_idx

6128

In [66]:
similarity_df[tgt_idx].sort_values(ascending=False)

6128    1.000000
5639    0.290704
3793    0.285768
4564    0.241797
5575    0.235897
          ...   
1457    0.000000
2221    0.000000
4507    0.000000
3235    0.000000
5749    0.000000
Name: 6128, Length: 6131, dtype: float64

In [67]:
similarity_df[tgt_idx].sort_values(ascending=False).index.values

array([6128, 5639, 3793, ..., 4507, 3235, 5749], dtype=int64)

In [68]:
top_5_movie_match_index = similarity_df[tgt_idx].sort_values(ascending=False).index.values[1:6]
top_5_movie_match_index

array([5639, 3793, 4564, 5575, 5825], dtype=int64)

In [69]:
top_5_movie_match_index = movies.iloc[top_5_movie_match_index]['title']
print("\n".join(top_5_movie_match_index))

The Bill Murray Stories: Life Lessons Learned From a Mythical Man
A Very Murray Christmas
Get Smart
Superbad
The New Romantic
