# Continued.
### See the first notebook (collaborative.ipynb) for data understanding

#### Here we import and clean the data, based on what we learned in that earlier notebook
#### Then we deploy a database containing the similarity matrix for the vectorized content filter. We included both the movie title and the description to produce better recommendations

In [2]:
# Import
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the ratings, titles and users tables from their CSV files
df_ratings, df_titles, df_users = (
    pd.read_csv(csv_path)
    for csv_path in ('movies_ratings.csv', 'movies_titles.csv', 'movies_users.csv')
)

In [3]:
# Preview the first five rows of the titles table
df_titles.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,description,...,Language TV Shows,Musicals,Nature TV,Reality TV,Spirituality,TV Action,TV Comedies,TV Dramas,Talk Shows TV Comedies,Thrillers
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,"Michael Hilow, Ana Hoffman, Dick Johnson, Kirs...",United States,2020,PG-13,90 min,As her father nears the end of his life filmma...,...,0,0,0,0,0,0,0,0,0,0
1,s2,TV Show,Blood & Water,,Ama Qamata Khosi Ngema Gail Mabalane Thabang M...,South Africa,2021,TV-MA,2 Seasons,After crossing paths at a party a Cape Town te...,...,0,0,0,0,0,0,0,1,0,0
2,s3,TV Show,Ganglands,Julien Leclercq,Sami Bouajila Tracy Gotoas Samuel Jouy Nabiha ...,,2021,TV-MA,1 Season,To protect his family from a powerful drug lor...,...,0,0,0,0,0,1,0,0,0,0
3,s4,TV Show,Jailbirds New Orleans,,,,2021,TV-MA,1 Season,Feuds flirtations and toilet talk go down amon...,...,0,0,0,1,0,0,0,0,0,0
4,s5,TV Show,Kota Factory,,Mayur More Jitendra Kumar Ranjan Raj Alam Khan...,India,2021,TV-MA,2 Seasons,In a city of coaching centers known to train I...,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# Explore. (check the collaborative ipynb file for all data exploration and understanding)

## Cleaned the data by dropping the director column and deleting the remaining rows that are missing cast and/or country

In [5]:
# Data cleaning
# Drop the 'director' column, then drop every row that still has a missing value.
# errors='ignore' keeps this cell re-runnable: on a second run 'director' is
# already gone, and an in-place drop would raise KeyError.
# NOTE(review): dropna() with no subset removes rows with a NaN in *any* column,
# not only cast/country -- confirm that is the intended rule.
df_titles = df_titles.drop(columns='director', errors='ignore').dropna()

# Keep only the ratings that refer to a title that survived the cleaning
df_ratings = df_ratings[df_ratings['show_id'].isin(df_titles['show_id'])]
# NOTE: if show_id was set as the index in an earlier step, call reset_index()
# on the frame first so that 'show_id' is available as a column here.

In [6]:
# Collect the show_ids that occur in both the titles and the ratings tables
title_ids = set(df_titles['show_id'])
rating_ids = set(df_ratings['show_id'])
common_ids = title_ids & rating_ids

# Restrict each dataframe to the rows whose show_id is in that shared set
titles_in_common = df_titles['show_id'].isin(common_ids)
ratings_in_common = df_ratings['show_id'].isin(common_ids)
df_titles = df_titles[titles_in_common]
df_ratings = df_ratings[ratings_in_common]

## Modeling and evaluation step: we built the TF-IDF matrix from the title and description

In [7]:
# Modeling
# May need to add our own dictionary
# TF-IDF vectorizer that removes scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Build the text to vectorize from title + description.
# assign() returns a new frame instead of writing into the filtered slice held
# in df_titles, which avoids pandas' SettingWithCopyWarning.
df_titles = df_titles.assign(
    combined=df_titles['title'] + ' ' + df_titles['description']
)

# Learn the vocabulary and transform the combined text into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df_titles['combined'])

# Print the shape of the tfidf_matrix: (number of titles, vocabulary size)
print(tfidf_matrix.shape)

(1592, 8587)


# Deployment

In [8]:
# Deployment
# Pairwise similarity between every pair of titles, computed on the combined
# title + description TF-IDF vectors. TfidfVectorizer L2-normalises each row by
# default, so the plain dot product from linear_kernel is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Label both axes with show_id so a title's similarity scores can be looked up by id
df_results = pd.DataFrame(
    cosine_sim,
    index=df_titles['show_id'],
    columns=df_titles['show_id'],
)

## Database creation for the content recommendation

In [9]:
from sqlalchemy import create_engine

# Persist the similarity matrix to a local SQLite database file
engine = create_engine("sqlite:///content_recs.db")

# if_exists='replace' rebuilds the table on every run. With 'append', each
# re-run of the notebook would stack another full copy of the matrix into
# content_recs1. index=True stores the show_id row labels as a column.
df_results.to_sql('content_recs1', con=engine, if_exists='replace', index=True)

1592