In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import cluster

# Assign helpful IDs and process the dataset

In [15]:
# Load the raw scraped episode table (semicolon-separated).
data = pd.read_csv('../data/webscraped_raw.csv', sep=';')

# Parse the raw 'Date' strings; unparseable values become NaT instead of raising.
data['date'] = pd.to_datetime(data.Date, errors='coerce').dt.date

# Extract the first run of digits from the season / episode labels as nullable
# integers. Raw strings keep '\d' from being read as an invalid string escape.
data['Season_no'] = data['Season'].str.extract(r'(\d+)').astype('float').astype('Int64')
data['Episode_no'] = data['Episode'].str.extract(r'(\d+)').astype('float').astype('Int64')

# A show can appear under several genres; keep the most frequent genre per show.
# The counts are sorted in descending order so that `first()` picks the genre with
# the highest count (an ascending sort would pick the rarest one). The stable
# 'mergesort' keeps the tie-breaking order deterministic.
show_genre = data.groupby(['Title', 'Genre']).count().Title_all.reset_index()
show_genre = show_genre.sort_values('Title_all', ascending=False, kind='mergesort')
show_genre = show_genre.groupby('Title').first().reset_index()
show_genre = show_genre[['Title', 'Genre']].rename({'Genre': 'genre'}, axis=1)
data = data.merge(show_genre, on='Title')

# Drop duplicate entries: keep the first row per (show, season+episode) pair.
data = data.drop_duplicates(subset=['Title', 'Season+Episode'], keep='first')

# Assign Content_ID (row index after de-duplication, so it may contain gaps),
# Show_ID (one id per distinct title) and Episode_ID (running number within a show,
# ordered by season and episode number).
data['Content_ID'] = data.index
season_id = pd.Series(data['Title'].unique()).rename('Title').reset_index().rename({'index': 'Show_ID'}, axis=1)
data = data.merge(season_id, on='Title')
data['Episode_ID'] = data.sort_values(['Season_no', 'Episode_no']).groupby('Show_ID').cumcount() + 1

# Select the output columns and rename them.
data = data[['Show_ID', 'Title', 'Content_ID', 'Episode_ID', 'Season_no', 'Episode_no', 'Season+Episode',
             'Episode', 'genre', 'DurationMin', 'date', 'Year', 'Description', 'Image']]
# 'Episode_no' becomes 'Episode' while the original 'Episode' label becomes
# 'Episode_name'. 'Season_no' keeps its name: the former 'season_no' key never
# matched a column, so it is omitted here and the output schema is unchanged.
data = data.rename({'Episode_no': 'Episode', 'Episode': 'Episode_name',
                    'DurationMin': 'Duration', 'date': 'Date', 'genre': 'Genre'}, axis=1)

# Both columns are nullable Int64 at this point; cast them to object first so the
# string placeholders can be stored without a dtype error.
# NOTE(review): 'Unkown' looks like a typo for 'Unknown'; it is kept as is because
# consumers of the exported CSV may match on this exact value.
data = data.astype({'Season_no': object, 'Episode': object})
data = data.fillna(value={'Season_no': 'Other', 'Episode': 'Unkown'})

# Cluster on description

In [18]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Missing descriptions are replaced by empty strings so the pipeline only ever
# receives text; an empty description simply yields an empty token string.
texts = data.Description.fillna('').astype(str).values

# Process all the descriptions; NER and the dependency parser are not needed here.
processed_texts = [text for text in nlp.pipe(texts, disable=["ner", "parser"])]

# Tokenize text: use lemmatized words, without stopwords or punctuation.
tokenized_texts = [[word.lemma_ for word in processed_text
                                if not word.is_stop and not word.is_punct]
                                for processed_text in processed_texts]

# One space-separated lemma string per description, in the same order as `data`.
strings = [' '.join(str(w) for w in tokenized_text) for tokenized_text in tokenized_texts]
data['tokenized_text'] = strings

In [21]:
# Build a compact per-show table: one row per (Show_ID, Title) whose
# 'tokenized_text' is the space-joined text of all of that show's episodes.
show_text_columns = ["Show_ID", "Title", "tokenized_text"]
df_pershow = (
    data.loc[:, show_text_columns]
    .groupby(["Show_ID", "Title"])
    .agg({"tokenized_text": " ".join})
    .reset_index()
)

In [2]:
# Create a TF-IDF matrix with one row per show. Terms that occur in fewer than
# 3 shows or in more than 90% of the shows are ignored; rows are L2-normalised.
vectorizer = TfidfVectorizer(min_df=3, max_df=0.9, norm='l2')
X = vectorizer.fit_transform(df_pershow['tokenized_text'])

# `get_feature_names()` was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method only on versions that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense frame with one column per vocabulary term.
tf_idf = pd.DataFrame(data=X.toarray(), columns=feature_names)

In [23]:
# Create clusters using K-means on the TF-IDF matrix for the similar-shows recommendations.
clusters = 25
# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# (a single run for k-means++) in newer scikit-learn releases, which would
# silently change the resulting clusters depending on the installed version.
kmeanModel = KMeans(n_clusters=clusters, init='k-means++', n_init=10, max_iter=3000, random_state=0)
# Fit the model; `mod` holds the output of fit_transform for the training rows.
mod = kmeanModel.fit_transform(tf_idf)
# Label every show with its cluster and keep only the identifying columns.
df_pershow['k_means'] = kmeanModel.predict(tf_idf)
df_pershow = df_pershow[["Show_ID", "Title", "k_means"]]

# Attach the per-show cluster label to every episode row.
merged_data = pd.merge(data, df_pershow,
                        how="left", on=["Show_ID", "Title"])

# Frame that is written to disk by the export step.
tf_idf_df = pd.DataFrame(tf_idf)

In [27]:
# Persist the results as CSV files without the index column:
# the TF-IDF matrix and the episode table including the cluster labels.
for frame, path in ((tf_idf_df, '../data/tfidf.csv'),
                    (merged_data, '../data/BBC_episodes.csv')):
    frame.to_csv(path, index=False)