In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import cluster

In [3]:
data = pd.read_csv('../data/final_df.csv', sep=';')

# reformat date correctly
data['date'] = pd.to_datetime(data.Date, errors='coerce').dt.date

# substract season and episode number
data['Season_no'] = data['Season'].str.extract('(\d+)')
data['Episode_no'] = data['Episode'].str.extract('(\d+)')

# select top genre per season because there are double entries
show_genre = data.groupby(['Title', 'Genre']).count().Title_all.reset_index()
show_genre = show_genre.sort_values('Title_all').groupby('Title').first().reset_index(
                                    )[['Title', 'Genre']].rename({'Genre':'genre'}, axis=1)
data = data.merge(show_genre, on='Title')

# drop duplicate entries
data = data.drop_duplicates(subset=['Title', 'Season+Episode'], keep='first')

# assign Show_ID, Content_ID and Episode_ID (within a show)
data['Content_ID'] = data.index
season_id = pd.Series(data['Title'].unique()).rename('Title').reset_index().rename({'index':'Show_ID'}, axis=1)
data = data.merge(season_id, on='Title')
data['Episode_ID'] = data.sort_values(['Season_no', 'Episode']).groupby('Show_ID').cumcount() + 1

# select columns and rename
data = data[['Show_ID', 'Title', 'Content_ID', 'Episode_ID','Season_no', 'Episode_no', 'Season+Episode', 
             'Episode', 'genre', 'DurationMin', 'date', 'Year', 'Description', 'Image']]
data = data.rename({'season_no':'season','Episode_no':'Episode', 
             'Episode':'Episode_name', 'DurationMin':'Duration', 'date':'Date', 'genre':'Genre'}, axis=1)


In [4]:
texts = data.Description.values
nlp = spacy.load("en_core_web_sm")

In [5]:
#process all the descriptions

processed_texts = [text for text in nlp.pipe(texts, 
                                              disable=["ner",
                                                       "parser"])]

In [6]:
#Tokenize text, I use lemmatized words, without stopwords or punctuation. 
tokenized_texts = [[word.lemma_ for word in processed_text
                                if not word.is_stop and not word.is_punct]
                                for processed_text in processed_texts]


In [7]:
#
strings = [[' '.join([str(w) for w in tokenized_text])] for tokenized_text in tokenized_texts]

for i in range(len(strings)):
    strings[i] = strings[i][0]
data['tokenized_text'] = strings
data.tokenized_text[1]

'painting unknown man unknown artist turn great'

In [9]:
#make a new smaller dataset, and merge the tokenized text on a per-show basis
df_pershow = data[["Show_ID", "Title", "tokenized_text"]]
df_pershow = df_pershow.groupby(['Show_ID', 'Title'], as_index = False).agg({'tokenized_text': ' '.join})

In [11]:
#create a TF_IDF vector
vectorizer = TfidfVectorizer(min_df=3, max_df=0.9, norm='l2')
X = vectorizer.fit_transform(df_pershow['tokenized_text'])
tf_idf = pd.DataFrame(data = X.toarray(), columns=vectorizer.get_feature_names())
tf_idf

Unnamed: 0,000,10,100,11,12,13,13th,14,15,150,...,zeinab,zero,zimbabwe,zip,zoe,zombie,zone,zoo,zoom,zuu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#create clusters using K-means and TF-IDF for the similar shows recommendations
clusters = 25
kmeanModel = KMeans(n_clusters=clusters, init='k-means++', max_iter=3000, random_state=0)
mod = kmeanModel.fit_transform(tf_idf)
df_pershow['k_means'] = kmeanModel.predict(tf_idf)

In [13]:
order_centroids = kmeanModel.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
dict = []
for i in range(clusters):
    print('%d' % i, sep='', end=', '),
    for ind in order_centroids[i, :20]:
        print(terms[ind], sep='', end=', ')
    print('')

0, story, sea, anne, west, people, tell, east, crew, explore, fire, read, league, pandemic, ruth, wildlife, premier, dougie, mark, barra, obama, 
1, find, life, new, try, face, family, look, take, meet, queen, reveal, late, challenge, home, question, live, comedy, peter, change, man, 
2, political, debate, interview, late, news, mp, guest, elizabeth, reynolds, amelia, tim, 1970, jane, james, williams, peter, leader, activity, plus, wales, 
3, rick, look, london, world, year, british, series, 1989, 1978, travel, louis, britain, adam, monty, black, fashion, interior, documentary, old, follow, 
4, explore, journey, travel, meet, river, bikers, robert, paul, stacey, ireland, visit, reggie, coast, mary, continue, bob, northern, life, group, macdonald, 
5, simon, reeve, travel, journey, mountains, leg, peninsula, aegean, explore, visit, begin, national, follow, cornwall, belonging, coast, glorious, world, island, kenya, 
6, good, zoo, series, itã, bit, radio, day, dom, rhod, katy, boy, drive

In [14]:
df_pershow = df_pershow[["Show_ID", "Title", "k_means"]]

In [17]:
merged_data = pd.merge(data, df_pershow,
                        how="left", on=["Show_ID", "Title"])
merged_data.head(7)

Unnamed: 0,Show_ID,Title,Content_ID,Episode_ID,Season_no,Episode,Season+Episode,Episode_name,Genre,Duration,Date,Year,Description,Image,tokenized_text,k_means
0,0,A Timewatch Guide,0,1,3,2,"Series 3: 2. Women, Sex and Society","2. Women, Sex and Society",history,59,2016-11-15,2016,How the transformation of the rights and role ...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,transformation right role woman document telev...,1
1,1,Britain's Lost Masterpieces,3,1,2,4,Series 2: 4. Arbroath,4. Arbroath,signed,59,2017-10-18,2017,A painting of an unknown man by an unknown art...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,painting unknown man unknown artist turn great,1
2,1,Britain's Lost Masterpieces,4,2,5,1,Series 5: 1. Brighton,1. Brighton,signed,59,2021-02-01,2021,Bendor Grosvenor and Emma Dabiri investigate t...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Bendor Grosvenor Emma Dabiri investigate paint...,1
3,1,Britain's Lost Masterpieces,5,3,5,2,Series 5: 2. Tatton Park,2. Tatton Park,signed,59,2021-02-08,2021,Bendor and Emma discover a mysterious portrait...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Bendor Emma discover mysterious portrait 16th ...,1
4,1,Britain's Lost Masterpieces,6,4,5,3,Series 5: 3. Glasgow,3. Glasgow,signed,59,2022-02-07,2022,Technical problems frustrate Bendor and EmmaÃ...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,technical problem frustrate Bendor EmmaÃÂ¢Ã¢Â...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21684,1481,GAA Live,34785,1,,,Armagh v Kildare,,sports,119,2022-03-12,2022,Live coverage of the Division One GAA football...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,live coverage Division GAA football match Arma...,9
21685,1481,GAA Live,34786,2,,,Down v Kerry,,sports,180,2022-03-19,2022,Live coverage of the Division Two GAA hurling ...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,live coverage Division GAA hurling match Kerry,9
21686,1482,BBC Sport NI,34787,1,,,Ulster Schools' Cup Rugby Final,,sports,120,2022-03-17,2022,Live coverage of the 2022 SchoolsÃÂ¢Ã¢ÂÂ¬Ã¢Â...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,live coverage 2022 SchoolsÃÂ¢Ã¢ÂÂ¬Ã¢ÂÂ¢ Cup...,9
21687,1483,The Football News Show,34788,1,,,22/03/2022,,sports,12,2022-03-22,2022,We focus on fans looking to make a change in f...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,focus fan look change football plus woman Cham...,1


In [21]:
tf_idf_df = pd.DataFrame(tf_idf)
tf_idf_df.to_csv('../data/tfidf.csv', index=False)

In [22]:
# store processed data
merged_data.to_csv('../data/BBC_episodes.csv', index=False) 