In [2]:
import glob
from bs4 import BeautifulSoup
import os
import pandas as pd
import spacy
import numpy as np


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import cluster

import os

In [3]:
# Read the semicolon-separated BBC programme catalogue and show it in the notebook.
bbc_csv_path = '../data/BBC_Dataset.csv'
df_bbc = pd.read_csv(bbc_csv_path, sep=";")
display(df_bbc)

Unnamed: 0,Title,Image,Description,Genre,Duration,Date
0,Roy Lichtenstein: Pop Idol,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Roy Lichtenstein revolutionised art with his c...,arts,29 mins,25 Feb 2004
1,Civilisation - 13. Heroic Materialism,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Kenneth Clark considers heroic materialism and...,arts,51 mins,8:15pm 18 May 1969
2,Mark Lawson Talks To... - Barry Cryer,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Comedy writer and raconteur Barry Cryer in con...,arts,59 mins,10:50pm 2 Apr 2008
3,The Birth of British Music - 1. Purcell - The ...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,"The life and music of Henry Purcell, from tave...",arts,59 mins,11 May 2009
4,Fleetwood Mac: Don't Stop,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,The story of one of the biggest-selling bands ...,arts,59 mins,10:20pm 1 Nov 2009
...,...,...,...,...,...,...
3798,Bellator MMA - 2022: Bellator 274: Main Event ...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Full coverage of Bellator 274 at the Mohegan S...,sports,333 mins,19 Feb 2022
3799,Scottish Women's Premier League 2021-22 - Celt...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Live coverage as Fran Alonso's Celtic take on ...,sports,135 mins,20 Feb 2022
3800,Athletics - 2022: Birmingham Indoor Grand Prix,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Live coverage from Birmingham Indoor Grand Prix.,sports,189 mins,19 Feb 2022
3801,Match of the Day Wales - 2021/22: Pinatar Cup ...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,Wales face Republic of Ireland in the third-pl...,sports,121 mins,22 Feb 2022


In [4]:
# Keep one row per distinct Title (pandas keeps the first occurrence by default),
# renumber the surviving rows from 0, and copy that row number into an ID column.
df_bbc = (
    df_bbc
    .drop_duplicates(subset=['Title'])
    .reset_index(drop=True)
)
df_bbc['ID'] = df_bbc.index

In [5]:
# spaCy's small English pipeline, used below to process the descriptions.
nlp = spacy.load("en_core_web_sm")
# Values of the Description column as an array, in DataFrame row order.
texts = df_bbc['Description'].values

In [6]:
# Run every description through the spaCy pipeline and keep the resulting Doc
# objects; the "ner" and "parser" components are disabled for this pass.
skipped_components = ["ner", "parser"]
processed_texts = list(nlp.pipe(texts, disable=skipped_components))

In [7]:

def _content_lemmas(doc):
    """Return the lemmas of the tokens in `doc` that are neither stop words nor punctuation."""
    lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return lemmas


# One list of lemmas per processed description, in the same order.
tokenized_texts = [_content_lemmas(doc) for doc in processed_texts]


In [8]:
# Re-join each document's lemmas into a single space-separated string, giving
# one plain string per document (the input format TfidfVectorizer expects).
# Built directly as a flat list of strings; no intermediate one-element lists
# and no second unwrapping pass are needed.
strings = [
    ' '.join(str(w) for w in tokenized_text)
    for tokenized_text in tokenized_texts
]

In [9]:
# Attach the cleaned, lemmatised description strings as a new 'text' column.
df_bbc = df_bbc.assign(text=strings)

In [10]:
# Build an L2-normalised TF-IDF matrix from the cleaned descriptions.
# min_df=2 drops terms that appear in fewer than two documents;
# max_df=0.9 drops terms that appear in more than 90% of documents.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.9, norm='l2')
X = vectorizer.fit_transform(df_bbc['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense view of the matrix: one row per programme, one column per vocabulary term.
tf_idf = pd.DataFrame(data=X.toarray(), columns=feature_names)
tf_idf

Unnamed: 0,000,10,100,11,12,13,145,15,150,16,...,young,yucatan,yung,zand,zara,zealand,zeinab,zombie,zone,zoo
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.420254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2751,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2752,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2753,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2754,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Create clusters using K-means on the TF-IDF matrix for the similar-shows
# recommendations.
clusters = 15
# n_init is pinned to 10 (the historical scikit-learn default). Newer releases
# default to a single k-means++ initialisation, which would silently change the
# cluster assignments for the same random_state.
kmeanModel = KMeans(n_clusters=clusters, init='k-means++', n_init=10,
                    max_iter=3000, random_state=0)
# Fit the model; `mod` holds each programme's distance to every cluster centre.
mod = kmeanModel.fit_transform(tf_idf)
# Store the assigned cluster index for every programme.
df_bbc['k_means'] = kmeanModel.predict(tf_idf)

In [12]:
# For every cluster, column indices of the vocabulary sorted by centroid
# weight, heaviest first.
order_centroids = kmeanModel.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Print one line per cluster: the cluster number followed by its 20
# highest-weighted terms, each followed by ", " (including the last one).
for i in range(clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :20]]
    print(', '.join([str(i)] + top_terms) + ', ')

0, explore, life, face, world, man, film, music, travel, bbc, follow, try, final, join, documentary, change, young, work, star, big, get, 
1, family, discover, convince, young, food, great, celebrate, hold, secret, scientist, gran, pup, way, house, follow, cost, join, west, future, different, 
2, look, year, life, series, moment, good, era, steam, documentary, highlight, episode, world, past, break, 2019, take, india, railway, find, scottish, 
3, find, time, way, try, people, race, maddie, learn, work, life, andy, run, man, join, nadiya, take, explore, period, vital, clever, 
4, news, late, weather, sport, political, debate, interview, england, bbc, mp, east, west, feature, international, national, north, south, midlands, yorkshire, wales, 
5, christmas, celebrate, make, dance, tree, little, special, recipe, time, great, music, decision, team, festive, big, film, year, sing, true, delicious, 
6, live, coverage, bbc, birmingham, watch, match, 2022, host, present, night, performance, rad

In [13]:
# Write the enriched frame (ID, text and k_means columns included) to disk.
# The DataFrame index is written as well, since to_csv does so by default.
processed_csv_path = "../data/BBC_proccessed.csv"
df_bbc.to_csv(processed_csv_path)