# Import Packages and Create the Spark Session

In [None]:
import re as re

import databricks.koalas as ks
import gensim, operator
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from scipy import spatial

from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

In [None]:
# Obtain the notebook's Spark session: master "local[*]", 24g of driver memory.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SpotifyPodcastClassification")
    .config('spark.driver.memory', '24g')
    .getOrCreate()
)

# 1. Read full data
## 1.1 Podcasts data

In [None]:
# Read the podcast CSV (comma-delimited, first row is the header, column types inferred).
podcasts_csv_reader = spark.read.options(inferSchema='True', delimiter=',', header='True')
podcasts_df = podcasts_csv_reader.csv("../script_output/02_final_dat.csv")
# Row count is the cell's displayed result.
podcasts_df.count()

105153

In [None]:
# Only the episode identifier and the show description are used for scoring below.
podcast_text_columns = ['episode_uri', 'show_description']
podcasts_df = podcasts_df.select(*podcast_text_columns)
podcasts_df.show(5)

+--------------------+--------------------+
|         episode_uri|    show_description|
+--------------------+--------------------+
|spotify:episode:0...|A 20-something bl...|
|spotify:episode:0...|Ever wonder what ...|
|spotify:episode:0...|Inside the 18 is ...|
|spotify:episode:0...|Your favorite pod...|
|spotify:episode:0...|The comedy podcas...|
+--------------------+--------------------+
only showing top 5 rows


## 1.2 Manually generated topics

In [None]:
# One column per manually defined topic; each cell holds one keyword or phrase.
clusters_df = (
    spark.read
    .options(inferSchema='True', delimiter=',', header='True')
    .csv('../script_output/topics_manually_created.csv')
)
# Topics have different numbers of keywords, so pad the missing cells with ''
# to keep every column joinable as strings later on.
clusters_kdf = clusters_df.to_koalas().fillna('')
clusters_kdf.head(25)

Comedy	Health and fitness	News	Politics	Pop culture	Religion	Sports	True crime
0	comedy	health	news	politics	pop culture	religion	sports	true crime
1	absurd	fitness	technical	arguments	pop	satanism	NFL	cold cases
2	funny	mental health	sports news	juicy	campus	bible	football	mystery
3	funnier	weight	weekly news	political	adulting	christianity	soccer	crime scene
4	effing	healthy	social news	journalist	hip-hop	religions	premier league	murder
5	fuckbois	healthy weight	market	investigative	film	faith	la liga	unsolved murders
6	Idiots	ketogenic	stock		tv shows	spirituality	champions league	drama
7	ranting	diet	stock market		music	god	liga	victims
8	improvisers	body type			marvel	conciousness	league	victim
9	entertain	intermittent fasting			disney	third eye	boxing	
10	satire	weight loss			movie		fighters	
11	funniest	body health			controversial			
12		skincare			life			
13		makeup			stories			
14		beauty			DC			
15		treatment			movies			
16		self-help						
17		selfcare						
18		meditation						
19		mental health						
20		body image						
21		nutrition						
22		astrology						
23		self-development						
24		healing	

# 2. Functions

In [None]:
def cossim(v1, v2):
    """Return a damped cosine-style similarity between two vectors.

    Computes ``dot(v1, v2) / ||v1|| / (||v2|| + 0.1)``.

    NOTE(review): the 0.1 is added to the norm of ``v2`` only, so the result is
    not symmetric and ``cossim(v, v)`` is below 1. Presumably this guards
    against a zero ``v2`` -- confirm the intent. A zero ``v1`` is not guarded.
    """
    norm_v1 = np.sqrt(np.dot(v1, v1))
    damped_norm_v2 = np.sqrt(np.dot(v2, v2)) + .1
    return np.dot(v1, v2) / norm_v1 / damped_norm_v2

def vec_similarity(input1, input2, vectors):
    """Cosine similarity between the summed word vectors of two texts.

    Each text is split on single spaces; the vectors of the tokens found in
    ``vectors`` are summed, and the two sums are compared with cosine
    similarity.

    Parameters
    ----------
    input1, input2 : str
        Texts to compare.
    vectors : mapping from token to vector
        Looked up with ``vectors[token]``; a ``KeyError`` marks an unknown
        token. If the object exposes ``vector_size`` that dimension is used,
        otherwise 300 is assumed.

    Returns
    -------
    float or int
        ``1 - cosine_distance`` of the two summed vectors, or ``0`` when
        either text contributes no usable vector (the similarity is undefined
        in that case).
    """
    dimension = getattr(vectors, 'vector_size', 300)

    summed_vectors = []
    for text in (input1, input2):
        total = np.zeros(dimension)
        for token in text.split(' '):
            try:
                total += vectors[token]
            except KeyError:
                # Out-of-vocabulary token: contributes nothing to the sum.
                continue
        summed_vectors.append(total)

    # An all-zero sum has no direction; cosine would be 0/0, so report 0 instead.
    if not summed_vectors[0].any() or not summed_vectors[1].any():
        return 0

    result = 1 - spatial.distance.cosine(summed_vectors[0], summed_vectors[1])
    # NaN never equals anything (certainly not the string 'nan'), so it must be
    # detected with isnan.
    if np.isnan(result):
        return 0

    return result

def vocab_check(vectors, words):
    """Filter ``words`` down to those present in the model vocabulary.

    A word is kept when it is a key of ``vectors.key_to_index``; kept words are
    returned stripped of surrounding whitespace, in their original order and
    with duplicates preserved.
    """
    return [candidate.strip() for candidate in words if candidate in vectors.key_to_index]

# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    """Similarity between two texts using a word-vector model.

    Both texts are split on whitespace, reduced to the distinct words known to
    the model (see ``vocab_check``), and compared with
    ``vectors.n_similarity``.

    Parameters
    ----------
    input1, input2 : str
        Texts to compare. A non-string value (e.g. a missing description that
        arrives as None/NaN) is treated as empty text.
    vectors : word-vector model
        Must expose ``key_to_index`` and ``n_similarity``.

    Returns
    -------
    float or int
        The model's similarity score, or ``0`` when either text has no word in
        the vocabulary or the model cannot score the pair.
    """
    tokens1 = input1.split() if isinstance(input1, str) else []
    tokens2 = input2.split() if isinstance(input2, str) else []

    # De-duplicate, then sort so the model always sees the words in the same
    # order (set iteration order varies between interpreter runs).
    s1words = sorted(set(vocab_check(vectors, tokens1)))
    s2words = sorted(set(vocab_check(vectors, tokens2)))

    # The mean vector of an empty word list is undefined; use the neutral score.
    if not s1words or not s2words:
        return 0

    try:
        return vectors.n_similarity(s1words, s2words)
    except (KeyError, ValueError, ZeroDivisionError):
        # Best-effort scoring: a pair the model cannot score gets 0, but
        # unrelated failures (interrupts, programming errors) still surface.
        return 0

def load_wordvec_model(modelName, modelFile, flagBin):
    """Load word vectors stored in word2vec format and report progress.

    Parameters
    ----------
    modelName : str
        Label used only in the progress messages.
    modelFile : str
        File name, appended verbatim to the module-level ``model_path``.
    flagBin : bool
        Forwarded as ``binary=`` to ``KeyedVectors.load_word2vec_format``.

    Returns
    -------
    The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    label = modelName + ' model...'
    print('Loading ' + label)
    # model_path is a global that must be defined before this function is called.
    vectors_file = model_path + modelFile
    keyed_vectors = KeyedVectors.load_word2vec_format(vectors_file, binary=flagBin)
    print('Finished loading ' + label)
    return keyed_vectors

# 3. Word2Vec with Gensim

In [None]:
# Prefix that load_wordvec_model prepends verbatim to the file name;
# '' means the file is resolved relative to the working directory.
model_path = ''
model_word2vec = load_wordvec_model(
    modelName='Word2Vec',
    modelFile='GoogleNews-vectors-negative300.bin.gz',
    flagBin=True,
)

Loading Word2Vec model...
Finished loading Word2Vec model...

In [None]:
# Collect the Spark frame into pandas, keeping the two text columns in a fixed order.
podcasts_pdf = podcasts_df.toPandas()[['episode_uri', 'show_description']]
podcasts_pdf.head()

	episode_uri	show_description
0	spotify:episode:000A9sRBYdVh66csG2qEdj	A 20-something blunt female takes on the world...
1	spotify:episode:000HP8n3hNIfglT2wSI2cA	Ever wonder what murder took place on today in...
2	spotify:episode:001UfOruzkA3Bn1SPjcdfa	Inside the 18 is your source for all things Go...
3	spotify:episode:001i89SvIQgDuuyC53hfBm	Your favorite podcast for everything @Chiefs! ...
4	spotify:episode:0025RWNwe2lnp6HcnfzwzG	The comedy podcast about toxic characters, wri...


In [None]:
# Score every show description against each topic's keyword list.
# The column list is computed once instead of on every iteration.
topic_names = clusters_kdf.columns.tolist()

for position, topic in enumerate(topic_names, start=1):
    # Build search query: all of the topic's keywords joined into one string
    # (padding cells are '' and vanish when the query is split on whitespace).
    search_query = ' '.join(clusters_kdf[topic].tolist())

    # Apply over the description column directly; a row-wise DataFrame.apply
    # would build a full Series per row just to read this one field.
    podcasts_pdf[topic] = podcasts_pdf['show_description'].apply(
        lambda description: calc_similarity(search_query, description, model_word2vec)
    )

    # Status check
    print(f"Completed loop {position}/{len(topic_names)}...")
podcasts_pdf.head()

21/12/13 11:08:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
Completed loop 1/8...
Completed loop 2/8...
Completed loop 3/8...
Completed loop 4/8...
Completed loop 5/8...
Completed loop 6/8...
Completed loop 7/8...
Completed loop 8/8...

	episode_uri	show_description	Comedy	Health and fitness	News	Politics	Pop culture	Religion	Sports	True crime
0	spotify:episode:000A9sRBYdVh66csG2qEdj	A 20-something blunt female takes on the world...	0.351232	0.391297	0.343048	0.317570	0.465362	0.280541	0.252791	0.272252
1	spotify:episode:000HP8n3hNIfglT2wSI2cA	Ever wonder what murder took place on today in...	0.400637	0.439645	0.431746	0.375465	0.550336	0.425099	0.353148	0.503461
2	spotify:episode:001UfOruzkA3Bn1SPjcdfa	Inside the 18 is your source for all things Go...	0.348215	0.445070	0.429135	0.337475	0.430267	0.288212	0.402266	0.282152
3	spotify:episode:001i89SvIQgDuuyC53hfBm	Your favorite podcast for everything @Chiefs! ...	0.351075	0.390580	0.510007	0.297533	0.481944	0.290662	0.261531	0.226690
4	spotify:episode:0025RWNwe2lnp6HcnfzwzG	The comedy podcast about toxic characters, wri...	0.489705	0.399399	0.401080	0.397278	0.505520	0.395204	0.307015	0.329007


In [None]:
# Number of best-matching topics recorded for each episode.
TOP_N_TOPICS = 3

# Select the score columns by name rather than by position: a positional slice
# would also pick up the 'cluster' column once it exists, so re-running this
# cell would fail.
topic_columns = clusters_kdf.columns.tolist()
podcasts_pdf['cluster'] = podcasts_pdf[topic_columns].apply(
    lambda scores: scores.nlargest(TOP_N_TOPICS).index.tolist(), axis=1
)
podcasts_pdf.head()

# 4. Join the Full Metadata for Manual Evaluation

In [None]:
# Metadata needed for manual evaluation. Only these columns are parsed, so any
# other column in the file is never loaded into memory.
metadata_columns = ['show_uri', 'show_name', 'publisher', 'episode_uri',
                    'episode_name', 'episode_description', 'transcript']
full_data = pd.read_csv(
    '../script_output/episode_transcript_data_w_metadata.csv',
    usecols=metadata_columns,
)[metadata_columns]  # usecols keeps file order; re-index to the order listed above
full_data.head()

In [None]:
# Attach the episode metadata to every scored episode (left join: scored rows are never dropped).
final_result = podcasts_pdf.merge(full_data, on='episode_uri', how='left')

descriptive_columns = ['show_name', 'episode_uri', 'show_description', 'episode_description', 'cluster']
topic_score_columns = ['True crime', 'Comedy', 'Health and fitness', 'News',
                       'Politics', 'Pop culture', 'Religion', 'Sports']
final_result = final_result[descriptive_columns + topic_score_columns]

# Small sample for manual annotation.
# NOTE(review): .loc slices by label and includes the end label, so this selects
# index labels up to and including 50 (51 rows on a default index) -- confirm that is intended.
annotation_columns = ['episode_uri', 'show_description', 'cluster']
manual_annotate_podcasts = final_result.loc[:50, annotation_columns]
manual_annotate_podcasts.to_csv('../script_output/03b_manual_word2vec_manually_annotate.csv', index=False)
final_result.head(2)

In [None]:
# Write the full scored table to disk.
# NOTE(review): the row index is written as an extra unnamed first column here
# (pandas' default, made explicit), unlike the annotation sample export which
# passes index=False -- confirm whether downstream readers rely on it.
final_result_path = '../script_output/03b_manual_word2vec_final_result.csv'
final_result.to_csv(final_result_path, index=True)