# Data Cleaning and Initial Models

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## EDA & Prepping Data for Modeling

In [3]:
# read into data
pod = pd.read_csv('podcast_data.csv')

In [4]:
pod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214088 entries, 0 to 214087
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   uuid         214087 non-null  object 
 1   title        214086 non-null  object 
 2   image        96117 non-null   object 
 3   description  96117 non-null   object 
 4   language     96111 non-null   object 
 5   categories   96111 non-null   object 
 6   website      96105 non-null   object 
 7   author       96105 non-null   object 
 8   itunes_id    96100 non-null   float64
dtypes: float64(1), object(8)
memory usage: 14.7+ MB


In [5]:
pod.head()

Unnamed: 0,uuid,title,image,description,language,categories,website,author,itunes_id
0,8d62d3880db2425b890b986e58aca393,"Ecommerce Conversations, by Practical Ecommerce",http://is4.mzstatic.com/image/thumb/Music6/v4/...,Listen in as the Practical Ecommerce editorial...,English,Technology,http://www.practicalecommerce.com,Practical Ecommerce,874457400.0
1,cbbefd691915468c90f87ab2f00473f9,Eat Sleep Code Podcast,http://is4.mzstatic.com/image/thumb/Music71/v4...,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,http://developer.telerik.com/,Telerik,1015556000.0
2,73626ad1edb74dbb8112cd159bda86cf,SoundtrackAlley,http://is5.mzstatic.com/image/thumb/Music71/v4...,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,https://soundtrackalley.podbean.com,Randy Andrews,1158189000.0
3,0f50631ebad24cedb2fee80950f37a1a,The Tech M&A Podcast,http://is1.mzstatic.com/image/thumb/Music71/v4...,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,http://www.corumgroup.com,Timothy Goddard,538160000.0
4,69580e7b419045839ca07af06cf0d653,"The Tech Informist - For fans of Apple, Google...",http://is4.mzstatic.com/image/thumb/Music62/v4...,The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,http://techinformist.com,The Tech Informist,916080500.0


In [6]:
# drop unncessary columns
pod.drop(columns=['uuid','image','website','itunes_id'],inplace=True)

In [7]:
pod.head(10)

Unnamed: 0,title,description,language,categories,author
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,English,Technology,Practical Ecommerce
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,Telerik
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,Randy Andrews
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,Timothy Goddard
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,The Tech Informist
5,TechNative Podcasts,Interviews and commentary with the leaders in ...,English,Tech News | Business News | Business | Technology,TechNative
6,Sermons from Living Water Dayton,Living Water Dayton's weekly sermons that are...,English,Podcasting | Technology,Living Water
7,"Social Media, Mobility, Analytics, Cloud: Tech...",SMACtalk hosted by Brian Fanzo and Daniel Newm...,English,Management & Marketing | Business | Technology...,Brian Fanzo & Daniel Newman
8,Radio Leo (Video LO),"A compendium of netcasts from the Chief TWiT, ...",English,Tech News | Technology | News & Politics | Gad...,Leo Laporte
9,Digital India,"Suresh Babu, founder and head of the Web Marke...",English,Podcasting | Management & Marketing | Tech New...,Digital India


In [8]:
# create new dataframe where descriptions are not null
new_pod = pod[pod['description'].notnull()]

In [9]:
new_pod

Unnamed: 0,title,description,language,categories,author
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,English,Technology,Practical Ecommerce
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,Telerik
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,Randy Andrews
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,Timothy Goddard
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,The Tech Informist
...,...,...,...,...,...
214083,BAFTA Guru,BAFTA is a world-leading independent arts char...,English,TV & Film,BAFTA
214084,Issa No Podcast,Podcast by Justus Ta'rique & R. Alexander,English,Comedy,Justus Ta'rique & R. Alexander
214085,Bent Notes,JOY's jazz show playing everything from ragtim...,English,Music,JOY 94.9
214086,The Junction,The best mix old of school and new school hip ...,English,Arts | TV & Film | Music,DJ Rome


In [10]:
# create a column for description length
lengths = []
for i in new_pod['description']:
    lengths.append(len(i.split()))
    
new_pod['desc_length'] = lengths

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pod['desc_length'] = lengths


In [11]:
# create a column with list of categories
cat_list = []
for cat in new_pod['categories']:
    cat_list.append(str(cat).split(' | '))
new_pod['cat_list'] = cat_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pod['cat_list'] = cat_list


In [12]:
new_pod

Unnamed: 0,title,description,language,categories,author,desc_length,cat_list
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,English,Technology,Practical Ecommerce,15,[Technology]
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,Telerik,59,"[Tech News, Technology]"
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,Randy Andrews,11,"[Podcasting, Technology]"
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,Timothy Goddard,59,"[Business News, Technology, Tech News, Business]"
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,The Tech Informist,17,"[Gadgets, Tech News, Technology]"
...,...,...,...,...,...,...,...
214083,BAFTA Guru,BAFTA is a world-leading independent arts char...,English,TV & Film,BAFTA,34,[TV & Film]
214084,Issa No Podcast,Podcast by Justus Ta'rique & R. Alexander,English,Comedy,Justus Ta'rique & R. Alexander,7,[Comedy]
214085,Bent Notes,JOY's jazz show playing everything from ragtim...,English,Music,JOY 94.9,9,[Music]
214086,The Junction,The best mix old of school and new school hip ...,English,Arts | TV & Film | Music,DJ Rome,47,"[Arts, TV & Film, Music]"


In [13]:
# create new df for modeling with descriptions longer than 30 char and language equal to English
df = new_pod[(new_pod['desc_length'] > 10) & (new_pod['language'] == 'English')]

In [14]:
df

Unnamed: 0,title,description,language,categories,author,desc_length,cat_list
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,English,Technology,Practical Ecommerce,15,[Technology]
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,Telerik,59,"[Tech News, Technology]"
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,Randy Andrews,11,"[Podcasting, Technology]"
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,Timothy Goddard,59,"[Business News, Technology, Tech News, Business]"
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,The Tech Informist,17,"[Gadgets, Tech News, Technology]"
...,...,...,...,...,...,...,...
214080,Eileen and Steph: Everything's Fine,"In a world where everything is falling apart, ...",English,Comedy,Eileen and Steph,18,[Comedy]
214082,That's How We Roll,Silly banter from some dumb 20 odd year olds p...,English,Games & Hobbies | Other Games,That's How We Roll Podcast,17,"[Games & Hobbies, Other Games]"
214083,BAFTA Guru,BAFTA is a world-leading independent arts char...,English,TV & Film,BAFTA,34,[TV & Film]
214086,The Junction,The best mix old of school and new school hip ...,English,Arts | TV & Film | Music,DJ Rome,47,"[Arts, TV & Film, Music]"


In [15]:
# double check that only English podcasts remains
df['language'].value_counts()

English    74632
Name: language, dtype: int64

In [17]:
# reset index
df.reset_index(inplace=True)

# drop old index column
df.drop(columns='index', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [18]:
df

Unnamed: 0,title,description,language,categories,author,desc_length,cat_list
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,English,Technology,Practical Ecommerce,15,[Technology]
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,Telerik,59,"[Tech News, Technology]"
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,Randy Andrews,11,"[Podcasting, Technology]"
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,Timothy Goddard,59,"[Business News, Technology, Tech News, Business]"
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,The Tech Informist,17,"[Gadgets, Tech News, Technology]"
...,...,...,...,...,...,...,...
74627,Eileen and Steph: Everything's Fine,"In a world where everything is falling apart, ...",English,Comedy,Eileen and Steph,18,[Comedy]
74628,That's How We Roll,Silly banter from some dumb 20 odd year olds p...,English,Games & Hobbies | Other Games,That's How We Roll Podcast,17,"[Games & Hobbies, Other Games]"
74629,BAFTA Guru,BAFTA is a world-leading independent arts char...,English,TV & Film,BAFTA,34,[TV & Film]
74630,The Junction,The best mix old of school and new school hip ...,English,Arts | TV & Film | Music,DJ Rome,47,"[Arts, TV & Film, Music]"


## Clean category and description

In [None]:
# pull out the first category from the category list as a filtering option
prim_cat = []
for cat in df['cat_list']:
    prim_cat.append(cat[0])
    
df['primary_cat'] = prim_cat

In [20]:
# from Samantha Chu to clean text by removing special characters
def remove_special_chars(text):
    regex_tokenizer = RegexpTokenizer("\w+[']*")
    words = regex_tokenizer.tokenize(text.lower())
    return (' ').join(words)

In [21]:
# create a clean description column
df['clean_desc'] = df['description'].map(remove_special_chars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_desc'] = df['description'].map(remove_special_chars)


In [22]:
df

Unnamed: 0,title,description,language,categories,author,desc_length,cat_list,primary_cat,clean_desc
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,English,Technology,Practical Ecommerce,15,[Technology],Technology,listen in as the practical ecommerce editorial...
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,Telerik,59,"[Tech News, Technology]",Tech News,on the show we ll be talking to passionate peo...
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,Randy Andrews,11,"[Podcasting, Technology]",Podcasting,a podcast about soundtracks and movies from my...
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,Timothy Goddard,59,"[Business News, Technology, Tech News, Business]",Business News,the tech m a podcast pulls from the best of th...
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,The Tech Informist,17,"[Gadgets, Tech News, Technology]",Gadgets,the tech news show with two guys shooting the ...
...,...,...,...,...,...,...,...,...,...
74627,Eileen and Steph: Everything's Fine,"In a world where everything is falling apart, ...",English,Comedy,Eileen and Steph,18,[Comedy],Comedy,in a world where everything is falling apart o...
74628,That's How We Roll,Silly banter from some dumb 20 odd year olds p...,English,Games & Hobbies | Other Games,That's How We Roll Podcast,17,"[Games & Hobbies, Other Games]",Games & Hobbies,silly banter from some dumb 20 odd year olds p...
74629,BAFTA Guru,BAFTA is a world-leading independent arts char...,English,TV & Film,BAFTA,34,[TV & Film],TV & Film,bafta is a world leading independent arts char...
74630,The Junction,The best mix old of school and new school hip ...,English,Arts | TV & Film | Music,DJ Rome,47,"[Arts, TV & Film, Music]",Arts,the best mix old of school and new school hip ...


## Combine text data

In [23]:
# create new column combining title, author, clean_desc, and clean_cat
df['all_text'] = df['title'] + " " + df['author'] + " " + df['clean_desc']
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_text'] = df['title'] + " " + df['author'] + " " + df['clean_desc']


Unnamed: 0,title,description,language,categories,author,desc_length,cat_list,primary_cat,clean_desc,all_text
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,English,Technology,Practical Ecommerce,15,[Technology],Technology,listen in as the practical ecommerce editorial...,"Ecommerce Conversations, by Practical Ecommerc..."
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,Telerik,59,"[Tech News, Technology]",Tech News,on the show we ll be talking to passionate peo...,Eat Sleep Code Podcast Telerik on the show we ...
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,Randy Andrews,11,"[Podcasting, Technology]",Podcasting,a podcast about soundtracks and movies from my...,SoundtrackAlley Randy Andrews a podcast about ...
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,Timothy Goddard,59,"[Business News, Technology, Tech News, Business]",Business News,the tech m a podcast pulls from the best of th...,The Tech M&A Podcast Timothy Goddard the tech ...
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,The Tech Informist,17,"[Gadgets, Tech News, Technology]",Gadgets,the tech news show with two guys shooting the ...,"The Tech Informist - For fans of Apple, Google..."
...,...,...,...,...,...,...,...,...,...,...
74627,Eileen and Steph: Everything's Fine,"In a world where everything is falling apart, ...",English,Comedy,Eileen and Steph,18,[Comedy],Comedy,in a world where everything is falling apart o...,Eileen and Steph: Everything's Fine Eileen and...
74628,That's How We Roll,Silly banter from some dumb 20 odd year olds p...,English,Games & Hobbies | Other Games,That's How We Roll Podcast,17,"[Games & Hobbies, Other Games]",Games & Hobbies,silly banter from some dumb 20 odd year olds p...,That's How We Roll That's How We Roll Podcast ...
74629,BAFTA Guru,BAFTA is a world-leading independent arts char...,English,TV & Film,BAFTA,34,[TV & Film],TV & Film,bafta is a world leading independent arts char...,BAFTA Guru BAFTA bafta is a world leading inde...
74630,The Junction,The best mix old of school and new school hip ...,English,Arts | TV & Film | Music,DJ Rome,47,"[Arts, TV & Film, Music]",Arts,the best mix old of school and new school hip ...,The Junction DJ Rome the best mix old of schoo...


## Tfidf model with description only

In [25]:
# from Aaron Hume to create add custom words to stop_words list
stop_words = list(TfidfVectorizer(stop_words='english').get_stop_words()) + ['Podcast','podcast','Podcasts','podcasts']

In [35]:
# initializing Tfidf and fit transform on df
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 3), stop_words = stop_words)
tfdf = tf.fit_transform(df['clean_desc'].iloc[:45_000])
tfdf

<45000x1688151 sparse matrix of type '<class 'numpy.float64'>'
	with 2916819 stored elements in Compressed Sparse Row format>

In [36]:
# find similarity score between the descriptions
similarity = linear_kernel(tfdf, tfdf)
similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00374187],
       [0.        , 1.        , 0.        , ..., 0.        , 0.00282636,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00282636, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.00374187, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [37]:
# https://www.kaggle.com/switkowski/building-a-podcast-recommendation-engine

x = df[df['title'] == 'SoundtrackAlley'].index[0]
similar_idx = similarity[x].argsort(axis = 0)[-6:-1]
for i in similar_idx:
    print(similarity[x][i], '-', df['title'][i], '-', df['description'][i], '\n')
print('Original - ' + df['description'][x])

0.0878103011879654 - Myopia: Defend Your Childhood - A Nostalgic Movies Podcast - We watch the movies you grew up with. Do they hold up? Can you defend your childhood? 

0.101255314802917 - Ruin My Childhood - Mike and Kat watch cherished childhood TV shows and movies. They find out if watching these cherished titles ruin their childhood. Are some things better left in the past? Find out on Ruin My Childhood. 

0.10532749547607716 - Upon Closer Inspection: Revisiting Movies from our Childhood - Was My Girl as sad as you remember? Was The Witches as terrifying? Was the Babysitters Club on of the worst movies ever made? On Upon Closer Inspection, hosts Allison, Graham, and guests, take a hilarious look at the movies of their childhood and find out if they stand the test of time, or if they’re just a product of their time. 

0.11597686223309743 - The Test of Time - Are the classic movies of our childhood really as great as we remember them? Hosts Alan Noah and James Brief take a fresh loo

## Tfidf model with all text

In [None]:
# from Aaron Hume to create add custom words to stop_words list
stop_words = list(TfidfVectorizer(stop_words='english').get_stop_words()) + ['Podcast','podcast','Podcasts','podcasts']

In [26]:
# initializing Tfidf and fit transform on df
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (2, 3), stop_words = stop_words)
tfdf_all = tf.fit_transform(df['all_text'].iloc[:45_000])
tfdf_all

<45000x1919753 sparse matrix of type '<class 'numpy.float64'>'
	with 2348011 stored elements in Compressed Sparse Row format>

In [27]:
# find similarity score between the descriptions
similarity_all = linear_kernel(tfdf_all, tfdf_all)
similarity_all

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [28]:
# https://www.kaggle.com/switkowski/building-a-podcast-recommendation-engine

x = df[df['title'] == 'SoundtrackAlley'].index[0]
similar_idx = similarity_all[x].argsort(axis = 0)[-6:-1]
for i in similar_idx:
    print(similarity_all[x][i], '-', df['title'][i], '-', df['all_text'][i], '\n')
print('Original - ' + df['all_text'][x])

0.0 - Unlock Your Potential - Unlock Your Potential Laura Gallaher dr laura gallaher was hired by nasa kennedy space center working to change the culture following the 2003 columbia shuttle disaster after 8 years with nasa leaders at all levels laura was hired by the walt disney company to help them radically change their performance management approach which was focused on developing leaders as coaches for their employees in 2013 laura founded key talent solutions and helped her first client an orlando tech company go from lay offs to a 36m acquisition in under 18 months laura helps executive teams be more aware more aligned and more accountable by helping them have conversations they didn t even know they needed to have 

0.0 - GraceUMChurch: Sermon Podcast - GraceUMChurch: Sermon Podcast Grace United Methodist Church each sunday grace united methodist church in aberdeen md records the scripture readings and sermon from worship listen here to strengthen your faith grow your theology 

## Save data for modeling

In [24]:
df.to_csv('model_data.csv')