In [1]:
# Imports
import unicodedata, collections, bs4, gensim, nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(7)
import pandas as pd
from bs4 import BeautifulSoup
nltk.download('wordnet');

unable to import 'smart_open.gcs', disabling that module
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Custom functions

In [2]:
# Functions we will need for later

# Function to clean articles of html tags crawled from website
def recursive_check(entity, list_to_append_to):
    """Collect NFKD-normalised text from a parsed HTML node.

    Args:
        entity: A node produced by BeautifulSoup. A plain ``bs4.element.Tag``
            contributes its ``.text``; any other object exposing ``children``
            is walked recursively; everything else is ignored.
        list_to_append_to: List that receives the extracted strings. It is
            mutated in place.

    Returns:
        None.
    """
    # Exact type match (not isinstance) is kept on purpose: subclasses of Tag
    # are treated as containers and walked instead of being read as a whole.
    if type(entity) is bs4.element.Tag:
        if entity.text != '':
            list_to_append_to.append(unicodedata.normalize("NFKD", entity.text))
        # Do not descend into a Tag: its .text was just taken for the whole
        # element, so visiting its children would append the same text again.
        return

    # The previous implementation tested `collections.Iterable` (removed in
    # Python 3.10) with an inverted condition and recursed with the wrong
    # arguments, so this branch could never run successfully.
    for child in getattr(entity, 'children', ()):
        recursive_check(child, list_to_append_to)

# Shared English Snowball stemmer used by lemmatize_stemming.
stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    """Lemmatise `text` as a verb with WordNet, then stem the lemma with Snowball."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

# Tokenize and lemmatize
def preprocess(text):
    """Tokenise `text` and return the lemmatised, stemmed tokens worth keeping.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is dropped
    when it is a gensim stopword or is three characters long or shorter.

    Returns:
        list: The surviving tokens, each passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

def show_prediction_for_article(row_num=0):
    """Print the topic scores for one article, then its title, teaser and text.

    Reads the notebook-level names `df_clean`, `dictionary`, `lda_model` and
    `preprocess`, so those cells must have been run first.

    Args:
        row_num: Index label of the article in `df_clean` (looked up via `.loc`).
    """
    record = df_clean.loc[row_num]
    bow_vector = dictionary.doc2bow(preprocess(record['article']))

    # Highest score first.
    ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: -pair[1])
    for topic_id, score in ranked_topics:
        topic_words = lda_model.print_topic(topic_id, 3)
        print("Score: {}\t{}\t Topic: {}".format(score, topic_id, topic_words))

    print()
    for column in ('title', 'teaser_text'):
        print(df_clean.loc[row_num][column])
    print()
    print(df_clean.loc[row_num]['article'])

# Load data

In [3]:
# Load the crawled posts from the pickled DataFrame.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle('data/relationship_posts.pkl')
# Displayed as the cell result: (rows, columns) of the raw frame.
data.shape

(4418, 7)

# Clean articles

In [4]:
# Extract the plain article text from each row's HTML and collect it as
# [index, text] pairs so it can be joined back onto `data` by index.
cleaned_articles = []

for index, row in data.iterrows():
    temp_list = []
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so the parse tree can differ between machines.
    # 'html.parser' ships with Python.
    # NOTE(review): the original run may have used another parser (e.g. lxml);
    # confirm that the extracted text is unchanged.
    soup = BeautifulSoup(row['article_div'].replace('\n', ''), 'html.parser')
    for node in soup:
        recursive_check(node, temp_list)

    # Keep the first extracted text block. An article that yields no text gets
    # an empty string instead of raising IndexError, so the frame keeps one
    # row per input row.
    cleaned_articles.append([index, temp_list[0] if temp_list else ''])

cleaned_articles_df = pd.DataFrame(cleaned_articles, columns=['index', 'article']).set_index('index', drop=True)
cleaned_articles_df.shape

(4418, 1)

In [5]:
# Replace the raw HTML column with the cleaned text (joined on the index).
df_clean = data.drop(columns='article_div').join(cleaned_articles_df)

# Display the first cleaned article as a sample.
df_clean['article'].iloc[0]

'As I mentioned at the end of my “International Love Yourself Day,” Valentine’s Day is followed by National Singles Awareness Day. I’ve been musing on that ever since.I like to parse the language we use to adumbrate bigger things, my favorite being losing weight (“Honey, have you seen my weight? I left it by the door 15 minutes ago and now I can’t find it!”). In the case of National Singles Awareness Day, I have couple of inferences to point out.National days, I know from my 25 years of living in a neighborhood that overlooks Manhattan, the Empire State Building is lit in colors commemorating the occasion.  Tonight it will be all red. On St. Patrick’s Day it will be all green. At the end of the month, it will be green and blue for National Eating Disorders Week.Source: Frances KuffelGiven that Singles Awareness Day is a nationally recognized...thing (?)...what color would the Empire State Building be bathed in to...honor (?)...one-half of the population? Beige, maybe? (Are we a group t

# Preprocess docs for LDA

In [6]:
# Tokenize, lemmatise and stem every article (one token list per article).
processed_docs = [preprocess(article) for article in df_clean['article']]

In [7]:
# Build the gensim Dictionary from the tokenised articles.
dictionary = gensim.corpora.Dictionary(processed_docs)
# Displayed as the cell result: number of entries in the dictionary before filtering.
len(dictionary)

31427

In [8]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15
# documents or in more than 50% of all documents.
dictionary.filter_extremes(no_below=15, no_above=0.5)

In [9]:
# Convert every tokenised article into its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Train LDA Model

In [10]:
# Load the cached model if one exists; otherwise train a new one and cache it.
# NOTE(review): a cached model presumably only matches the `dictionary` it was
# trained with - delete the cache when the preprocessing or filtering changes.
try:
    lda_model = gensim.models.LdaModel.load('model_lda/lda_model')
except FileNotFoundError:
    # Only a missing cache file triggers training. Any other failure (corrupt
    # file, interrupt, incompatible version) is surfaced instead of silently
    # starting a long retraining run, which the previous bare `except:` did.
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=10,
                                           id2word=dictionary,
                                           passes=10,
                                           workers=2)
    lda_model.save('model_lda/lda_model')

# Print every topic with its highest-weighted words.
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.009*"tell" + 0.008*"say" + 0.007*"status" + 0.007*"friend" + 0.005*"talk" + 0.005*"post" + 0.005*"friendship" + 0.004*"right" + 0.004*"year" + 0.004*"share"


Topic: 1 
Words: 0.028*"date" + 0.014*"friend" + 0.010*"self" + 0.009*"romant" + 0.008*"women" + 0.008*"singl" + 0.008*"attract" + 0.007*"onlin" + 0.006*"meet" + 0.005*"social"


Topic: 2 
Words: 0.011*"self" + 0.008*"connect" + 0.007*"intimaci" + 0.006*"chang" + 0.005*"share" + 0.005*"creat" + 0.005*"coupl" + 0.005*"learn" + 0.005*"sens" + 0.004*"open"


Topic: 3 
Words: 0.006*"face" + 0.006*"brain" + 0.005*"talk" + 0.005*"happi" + 0.005*"gratitud" + 0.005*"posit" + 0.005*"express" + 0.004*"respons" + 0.004*"human" + 0.004*"interact"


Topic: 4 
Words: 0.015*"coupl" + 0.011*"commit" + 0.010*"marriag" + 0.007*"trust" + 0.007*"long" + 0.006*"tell" + 0.006*"year" + 0.006*"say" + 0.005*"husband" + 0.005*"leav"


Topic: 5 
Words: 0.023*"famili" + 0.019*"parent" + 0.013*"mother" + 0.011*"year" + 0.010*"children" + 0

In [11]:
# Interactive exploration: set `row` to an index label of df_clean to print the
# model's topic scores for that article, followed by its title, teaser and text.
# NOTE: later cells reuse `row` as a loop variable and overwrite this value.
row = 100

show_prediction_for_article(row)

Score: 0.4125726521015167	8	 Topic: 0.015*"studi" + 0.015*"research" + 0.011*"women"
Score: 0.22193555533885956	4	 Topic: 0.015*"coupl" + 0.011*"commit" + 0.010*"marriag"
Score: 0.1920018494129181	2	 Topic: 0.011*"self" + 0.008*"connect" + 0.007*"intimaci"
Score: 0.10308334976434708	1	 Topic: 0.028*"date" + 0.014*"friend" + 0.010*"self"
Score: 0.06715651601552963	5	 Topic: 0.023*"famili" + 0.019*"parent" + 0.013*"mother"

Why Do People Cheat?
Infidelity has many complicated layers that relationship science is still unpacking.

Source: Diego Cervo/ShutterstockInfidelity is a complicated topic. Depending on how one defines it, some studies report that between 30 and 50 percent of people have engaged in emotional and/or physical intimacy with someone other than their romantic partner.The question of why people step out of their relationships has intrigued me for years. For my dissertation1, I asked young adults who had engaged in extradyadic intimacy to tell the story of why they decided 

In [12]:
def get_top_two_topics(article, dictionary=dictionary, lda_model=lda_model):
    """Return the ids of the (at most) two highest-scoring topics for `article`.

    Args:
        article: Raw article text; it is run through `preprocess` first.
        dictionary: Dictionary used to build the bag-of-words vector. The
            default is bound when this cell runs, so re-creating the global
            `dictionary` later does not change it.
        lda_model: Model used for scoring; its default is bound the same way.

    Returns:
        list: Topic ids ordered by descending score. It has fewer than two
        entries when the model returns fewer than two topics.
    """
    bow_vector = dictionary.doc2bow(preprocess(article))
    ranked = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
    return [topic_id for topic_id, _score in ranked[:2]]

# Cache of the per-article predictions. The path is relative, like the input
# data in 'data/'; the previous '/data/...' pointed at the filesystem root.
pred_cache_path = 'data/df_clean_top_pred.pkl'

try:
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df_clean = pd.read_pickle(pred_cache_path)
except FileNotFoundError:
    # Only a missing cache triggers the recomputation; other errors are
    # surfaced rather than hidden behind a bare `except:`.
    df_clean['preds'] = df_clean['article'].apply(get_top_two_topics)
    # `top` is the single best topic id for each article.
    df_clean['top'] = df_clean['preds'].apply(lambda x: x[0])
    df_clean.to_pickle(pred_cache_path)


df_clean.head()

Unnamed: 0,date,category,title,author,blog_post_link,teaser_text,article,preds,top
0,2017-02-14,Relationships,National Single Awareness Day Comes but Once a...,Frances Kuffel,/intl/blog/what-fat-women-want/201702/national...,Maybe it should be called National Single Undy...,As I mentioned at the end of my “International...,"[5, 3]",5
1,2017-01-31,Relationships,Advice That Works for Anyone,Gleb Tsipursky Ph.D.,/intl/blog/intentional-insights/201701/advice-...,Want to avoid bad advice? You need to read thi...,Written by guest blogger Max HarmsSource: wik...,"[8, 3]",8
2,2017-01-02,Relationships,Recovering From Infidelity Part 4,Linda and Charlie Bloom,/intl/blog/stronger-the-broken-places/201701/r...,The 40 repair steps one committed couple used ...,Source: geralt/pixabay.comLinda: These are the...,"[4, 7]",4
3,2020-01-04,Relationships,Not Falling for That! Resisting Control Disgui...,Lisa Aronson Fontes Ph.D.,/intl/blog/invisible-chains/202001/not-falling...,"""Loving"" acts in an abusive relationship funct...",Written with Laura Marjorie MillerAbusive and ...,"[7, 6]",7
4,2020-01-02,Relationships,Convicted Men and the Women Who Love Them,Robert T Muller Ph.D.,/intl/blog/talking-about-trauma/202001/convict...,What can be incomprehensible to many can be al...,"Source: angus mcdiarmid at Flicker, Creative C...","[7, 8]",7


In [13]:
# Flag the articles for which the model returned fewer than two topics.
df_clean['only_one_topic'] = df_clean['preds'].apply(lambda preds: len(preds) < 2)
df_clean[df_clean['only_one_topic']].head()

Unnamed: 0,date,category,title,author,blog_post_link,teaser_text,article,preds,top,only_one_topic
444,2016-05-28,Relationships,Three Steps To Getting Out of a Toxic Relation...,Jeffrey Bernstein Ph.D.,/intl/blog/liking-the-child-you-love/201605/th...,Are you struggling with criticism and contempt...,Are you in a toxic relationship? You did not g...,[6],6,True
698,2020-02-06,Relationships,Study: 3 Types of Pick-Up Lines Women Can Use ...,"Jeremy Nicholson M.S.W., Ph.D.",/intl/blog/the-attraction-doctor/202002/study-...,How do men respond to different pick-up lines ...,"In a previous article, I discussed the science...",[8],8,True
790,2020-01-19,Relationships,Do Attractive Social Media Profiles Hinder Com...,Karen Wu Ph.D.,/intl/blog/the-modern-heart/202001/do-attracti...,Do attractive social media profiles pose a thr...,Attractive alternatives on social media may po...,[8],8,True
810,2018-02-24,Relationships,"Belonging Is Our Blessing, Tribalism Is our Bu...",Saul Levine M.D.,/intl/blog/our-emotional-footprint/201802/belo...,Feeling that we belong is crucial to our quali...,"We humans are a social species, tribal by natu...",[6],6,True
900,2016-09-16,Relationships,3 Simple Questions to Improve Your Relationship,"Amie M. Gordon, Ph.D.",/intl/blog/between-you-and-me/201609/3-simple-...,We are wired to be selfish—to think about our ...,Source: Peter Drier/FlickrTake a minute to thi...,[7],7,True


In [14]:
# After reviewing the predictions and the headlines of the corresponding articles,
# this is how I interpret the different topics:
# 1 - Dating
# 2 - Self-help
# 3 - Self-help with focus on self-improvement
# 4 - Partnership
# 5 - Family
# 6 - Self-help with focus on problem solving
# 7 - Negativity
# 8 - Studies / research
# 9 - Break ups / healing / friendship

In [15]:
# List article titles whose top topic is `topic_number`.
# Use this interactively by changing `topic_number`.
topic_number = 0

# The 40 highest-weighted words of the topic, followed by a blank line.
print(lda_model.print_topic(topic_number, 40), end='\n\n')

# Titles of the first 80 matching articles.
titles_for_topic = df_clean.loc[df_clean['top'] == topic_number, 'title'].head(80)
for title in titles_for_topic:
    print(title)

0.009*"tell" + 0.008*"say" + 0.007*"status" + 0.007*"friend" + 0.005*"talk" + 0.005*"post" + 0.005*"friendship" + 0.004*"right" + 0.004*"year" + 0.004*"share" + 0.004*"valentin" + 0.004*"blog" + 0.004*"moment" + 0.004*"ask" + 0.003*"give" + 0.003*"date" + 0.003*"stori" + 0.003*"famili" + 0.003*"studi" + 0.003*"high" + 0.003*"question" + 0.003*"play" + 0.003*"charact" + 0.003*"write" + 0.003*"inform" + 0.003*"romant" + 0.003*"chang" + 0.003*"book" + 0.003*"silenc" + 0.003*"celebr" + 0.003*"fact" + 0.003*"posit" + 0.003*"see" + 0.003*"facebook" + 0.003*"point" + 0.003*"call" + 0.002*"media" + 0.002*"long" + 0.002*"have" + 0.002*"news"

Be My Valentine - And Save Our Democracy!
How to Keep Romantic Comedies From Ruining Your Love Life
13 Ways to Say You're Really in Love
Searching for Beneficiaries May Surprise and Shock You
Trump, Love, and the Presidency
Tough Problems: Relationships in the Time of Coronavirus
5 Ways to Use Romance to Build and Sustain Your Marriage
Learning to Meet in 

In [16]:
# Hand-made interpretation of each topic id, used to check how well the
# single-topic articles fit these labels. The position in the list is the topic id.
topic_labels = [
    'Unknown',
    'Dating',
    'Self-help',
    'Self-help with focus on self-improvment',
    'Partnership',
    'Family',
    'Self-help with focus on problem solving',
    'Negativity',
    'Studies / reasearch',
    'Break ups / healing / friendship',
]
topic_dic = dict(enumerate(topic_labels))

# Attach the readable label of each article's top topic.
df_clean['category_name'] = df_clean['top'].apply(lambda top_id: topic_dic[int(top_id)])

In [17]:
# Titles of the articles that received exactly one topic, for the chosen topic.
investigate_topic = 9

is_single_topic = df_clean['only_one_topic'] == True
has_chosen_topic = df_clean['top'] == investigate_topic
inspect_def = df_clean[is_single_topic & has_chosen_topic]

print(f'Rows: {inspect_def.shape[0]}')
print(f'Topic: {topic_dic[investigate_topic]}', end='\n\n')

for title in inspect_def['title']:
    print(title)

Rows: 8
Topic: Break ups / healing / friendship

5 Ways to Stop Being a Narcissistic Friend
Toxic Friendships
Don Rickles: Can Humor Go Too Far?
When Unending Love Ends
The Past Is Always About the Present
How to Help the Empath 
Proximity and Preference – Why We Like Who We Are Close To
 Relationships and Intimacy in Eating Disorder Recovery


In [18]:
# One record per (preprocessed title word, article) pair, carrying the article's
# top topic id and label, to see which title words are most common per topic.
word_topic_relationship = [
    [word, article['top'], article['category_name']]
    for _, article in df_clean.iterrows()
    for word in preprocess(article['title'])
]

word_topic_relationship_df = pd.DataFrame(word_topic_relationship, columns=['word', 'top', 'category_name'])

In [19]:
# Ten most frequent (preprocessed) title words among articles of the chosen topic.
investigate_topic = 8
print(f'Topic: {topic_dic[investigate_topic]}', end='\n\n')

# The stray mid-cell `word_topic_relationship_df.head()` was removed: only the
# last expression of a cell is displayed, so it had no effect.
is_chosen_topic = word_topic_relationship_df['top'] == investigate_topic
word_topic_relationship_df.loc[is_chosen_topic, 'word'].value_counts().head(10)

Topic: Studies / reasearch



relationship    135
love             71
partner          63
date             47
peopl            35
romant           34
attract          33
way              31
women            25
onlin            24
Name: word, dtype: int64

In [20]:
# Number of articles per top-topic label.
df_clean['category_name'].value_counts()

Negativity                                 943
Studies / reasearch                        650
Family                                     643
Self-help                                  577
Partnership                                369
Break ups / healing / friendship           360
Dating                                     274
Self-help with focus on self-improvment    245
Self-help with focus on problem solving    193
Unknown                                    164
Name: category_name, dtype: int64

# Exploring a subset of the data
After reviewing the model, I got the impression that topic 9 in particular should contain three sub-topics, namely "friendships", "break-ups" and "healing". I wanted to find out whether an LDA model would find these if it were trained only on the topic 9 articles.

In [21]:
# Keep only the articles whose top topic is 9 and drop the old predictions,
# which are recomputed against the sub-model further down.
only_topic_9_df = df_clean.loc[df_clean['top'] == 9].drop(columns=['preds', 'top'])

# Tokenize, lemmatise and stem the subset.
processed_docs_t9 = [preprocess(article) for article in only_topic_9_df['article']]

# Dictionary and bag-of-words corpus restricted to the subset.
dictionary_2 = gensim.corpora.Dictionary(processed_docs_t9)
dictionary_2.filter_extremes(no_below=15, no_above=0.5)
bow_corpus_2 = list(map(dictionary_2.doc2bow, processed_docs_t9))

# Train a 3-topic model on the subset only.
lda_model_2 = gensim.models.LdaMulticore(
    bow_corpus_2,
    num_topics=3,
    id2word=dictionary_2,
    passes=10,
    workers=2,
)

for topic_id, topic_words in lda_model_2.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_words))
    print("\n")

def get_top_two_topics_2(article):
    """Return the ids of the (at most) two best sub-topics for `article`.

    Thin wrapper around `get_top_two_topics` that scores against the topic-9
    sub-model (`dictionary_2`, `lda_model_2`) instead of repeating its logic.

    Args:
        article: Raw article text.

    Returns:
        list: Sub-topic ids ordered by descending score.
    """
    return get_top_two_topics(article, dictionary=dictionary_2, lda_model=lda_model_2)

# Score every topic-9 article with the sub-model. From here on `preds` and `top`
# in this frame hold sub-model topic ids, not ids of the main model.
only_topic_9_df['preds'] = only_topic_9_df['article'].apply(get_top_two_topics_2)
# `top` is the first (highest-scoring) entry of `preds`.
only_topic_9_df['top'] = only_topic_9_df['preds'].apply(lambda x: x[0])
only_topic_9_df.head()

Topic: 0 
Words: 0.060*"twin" + 0.009*"understand" + 0.008*"differ" + 0.008*"close" + 0.007*"parent" + 0.007*"heart" + 0.007*"problem" + 0.006*"fight" + 0.006*"sister" + 0.006*"separ"


Topic: 1 
Words: 0.024*"friendship" + 0.011*"social" + 0.008*"connect" + 0.006*"chang" + 0.005*"phone" + 0.005*"text" + 0.005*"best" + 0.005*"differ" + 0.005*"spend" + 0.005*"famili"


Topic: 2 
Words: 0.010*"self" + 0.008*"toxic" + 0.007*"past" + 0.006*"pain" + 0.006*"breakup" + 0.006*"break" + 0.006*"memori" + 0.005*"behavior" + 0.005*"look" + 0.005*"leav"




Unnamed: 0,date,category,title,author,blog_post_link,teaser_text,article,only_one_topic,category_name,preds,top
6,2016-02-26,Relationships,8 Ways to Recover from a Breakup,Jean Kim M.D.,/intl/blog/culture-shrink/201602/8-ways-recove...,Breakups are a common part of the human experi...,Source: arvitalyaa/ShutterstockRomantic breaku...,False,Break ups / healing / friendship,[2],2
24,2016-07-31,Relationships,The Wedding Season Friendship Strain (And How ...,Mariana Bockarova Ph.D.,/intl/blog/romantically-attached/201607/the-we...,Planning a wedding? Read this before announcin...,Source: Stockpic/Pexels CC0After six months of...,False,Break ups / healing / friendship,[1],1
46,2016-03-25,Relationships,5 Ways to Bring More Intimacy into Your Life,Jill P. Weber Ph.D.,/intl/blog/having-sex-wanting-intimacy/201603/...,"More than personal successes or riches, people...",Source: Peter Bernik/ShutterstockEmotional int...,False,Break ups / healing / friendship,"[1, 2]",1
48,2019-12-03,Relationships,Seven Tricks to Help You Get Past a Failed Rom...,Suzanne Degges-White Ph.D.,/intl/blog/lifetime-connections/201912/seven-t...,"When a relationship is over, it isn’t always e...","If a breakup is “for real,” it’s best to let g...",False,Break ups / healing / friendship,"[2, 1]",2
68,2016-10-31,Relationships,"Tiger Woods & Elin Nordegren: Time Apart, Heal...",Jane Greer Ph.D.,/intl/blog/shrink-wrap/201610/tiger-woods-elin...,Forever family?,Source: Getty ImagesTiger Woods opened up abou...,False,Break ups / healing / friendship,"[1, 2]",1


In [22]:
# List article titles whose top sub-topic is `topic_number`.
# Use this interactively by changing `topic_number`.
topic_number = 1

# The 40 highest-weighted words of the sub-topic, followed by a blank line.
print(lda_model_2.print_topic(topic_number, 40), end='\n\n')

# Titles of the first 20 matching articles.
titles_for_subtopic = only_topic_9_df.loc[only_topic_9_df['top'] == topic_number, 'title'].head(20)
for title in titles_for_subtopic:
    print(title)

0.024*"friendship" + 0.011*"social" + 0.008*"connect" + 0.006*"chang" + 0.005*"phone" + 0.005*"text" + 0.005*"best" + 0.005*"differ" + 0.005*"spend" + 0.005*"famili" + 0.005*"close" + 0.005*"activ" + 0.004*"mean" + 0.004*"convers" + 0.004*"place" + 0.004*"support" + 0.004*"say" + 0.004*"better" + 0.004*"care" + 0.004*"long" + 0.004*"happi" + 0.004*"question" + 0.004*"start" + 0.004*"meet" + 0.004*"year" + 0.004*"communic" + 0.004*"look" + 0.004*"happen" + 0.004*"have" + 0.004*"tell" + 0.003*"research" + 0.003*"coupl" + 0.003*"get" + 0.003*"sure" + 0.003*"student" + 0.003*"women" + 0.003*"expect" + 0.003*"face" + 0.003*"interact" + 0.003*"date"

The Wedding Season Friendship Strain (And How to Avoid It)
5 Ways to Bring More Intimacy into Your Life
Tiger Woods & Elin Nordegren: Time Apart, Healing Betrayal?
5 Simple Steps to Better Relationships
Beyoncé and Jay-Z: A Balancing Act?
The Hard Work of Social Distancing
Talking Instead of Texting Can Save Relationships
How to Make Friends in 

In [23]:
# Final look at the main frame, including the derived prediction columns.
df_clean.head()

Unnamed: 0,date,category,title,author,blog_post_link,teaser_text,article,preds,top,only_one_topic,category_name
0,2017-02-14,Relationships,National Single Awareness Day Comes but Once a...,Frances Kuffel,/intl/blog/what-fat-women-want/201702/national...,Maybe it should be called National Single Undy...,As I mentioned at the end of my “International...,"[5, 3]",5,False,Family
1,2017-01-31,Relationships,Advice That Works for Anyone,Gleb Tsipursky Ph.D.,/intl/blog/intentional-insights/201701/advice-...,Want to avoid bad advice? You need to read thi...,Written by guest blogger Max HarmsSource: wik...,"[8, 3]",8,False,Studies / reasearch
2,2017-01-02,Relationships,Recovering From Infidelity Part 4,Linda and Charlie Bloom,/intl/blog/stronger-the-broken-places/201701/r...,The 40 repair steps one committed couple used ...,Source: geralt/pixabay.comLinda: These are the...,"[4, 7]",4,False,Partnership
3,2020-01-04,Relationships,Not Falling for That! Resisting Control Disgui...,Lisa Aronson Fontes Ph.D.,/intl/blog/invisible-chains/202001/not-falling...,"""Loving"" acts in an abusive relationship funct...",Written with Laura Marjorie MillerAbusive and ...,"[7, 6]",7,False,Negativity
4,2020-01-02,Relationships,Convicted Men and the Women Who Love Them,Robert T Muller Ph.D.,/intl/blog/talking-about-trauma/202001/convict...,What can be incomprehensible to many can be al...,"Source: angus mcdiarmid at Flicker, Creative C...","[7, 8]",7,False,Negativity
