## Topic Modeling

In [8]:
import os, csv, nltk, lda
import pandas as pd
import numpy as np
import string
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import PunktSentenceTokenizer,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/domitillechambon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/domitillechambon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/domitillechambon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Cleaning Data

In [4]:
# Read in CombinedTop100Scrape
raw = pd.read_csv("CombinedTop100Scrape.csv")
raw = raw.drop("Unnamed: 0", axis= 1)

In [9]:
# Function to remove punctuation
def punctuation_extermination(s):
    o = str.maketrans('', '', string.punctuation)
    return str(s).translate(o)

# Remove stopwords from columns within dataframes
def removeStopWords(x):
    t = x.split()
    newTweet = []
    for word in t:
        if word not in stopwords.words("english"):
            newTweet.append(word)
    newTweet = ' '.join(newTweet)
    return newTweet

# Cleaning Post Text column
raw['Post Text'] = raw['Post Text'].apply(punctuation_extermination)
raw['Post Text'] = raw['Post Text'].apply(lambda i: i.lower())
raw['Post Text'] = raw['Post Text'].apply(lambda i: (removeStopWords(i)))

In [13]:
# Cleaning Post Text column
raw['Title'] = raw['Title'].apply(punctuation_extermination)
raw['Title'] = raw['Title'].apply(lambda i: i.lower())
raw['Title'] = raw['Title'].apply(lambda i: (removeStopWords(i)))

In [11]:
# Cleaning Body column
raw['Body'] = raw['Body'].apply(punctuation_extermination)
raw['Body'] = raw['Body'].apply(lambda i: i.lower())
raw['Body'] = raw['Body'].apply(lambda i: (removeStopWords(i)))

In [15]:
# Exporting cleaned data
raw.to_csv("CleanedTitleComments.csv")

#### Topic Modeling Titles

In [51]:
# Read in Cleaned Data
cleanData = pd.read_csv("CleanedTitleComments.csv")
cleanData = cleanData.drop("Unnamed: 0", axis= 1)
cleanData["index"] = range(1, len(cleanData) + 1)

In [52]:
# Creating dfTitle dataframe
dfTitle = cleanData[["index", "Title"]]
dfTitle = dfTitle.rename(columns= {"index": "id", "Title": "Labels"})

In [53]:
#checking for nulls if present any
print("Number of rows with any of the empty columns:")
print(dfTitle.isnull().sum().sum())
dfTitle = dfTitle.dropna()

Number of rows with any of the empty columns:
0


In [54]:
# Variables for two column names and number of topics
restaurant_name = "id"
restaurant_review = "Labels"
ntopics= 5

In [55]:
# Actualizing functions
word_tokenizer=RegexpTokenizer(r'\w+')
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_nltk=set(stopwords.words('english'))

In [56]:
# Function that tokenizes the text
def tokenize_text(version_desc):
    lowercase=version_desc.lower()
    text = wordnet_lemmatizer.lemmatize(lowercase)
    tokens = word_tokenizer.tokenize(text)
    return tokens

In [57]:
# Word to Vec variable created and total number of feeatures and words
vec_words = CountVectorizer(tokenizer=tokenize_text,stop_words=stopwords_nltk,decode_error='ignore')
total_features_words = vec_words.fit_transform(dfTitle[restaurant_review])

print(total_features_words.shape)

(55192, 404)




In [58]:
# Actualizing the model
model = lda.LDA(n_topics=int(ntopics), n_iter=500, random_state=1)
model.fit(total_features_words)

INFO:lda:n_documents: 55192
INFO:lda:vocab_size: 404
INFO:lda:n_words: 355874
INFO:lda:n_topics: 5
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -2796649
INFO:lda:<10> log likelihood: -1761363
INFO:lda:<20> log likelihood: -1661015
INFO:lda:<30> log likelihood: -1644694
INFO:lda:<40> log likelihood: -1640104
INFO:lda:<50> log likelihood: -1639887
INFO:lda:<60> log likelihood: -1640146
INFO:lda:<70> log likelihood: -1639807
INFO:lda:<80> log likelihood: -1639399
INFO:lda:<90> log likelihood: -1638963
INFO:lda:<100> log likelihood: -1639293
INFO:lda:<110> log likelihood: -1639030
INFO:lda:<120> log likelihood: -1638268
INFO:lda:<130> log likelihood: -1638459
INFO:lda:<140> log likelihood: -1638174
INFO:lda:<150> log likelihood: -1638009
INFO:lda:<160> log likelihood: -1638435
INFO:lda:<170> log likelihood: -1637680
INFO:lda:<180> log likelihood: -1637959
INFO:lda:<190> log likelihood: -1637871
INFO:lda:<200> log likelihood: -1637482
INFO:lda:<210> log likelihood: -1638098
INFO:lda:<2

<lda.lda.LDA at 0x165af0d30>

In [59]:
topic_word = model.topic_word_
doc_topic=model.doc_topic_
doc_topic=pd.DataFrame(doc_topic)
dfTitle=dfTitle.join(doc_topic)
restaurant=pd.DataFrame()

In [60]:
for i in range(int(ntopics)):
    topic="topic_"+str(i)
    restaurant[topic]=dfTitle.groupby([restaurant_name])[i].mean()

In [61]:
restaurant=restaurant.reset_index()
topics=pd.DataFrame(topic_word)
topics.columns=vec_words.get_feature_names()
topics1=topics.transpose()
print ("Topics word distribution written in file Title - topic_word_dist.csv ")
topics1.to_csv("Title - topic_word_dist.csv")
restaurant.to_csv("Title - document_topic_dist.csv",index=False)
print ("Document topic distribution written in file Title - document_topic_dist.csv ")



Topics word distribution written in file Title - topic_word_dist.csv 
Document topic distribution written in file Title - document_topic_dist.csv 


#### Topic Modeling Post Text

In [62]:
# Creating dfPostText dataframe
dfPostText = cleanData[["index", "Post Text"]]
dfPostText = dfPostText.rename(columns= {"index": "id", "Post Text": "Labels"})

In [63]:
#checking for nulls if present any
print("Number of rows with any of the empty columns:")
print(dfPostText.isnull().sum().sum())
dfPostText = dfPostText.dropna()

Number of rows with any of the empty columns:
0


In [64]:
# Variables for two column names and number of topics
restaurant_name = "id"
restaurant_review = "Labels"
ntopics= 5

In [65]:
# Actualizing functions
word_tokenizer=RegexpTokenizer(r'\w+')
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_nltk=set(stopwords.words('english'))

In [66]:
# Word to Vec variable created and total number of feeatures and words
vec_words = CountVectorizer(tokenizer=tokenize_text,stop_words=stopwords_nltk,decode_error='ignore')
total_features_words = vec_words.fit_transform(dfPostText[restaurant_review])

print(total_features_words.shape)



(55192, 2768)


In [67]:
# Actualizing the model
model = lda.LDA(n_topics=int(ntopics), n_iter=500, random_state=1)
model.fit(total_features_words)

INFO:lda:n_documents: 55192
INFO:lda:vocab_size: 2768
INFO:lda:n_words: 5017184
INFO:lda:n_topics: 5
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -43660608
INFO:lda:<10> log likelihood: -33729303
INFO:lda:<20> log likelihood: -33044334
INFO:lda:<30> log likelihood: -32897376
INFO:lda:<40> log likelihood: -32784971
INFO:lda:<50> log likelihood: -32767650
INFO:lda:<60> log likelihood: -32755389
INFO:lda:<70> log likelihood: -32743929
INFO:lda:<80> log likelihood: -32726911
INFO:lda:<90> log likelihood: -32703222
INFO:lda:<100> log likelihood: -32688843
INFO:lda:<110> log likelihood: -32678654
INFO:lda:<120> log likelihood: -32664656
INFO:lda:<130> log likelihood: -32657884
INFO:lda:<140> log likelihood: -32653949
INFO:lda:<150> log likelihood: -32642477
INFO:lda:<160> log likelihood: -32639964
INFO:lda:<170> log likelihood: -32629277
INFO:lda:<180> log likelihood: -32630111
INFO:lda:<190> log likelihood: -32630955
INFO:lda:<200> log likelihood: -32619981
INFO:lda:<210> log likelihoo

<lda.lda.LDA at 0x2941d5a80>

In [68]:
topic_word = model.topic_word_
doc_topic=model.doc_topic_
doc_topic=pd.DataFrame(doc_topic)
dfPostText=dfPostText.join(doc_topic)
restaurant=pd.DataFrame()

In [69]:
for i in range(int(ntopics)):
    topic="topic_"+str(i)
    restaurant[topic]=dfPostText.groupby([restaurant_name])[i].mean()

In [70]:
restaurant=restaurant.reset_index()
topics=pd.DataFrame(topic_word)
topics.columns=vec_words.get_feature_names()
topics1=topics.transpose()
print ("Topics word distribution written in file Post Text - topic_word_dist.csv ")
topics1.to_csv("Post Text - topic_word_dist.csv")
restaurant.to_csv("Post Text - document_topic_dist.csv",index=False)
print ("Document topic distribution written in file Post Text - document_topic_dist.csv ")



Topics word distribution written in file Post Text - topic_word_dist.csv 
Document topic distribution written in file Post Text - document_topic_dist.csv 


#### Topic Modeling Comments

In [71]:
# Creating dfBody dataframe
dfBody = cleanData[["index", "Body"]]
dfBody = dfBody.rename(columns= {"index": "id", "Body": "Labels"})

In [72]:
#checking for nulls if present any
print("Number of rows with any of the empty columns:")
print(dfBody.isnull().sum().sum())
dfBody = dfBody.dropna()

Number of rows with any of the empty columns:
337


In [73]:
# Variables for two column names and number of topics
restaurant_name = "id"
restaurant_review = "Labels"
ntopics= 5

In [74]:
# Actualizing functions
word_tokenizer=RegexpTokenizer(r'\w+')
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_nltk=set(stopwords.words('english'))

In [75]:
# Word to Vec variable created and total number of feeatures and words
vec_words = CountVectorizer(tokenizer=tokenize_text,stop_words=stopwords_nltk,decode_error='ignore')
total_features_words = vec_words.fit_transform(dfBody[restaurant_review])

print(total_features_words.shape)



(54855, 33076)


In [76]:
# Actualizing the model
model = lda.LDA(n_topics=int(ntopics), n_iter=500, random_state=1)
model.fit(total_features_words)

INFO:lda:n_documents: 54855
INFO:lda:vocab_size: 33076
INFO:lda:n_words: 998891
INFO:lda:n_topics: 5
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -9931262
INFO:lda:<10> log likelihood: -8687354
INFO:lda:<20> log likelihood: -8566889
INFO:lda:<30> log likelihood: -8496695
INFO:lda:<40> log likelihood: -8437679
INFO:lda:<50> log likelihood: -8392631
INFO:lda:<60> log likelihood: -8358450
INFO:lda:<70> log likelihood: -8333913
INFO:lda:<80> log likelihood: -8315751
INFO:lda:<90> log likelihood: -8301897
INFO:lda:<100> log likelihood: -8292562
INFO:lda:<110> log likelihood: -8285331
INFO:lda:<120> log likelihood: -8277307
INFO:lda:<130> log likelihood: -8270391
INFO:lda:<140> log likelihood: -8267794
INFO:lda:<150> log likelihood: -8266851
INFO:lda:<160> log likelihood: -8263048
INFO:lda:<170> log likelihood: -8259582
INFO:lda:<180> log likelihood: -8256876
INFO:lda:<190> log likelihood: -8254373
INFO:lda:<200> log likelihood: -8254750
INFO:lda:<210> log likelihood: -8253102
INFO:lda:

<lda.lda.LDA at 0x2941d6f50>

In [77]:
topic_word = model.topic_word_
doc_topic=model.doc_topic_
doc_topic=pd.DataFrame(doc_topic)
dfBody=dfBody.join(doc_topic)
restaurant=pd.DataFrame()

In [78]:
for i in range(int(ntopics)):
    topic="topic_"+str(i)
    restaurant[topic]=dfBody.groupby([restaurant_name])[i].mean()

In [79]:
restaurant=restaurant.reset_index()
topics=pd.DataFrame(topic_word)
topics.columns=vec_words.get_feature_names()
topics1=topics.transpose()
print ("Topics word distribution written in file Body - topic_word_dist.csv ")
topics1.to_csv("Body - topic_word_dist.csv")
restaurant.to_csv("Body - document_topic_dist.csv",index=False)
print ("Document topic distribution written in file Body - document_topic_dist.csv ")



Topics word distribution written in file Body - topic_word_dist.csv 
Document topic distribution written in file Body - document_topic_dist.csv 


## Quartile & Topic Weights based on Titles

In [82]:
# Finding first and third quartiles
print("First quartile:", cleanData["Title Score"].quantile([.25]))
print("Third quartile:", cleanData["Title Score"].quantile([.75]))

First quartile: 0.25    2230.0
Name: Title Score, dtype: float64
Third quartile: 0.75    3664.0
Name: Title Score, dtype: float64


In [83]:
# Setting first and third quartile scores
firstQuartile = float(2230)
thirdQuartile = float(3664)

# Dataframe with values from first and third quartiles
dfQuartile = cleanData[(cleanData["Title Score"] <= firstQuartile) | (cleanData["Title Score"] >= thirdQuartile)]

In [84]:
# Identifying quartile
def quart(val):
    """ Determine which quartile the image belongs to"""
    if val >= thirdQuartile:
        return "third"
    elif val <= firstQuartile:
        return "first"

In [88]:
dfQuartile.head(1)

Unnamed: 0,Title,Post Text,ID,Title Score,Total Comments,Post URL,User,Body,Comment Score,index
0,someone really likes it’s effortless,keep mind text right away soon excited make pl...,sgcach,11302,508,https://www.reddit.com/r/dating_advice/comment...,AutoModerator,welcome rdatingadvice please keep ruleshttpsww...,1,1


In [90]:
# Labeling first or third quartile
dfQuartile["Quartile"] = dfQuartile["Title Score"].apply(lambda x: quart(x))

# Excluding unnecessary columns from dataframe
dfQuartile = dfQuartile.drop(["Post URL", "index"], axis= 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfQuartile["Quartile"] = dfQuartile["Title Score"].apply(lambda x: quart(x))


In [92]:
# Reading in topic modeling csv
topicModels = pd.read_csv("Title - document_topic_dist.csv")

# Concatenating two dataframes
dfComb = pd.concat([dfQuartile, topicModels], axis= 1)

# Removing unnecessary columns
dfComb = dfComb[["Quartile", "topic_0", "topic_1", "topic_2", "topic_3", "topic_4"]]

In [94]:
# Averaging topic values and grouping by first and third quartile
dfAverage = dfComb.groupby("Quartile").mean()

In [95]:
# Average topic values per 2 quartiles pre topic naming
dfAverage

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4
Quartile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
first,0.147653,0.32894,0.238324,0.053237,0.231846
third,0.30448,0.126188,0.179941,0.247426,0.141964


<h4> <strong> Topics: </strong> </h4>
<p> <strong> topic_0 - </strong> Red flags on first date </p>
<p> <strong> topic_1 - </strong> Women having problem with men being too interested in sex </p>
<p> <strong> topic_2 - </strong> Sexual insecurities </p>
<p> <strong> topic_3 - </strong> Men focuesed on attractiveness </p>
<p> <strong> topic_4 - </strong> Difficulties surrounding dating </p>

</br>

<h4> <strong> Topic Value Assessment: </strong> </h4>
<p> There are three significant differences between the two engagement quartiles in regard to the following topics: topic_0, topic_1, topic_3. Topic_0 and topic_3 have larger proportions in the third quartile (the higher engagement quartile). Topic_2 has a larger proportion in the first quartile (the lower engagement quartile). Topic_2 and topic_4 have larger proportions in the first quartile (the lower engagement quartile). </p>
<p> <strong> Most popular topics: </strong> topic_0 & topic_3 </p>

</br>

<ol>
<h5>High Engagement Keywords:</h5>

<li>Date, First, Said, Red, Sex, Got, Time, Already, Body, Fupa, Negative, Things, Weeks, Flag, Problems, Guy, Friends, Benefits, Creating</li>
<li>Guy, Men, Want, Attractive, Every, Please, Think, Age, Annoying, F, Fucking, Fun, God, Share, Threesome, Every, Women, Went, Another</li>


<h5>Low Engagement Keywords:</h5>

<li>Woman, Sex, Men, Reddit, Turn, Dates, Dating, Stop, Talk, I'm, Wants, Please, People, Talking, Interested, Still, Soon, Nice, Tell, 25f</li>
<li>First, Date, Girl, Would, Lesser, Sex, Think, Feel, Situation, Showed, Make, Move, Tinder, Life, Makeup, Hasn't, Kiss, Know, Let, Micropenis</li>
<li>Women, Dating, Guy, Difficult, Thing, What's, Man, Extremely, Way, Girls, Looks, Beside, Makes, Attractive, You're, Friend, Initiate, Best, Someone, Told</li>
</ol>