In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

import os
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction import text 
from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis

  from collections import Iterable
  from collections import Mapping


In [2]:
# max rows and columns
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
# display all contents in dataframe
pd.option_context("display.max_colwidth", -1, "display.max_rows", None)

<pandas._config.config.option_context at 0x28295379bc8>

In [3]:
df = pd.DataFrame()
for file in os.listdir("../data/"):
    df = pd.concat([df, pd.read_csv("../data/"+file)], axis=0)

In [4]:
df.isnull().sum()

subreddit        0
created_utc      0
body           754
score            0
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df['year'] = [datetime.fromtimestamp(utc).year for utc in df['created_utc']]
df['month'] = [datetime.fromtimestamp(utc).month for utc in df['created_utc']]

In [7]:
df['year_month'] = df['year'].map(str) + '-' + df['month'].map(str)
df['year_month'] = pd.to_datetime(df['year_month'], format='%Y-%m').dt.strftime('%Y-%m')

In [8]:
df.drop(columns=['created_utc'], axis=1, inplace=True)

In [9]:
df['subreddit'].unique()

array(['HeadphoneAdvice', 'headphones'], dtype=object)

In [17]:
# drop 2020 data as the portion is small
df.drop(df[df['year'] == 2020].index, axis=0, inplace=True)

In [20]:
years = df[df['subreddit']!='HeadphoneAdvice']['year'].unique()

In [11]:
df.head()

Unnamed: 0,subreddit,body,score,year,month,year_month
0,HeadphoneAdvice,[PA] Upgrade from HyperX cloud 2's.I have been...,1,2020,2,2020-02
1,HeadphoneAdvice,"Neutral, USB stick style DAC, $100-200 range. ...",1,2020,2,2020-02
2,HeadphoneAdvice,Sennheiser HD 4.5 BTNC vs JBL Live 650BTNCLook...,1,2020,2,2020-02
3,HeadphoneAdvice,Best General Use IEMS under 200?I've been look...,1,2020,2,2020-02
4,HeadphoneAdvice,What gaming headphones should I buy?I have use...,1,2020,2,2020-02


In [12]:
stopwords_final = text.ENGLISH_STOP_WORDS.union({'https', 'http', 'www', 'com', 'headphones', 'headphone', 'headphoneadvice', 'earphones', 'earphone', 'org', 'reddit', 've', 'just'})

In [13]:
stopwords_final

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [14]:
cvec = CountVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9, max_features=20000, stop_words=stopwords_final)
X = cvec.fit_transform(df['body'])

In [23]:
# Create LDA visualization and return trained LDA model
def get_LDA(n_topics, training_data, vectorized, file_name):
    # Create and fit the LDA model
    model = LDA(n_components=n_topics, n_jobs=4)
    model.fit(training_data)
    
    # Create LDA visualization
    LDAvis_prepared = sklearn_lda.prepare(model, training_data, vectorized)
    pyLDAvis.save_html(LDAvis_prepared, '../templates/ldavis_prepared_'+ file_name + '_' + str(n_topics) +'.html')
    
    return model

In [25]:
n_topics = [3, 5, 7]
data = {}
model = {}

# Model by year
for year in years:
    for num in n_topics:
        data[str(year)+'_'+str(num)] = cvec.transform(df[df['year']==year]['body'])
        model[str(year)+'_'+str(num)] = get_LDA(num, data[str(year)+'_'+str(num)], cvec, str(year))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept t

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept t

In [115]:
n_topics = 7

X_b_2017 = cvec.transform(df[df['year']<=2017]['body'])
X_2018 = cvec.transform(df[df['year']==2018]['body'])
X_a_2019 = cvec.transform(df[df['year']>=2019]['body'])

lda_b_2017 = get_LDA(n_topics, X_b_2017, cvec, 'b_2017')
lda_2018 = get_LDA(n_topics, X_2018, cvec, '2018')
lda_a_2019 = get_LDA(n_topics, X_a_2019, cvec, 'a_2019')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [116]:
def get_top_words(n_words, model, vectorized):
    top_compo = []
    exp_compo = []
    for i, topic in enumerate(model.exp_dirichlet_component_):
        words = np.array(vectorized.get_feature_names())[np.argsort(topic)][:-(n_words+1):-1]
        top_compo.append([word for word in words])
        exp_compo.append(sum(np.sort(topic)[:-(n_words+1):-1]))
        
    return pd.concat([pd.DataFrame(exp_compo, columns=['pca_top'+str(n_words)]), pd.DataFrame(top_compo)], axis = 1)

In [117]:
n_words = 30
df_2017 = get_top_words(n_words, lda_b_2017, cvec).sort_values(by='pca_top'+str(n_words), ascending = False)
df_2018 = get_top_words(n_words, lda_2018, cvec).sort_values(by='pca_top'+str(n_words), ascending = False)
df_2019 = get_top_words(n_words, lda_a_2019, cvec).sort_values(by='pca_top'+str(n_words), ascending = False)

In [118]:
df_2017

Unnamed: 0,pca_top30,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
6,0.286426,amp,removed,ath,audio,amazon,dac,pro,technica,audio technica,dt,beyerdynamic,deleted,akg,dp,need,gaming,vs,technica ath,fiio,sennheiser,ref,m50x,dac amp,looking,ohm,good,utf8,utf8 amp,sound,amp dac
4,0.249564,guide,purchase,head fi,fi,head,thread,looking,buying,buying guide,gaming,comments,google,spreadsheet,amp,format,sure,read,purchase advice,advice,posting,make,docs,docs google,alternatives,make sure,check,50,beats,budget,wiki
1,0.236715,removed,cable,ear,sennheiser,headset,looking,pair,good,help,wireless,hd,mic,need,earbuds,bluetooth,sony,best,quality,use,noise,sound,new,gaming,like,buy,just,mdr,deleted,know,sennheiser hd
2,0.195057,music,looking,like,pair,good,listen,sound,ve,really,want,don,need,budget,use,bass,preferred,isolation,ear,quality,beats,know,rock,thanks,listening,just,lot,100,closed,open,price
0,0.167811,sound,audio,just,volume,dac,amp,using,use,like,music,ve,usb,quality,know,schiit,does,setup,phone,pc,don,got,hear,fiio,gt,question,really,way,speakers,problem,difference
3,0.154759,sound,bass,like,just,really,good,better,ve,bit,ears,don,feel,treble,quality,great,mids,listening,sounds,soundstage,little,think,music,iems,head,say,time,end,ear,pretty,review
5,0.145766,just,new,pair,got,ve,pads,know,like,imgur,right,bought,time,ear,guys,left,think,thanks,don,today,ago,really,post,buy,amazon,does,good,used,way,did,people


In [119]:
df_2018

Unnamed: 0,pca_top30,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,0.267456,sony,noise,sennheiser,bose,hd,sound,cancelling,noise cancelling,deleted,wireless,ear,quality,anc,better,best,good,sennheiser hd,mdr,sound quality,wh,price,looking,sony wh,removed,vs,amp,thanks,sony mdr,comfort,qc35
5,0.262613,removed,purchase,advice,purchase advice,thread,kz,questions,post,comments,google,view,gt,subreddit,question,sites,message,thank,amp,rule,asking,help,sites google,google view,assistants,support,bot,dacs,2018,compose,message compose
6,0.215852,wireless,bluetooth,looking,good,pair,use,earbuds,quality,like,mic,don,know,headset,just,ear,need,sound,ve,help,thanks,want,new,phone,really,work,gaming,audio,using,buy,music
1,0.212185,amp,dac,audio,sound,amp x200b,x200b,need,dac amp,gaming,just,use,good,better,amazon,sennheiser,power,schiit,like,fiio,pc,using,dp,amp dac,hd,usb,pro,looking,magni,buy,setup
4,0.200938,music,like,looking,sound,good,bass,budget,open,pair,want,listen,ath,use,audio,listening,closed,gaming,preferred,don,used,really,experience,better,ve,amp,pa,gear,technica,audio technica,rock
3,0.156277,sound,like,bass,just,really,good,ve,don,better,think,music,bit,sounds,treble,people,listening,great,say,hear,high,eq,way,different,lot,little,feel,time,end,pretty,soundstage
2,0.144014,just,cable,pads,ear,like,ve,got,right,audio,sound,new,don,use,tips,head,know,cables,left,bought,thanks,ears,issue,tried,problem,way,sure,pair,replacement,fit,time


In [120]:
df_2019

Unnamed: 0,pca_top30,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
2,0.279814,hd,removed,thanks,sennheiser,gt,purchase,sennheiser hd,jabra,advice,massdrop,rule,looking,elite,post,message,drop,65t,jabra elite,amp,thank,following,google,moderators,asked,view,posts,quick,read,following rule,rule gt
4,0.239717,amp,amp x200b,x200b,bass,like,sound,music,pads,gear,preferred,experience,amazon,listening,used,balance,ear,dp,source,past,rock,listen,tonal,ref,tonal balance,genre,bit,budget,good,isolation,really
1,0.231992,amp,dac,audio,dac amp,need,sound,usb,use,power,pc,just,volume,fiio,using,post,atom,output,schiit,amp dac,setup,work,good,quality,want,jds,like,does,magni,pro,combo
3,0.230295,gaming,looking,music,headset,good,budget,mic,use,open,like,want,need,new,pair,audio,thanks,help,listen,best,don,buy,better,know,games,using,sennheiser,pa,pc,currently,100
5,0.224656,wireless,sound,good,quality,ear,looking,pair,earbuds,sony,noise,bluetooth,like,use,don,ve,sound quality,want,really,bose,need,best,better,just,wired,price,know,new,cancelling,anc,buy
6,0.166762,sound,like,bass,good,just,better,ve,really,don,dt,pair,great,think,bit,price,amp,soundstage,open,heard,pro,music,want,closed,treble,beyerdynamic,sounds,say,people,end,love
0,0.133721,just,like,cable,don,ve,sound,know,right,got,use,really,audio,iems,think,way,good,time,people,does,tips,ear,make,left,new,iem,problem,issue,tried,try,volume
