In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

import os
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction import text 
from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis

# import lda


  from collections import Iterable
  from collections import Mapping


In [2]:
# max rows and columns
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
# display all contents in dataframe
pd.option_context("display.max_colwidth", -1, "display.max_rows", None)

<pandas._config.config.option_context at 0x24adb164c88>

In [3]:
df = pd.DataFrame()
for file in os.listdir("../data/"):
    df = pd.concat([df, pd.read_csv("../data/"+file)], axis=0)

In [4]:
df.isnull().sum()

subreddit        0
created_utc      0
body           754
score            0
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df['year'] = [datetime.fromtimestamp(utc).year for utc in df['created_utc']]
df['month'] = [datetime.fromtimestamp(utc).month for utc in df['created_utc']]

In [7]:
df['year_month'] = df['year'].map(str) + '-' + df['month'].map(str)
df['year_month'] = pd.to_datetime(df['year_month'], format='%Y-%m').dt.strftime('%Y-%m')

In [8]:
df.drop(columns=['created_utc'], axis=1, inplace=True)

In [9]:
df['subreddit'].unique()

array(['HeadphoneAdvice', 'headphones'], dtype=object)

In [10]:
df[df['subreddit']!='HeadphoneAdvice']['year'].unique()

array([2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010],
      dtype=int64)

In [11]:
df.head()

Unnamed: 0,subreddit,body,score,year,month,year_month
0,HeadphoneAdvice,[PA] Upgrade from HyperX cloud 2's.I have been...,1,2020,2,2020-02
1,HeadphoneAdvice,"Neutral, USB stick style DAC, $100-200 range. ...",1,2020,2,2020-02
2,HeadphoneAdvice,Sennheiser HD 4.5 BTNC vs JBL Live 650BTNCLook...,1,2020,2,2020-02
3,HeadphoneAdvice,Best General Use IEMS under 200?I've been look...,1,2020,2,2020-02
4,HeadphoneAdvice,What gaming headphones should I buy?I have use...,1,2020,2,2020-02


In [12]:
stopwords_final = text.ENGLISH_STOP_WORDS.union({'https', 'http', 'www', 'com', 'headphones', 'headphone', 'earphones', 'earphone'})

In [13]:
cvec = CountVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9, max_features=20000, stop_words=stopwords_final)
X = cvec.fit_transform(df['body'])

In [45]:
# Create LDA visualization and return trained LDA model
def get_LDA(n_topics, training_data, vectorized, file_name):
    # Create and fit the LDA model
    model = LDA(n_components=n_topics, n_jobs=4)
    model.fit(training_data)
    
    # Create LDA visualization
    LDAvis_prepared = sklearn_lda.prepare(model, training_data, vectorized)
    pyLDAvis.save_html(LDAvis_prepared, '../lda_visualization/ldavis_prepared_'+ file_name + '_' + str(n_topics) +'.html')
    
    return model

In [46]:
n_topics = 7

X_b_2017 = cvec.transform(df[df['year']<=2017]['body'])
X_2018 = cvec.transform(df[df['year']==2018]['body'])
X_a_2019 = cvec.transform(df[df['year']>=2019]['body'])

lda_b_2017 = get_LDA(n_topics, X_b_2017, cvec, 'b_2017')
lda_2018 = get_LDA(n_topics, X_2018, cvec, '2018')
lda_a_2019 = get_LDA(n_topics, X_a_2019, cvec, 'a_2019')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [47]:
def get_top_words(n_words, model, vectorized):
    df_top_words = pd.DataFrame()
    for i, topic in enumerate(model.components_):
        words = np.array(vectorized.get_feature_names())[np.argsort(topic)][:-(n_words+1):-1]
        df_top_words = pd.concat([df_top_words, pd.DataFrame([word for word in words]).T], axis = 0)
    return df_top_words

In [48]:
n_words = 30
df_b_2017 = get_top_words(n_words, lda_b_2017, cvec)
df_2018 = get_top_words(n_words, lda_2018, cvec)
df_a_2019 = get_top_words(n_words, lda_a_2019, cvec)

In [49]:
df_b_2017

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,guide,head fi,purchase,fi,head,thread,org,reddit,fi org,comments,looking,buying,buying guide,gaming,google,spreadsheet,amp,format,sure,advice,read,purchase advice,posting,make,reddit comments,docs,docs google,alternatives,make sure,check
0,looking,music,good,pair,like,sound,ear,quality,listen,need,gaming,removed,want,help,really,best,bass,use,budget,don,ve,noise,isolation,100,wireless,preferred,ath,headset,price,thanks
0,amp,amazon,removed,sennheiser,deleted,new,dp,sony,price,audio,buy,good,ref,ear,ath,mdr,akg,pair,technica,utf8,product,audio technica,utf8 amp,sale,best,massdrop,sr,looking,bluetooth,99
0,amp,sound,just,dac,hd,sennheiser,ve,pair,pro,better,like,good,know,removed,new,dt,fiio,difference,quality,audio,beyerdynamic,music,need,got,thanks,really,question,598,getting,guys
0,sound,bass,like,just,really,good,bit,better,ve,treble,listening,feel,mids,sounds,soundstage,don,think,little,music,great,people,pads,say,review,time,quality,end,high,head,pretty
0,audio,amp,dac,removed,usb,use,headset,mic,pc,volume,schiit,using,sound,magni,need,output,work,modi,setup,5mm,gt,help,does,jack,fiio,speakers,computer,question,power,plug
0,just,cable,ear,ve,like,pair,sound,know,got,right,don,really,new,left,use,problem,bought,help,time,ears,way,tips,thanks,tried,cables,imgur,work,does,iems,make


In [50]:
df_2018

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,removed,cable,ath,audio,technica,audio technica,5mm,cables,m50x,thread,technica ath,reddit,purchase,m40x,help,pro,question,jack,need,use,amp,adapter,bose,comments,replacement,questions,ath m50x,vs,sennheiser,ii
0,amp,music,amp x200b,x200b,like,preferred,budget,amazon,listen,looking,gear,listening,experience,used,bass,rock,use,dp,source,pa,balance,past,tonal,ear,home,tonal balance,ref,sound,using,want
0,gaming,looking,good,headset,sound,pair,music,mic,need,sennheiser,thanks,use,open,want,like,advice,help,new,quality,budget,better,best,know,ve,audio,buy,really,don,purchase,price
0,ear,sound,good,wireless,noise,like,quality,earbuds,ve,don,pair,just,bluetooth,looking,really,use,iems,want,ears,best,bose,better,sound quality,cancelling,iem,great,price,know,noise cancelling,tips
0,sound,like,bass,just,really,good,ve,better,pads,don,bit,think,treble,people,sounds,great,soundstage,music,little,say,different,pretty,feel,listening,way,time,know,mids,heard,lot
0,just,sony,audio,use,sound,thanks,like,know,usb,cable,new,right,got,using,bluetooth,ve,work,phone,sennheiser,bought,does,volume,amazon,way,ll,problem,help,pair,issue,sure
0,amp,dac,dac amp,dt,just,need,power,fiio,hd,schiit,deleted,massdrop,audio,amp dac,ohm,good,magni,impedance,use,pro,better,like,amps,drive,volume,output,setup,don,using,sound


In [51]:
df_a_2019

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,wireless,looking,ear,good,sound,like,earbuds,quality,noise,pair,use,music,bluetooth,don,want,budget,ve,need,really,best,sound quality,just,wired,listen,phone,pa,sony,100,bose,work
0,dt,pro,removed,looking,closed,music,beyerdynamic,open,bass,listen,akg,770,ohm,advice,gt,990,like,purchase,good,dt770,dt 770,rock,250,rule,dt 990,ath,amp,beyerdynamic dt,80,headphoneadvice
0,just,like,don,ve,really,good,sound,know,think,people,got,pair,ll,better,new,time,price,buy,thanks,want,make,try,say,look,pretty,sure,great,going,lot,way
0,amp,amp x200b,x200b,dac,amazon,like,dac amp,music,budget,looking,experience,hd,dp,gear,sennheiser,amp dac,schiit,listening,ref,atom,good,used,preferred,need,jds,pair,upgrade,better,setup,combo
0,dac,amp,audio,use,volume,usb,sound,need,power,just,phone,fiio,cable,using,post,5mm,does,work,output,pc,gt,jack,quality,like,impedance,bluetooth,help,way,adapter,want
0,gaming,headset,mic,sound,good,sennheiser,looking,audio,better,sony,hd,music,quality,use,ve,need,new,pair,know,help,pc,games,thanks,want,ath,like,buy,open,cloud,wh
0,sound,bass,like,iems,ear,just,bit,iem,treble,ve,better,pads,cable,good,really,ears,tips,right,eq,mids,left,soundstage,kz,sounds,little,great,don,tin,t2,fit


In [None]:
n_topics = 7
lad_model = get_LDA(n_topics, X, cvec)

In [None]:
top_words = 30
for i, topic in enumerate(lda_model.components_):
    words = np.array(cvec.get_feature_names())[np.argsort(topic)][:-(top_words+1):-1]
    print('Topic {}: {}'.format(i, '_'.join(words)))