In [4]:
import spacy
import gensim
from gensim import corpora
# NOTE(review): datapath is not referenced in any of the visible cells — confirm before removing.
from gensim.test.utils import datapath
# Shared spaCy pipeline used by the cells below. The "parser" and "ner"
# components are disabled, presumably because the notebook only reads
# token-level attributes (lemma, stop-word flag, punctuation flag).
nlp = spacy.load( "en_core_web_sm", disable=["parser", "ner"] )
import pandas

In [2]:
# Read the tab-separated paper list (the first row holds the column names),
# then derive TEXT as "<TITLE>. <ABSTRACT>" for every row.
data = pandas.read_csv('CHI.tsv', sep='\t', header=0)
data = data.assign(TEXT=data['TITLE'] + '. ' + data['ABSTRACT'])
data

Unnamed: 0,YEAR,TITLE,ABSTRACT,TEXT
0,2000,Intelligent gaze-added interfaces,"We discuss a novel type of interface, the inte...",Intelligent gaze-added interfaces. We discuss ...
1,2000,Evaluation of eye gaze interaction,Eye gaze interaction can provide a convenient ...,Evaluation of eye gaze interaction. Eye gaze i...
2,2000,Enriching buyers' experiences: the SmartClient...,"In electronic commerce, a satisfying buyer exp...",Enriching buyers' experiences: the SmartClient...
3,2000,Quality is in the eye of the beholder: meeting...,Growing usage and diversity of applications on...,Quality is in the eye of the beholder: meeting...
4,2000,What makes Internet users visit cyber stores a...,Retaining customer loyalty is crucial in elect...,What makes Internet users visit cyber stores a...
...,...,...,...,...
4061,1999,Mutual disambiguation of recognition errors in...,As a new generation of multimodal/media system...,Mutual disambiguation of recognition errors in...
4062,1999,Model-based and empirical evaluation of multim...,Our research addresses the problem of error co...,Model-based and empirical evaluation of multim...
4063,1999,Cooperative inquiry: developing new technologi...,"In todays homes and schools, children are emer...",Cooperative inquiry: developing new technologi...
4064,1999,Projected realities: conceptual design for cul...,As a part of a European Union sponsored projec...,Projected realities: conceptual design for cul...


In [3]:
def text2words(rawtext, nlp):
    """Turn a raw string into a list of normalised word tokens.

    ``rawtext`` is passed to the callable ``nlp``. Every resulting token that is
    flagged neither as a stop word nor as punctuation contributes its
    lower-cased lemma (lemmatisation, not stemming), in document order.
    """
    words = []
    for token in nlp(rawtext):
        # Drop stop words and punctuation; keep everything else.
        if token.is_stop or token.is_punct:
            continue
        words.append(token.lemma_.lower())
    return words

In [5]:
# Restore the saved vocabulary (the file whose name ends in .id2word) ...
voc = corpora.Dictionary.load("HCI_topics.id2word")
# ... and the saved LDA topic model stored under the same base name.
lda = gensim.models.ldamulticore.LdaMulticore.load("HCI_topics")
lda.num_topics

20

In [6]:
def label_topic(lda, topicid, numwords):
    """Build a readable one-line label for a single topic.

    The label is the topic id, a colon and a space, followed by the first
    element of each entry returned by ``lda.show_topic(topicid, numwords)``,
    joined by single spaces (for example ``"3: voice usability user"``).
    """
    words = " ".join(entry[0] for entry in lda.show_topic(topicid, numwords))
    return "{}: {}".format(topicid, words)

In [7]:
# Ten-word label for every topic id of the model, keyed by topic id.
topic_labels = dict(
    (topic_id, label_topic(lda, topic_id, 10))
    for topic_id in range(lda.num_topics)
)
topic_labels

{0: '0: user direct technique manipulation combination control point drawing edit alternative',
 1: '1: model eye pointing task device movement law input gaze fitts',
 2: '2: system user computer language human error use behavior problem usability',
 3: '3: voice usability user whisper labeling trace visualization technique annotation problem',
 4: '4: object physical interaction user computer interface interactive application tool system',
 5: '5: virtual focus space 3d interface environment design user navigation view',
 6: '6: display technique study screen target interaction image present provide visual',
 7: '7: web user search information site page tool task datum system',
 8: '8: device touch gesture input interaction hand user sensor finger technique',
 9: '9: child interactive computer support study blind design work base present',
 10: '10: game video play online experience player adult group motion content',
 11: '11: design process note child new technology group support wo

In [8]:
def yearly_topic_distribution(data, lda, voc, year):
    """Average the per-document topic weights over the papers of one year.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``YEAR`` column and a ``TEXT`` column.
    lda : topic model
        Must expose ``num_topics`` and ``get_document_topics(bow)``, the latter
        returning ``(topic_id, weight)`` pairs.
    voc : dictionary
        Must expose ``doc2bow(list_of_tokens)``.
    year : value compared for equality against the ``YEAR`` column.

    Returns
    -------
    dict
        Maps every topic id in ``range(lda.num_topics)`` to the mean weight of
        that topic over the rows whose ``YEAR`` equals ``year``. A topic that
        the model does not report for a document counts as 0 for that
        document, so the values need not sum to 1. If no row matches ``year``
        every topic maps to 0.0 (previously this raised ZeroDivisionError).

    Notes
    -----
    Uses the notebook-level ``nlp`` pipeline and ``text2words`` for
    tokenisation. Only the rows of the requested year are tokenised, and the
    model is queried once per document rather than once per (topic, document)
    pair; the averages are computed from the same quantities as before.
    NOTE(review): repeated runs in this notebook printed slightly different
    numbers for the same year, so the model's inference looks
    non-deterministic — confirm before comparing values digit by digit.
    """
    topic_totals = {topic_id: 0.0 for topic_id in range(lda.num_topics)}

    year_texts = data.loc[data['YEAR'] == year, 'TEXT']
    num_docs = len(year_texts)
    if num_docs == 0:
        # Nothing to average; avoid dividing by zero.
        return topic_totals

    for rawtext in year_texts:
        bow = voc.doc2bow(text2words(rawtext, nlp))
        # One inference call per document; accumulate every reported topic.
        for topic_id, weight in lda.get_document_topics(bow):
            topic_totals[topic_id] += float(weight)

    return {topic_id: total / num_docs for topic_id, total in topic_totals.items()}

In [9]:
# Mean topic weights over the papers whose YEAR is 1981.
topic_distribution_1981 = yearly_topic_distribution(data=data, lda=lda, voc=voc, year=1981)
topic_distribution_1981

{0: 0.029011486423394038,
 1: 0.010491815883488883,
 2: 0.17919602836408313,
 3: 0.016036028278962014,
 4: 0.0227440336039142,
 5: 0.007873928292639672,
 6: 0.05180557453561397,
 7: 0.07625057019056782,
 8: 0.004128195271487274,
 9: 0.028900657558724993,
 10: 0.017510275223425457,
 11: 0.03426610999223259,
 12: 0.019470258098509576,
 13: 0.042369348854417836,
 14: 0.08665443658237419,
 15: 0.021665934517624833,
 16: 0.008840651901823187,
 17: 0.2367381447305282,
 18: 0.06172846872655172,
 19: 0.03645298510257687}

In [10]:
def overall_topic_distribution(data, lda, voc):
    """Average the per-document topic weights over every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``TEXT`` column.
    lda : topic model
        Must expose ``num_topics`` and ``get_document_topics(bow)``, the latter
        returning ``(topic_id, weight)`` pairs.
    voc : dictionary
        Must expose ``doc2bow(list_of_tokens)``.

    Returns
    -------
    dict
        Maps every topic id in ``range(lda.num_topics)`` to the mean weight of
        that topic over all rows. A topic that the model does not report for a
        document counts as 0 for that document, so the values need not sum to
        1. For an empty ``data`` every topic maps to 0.0 (previously this
        raised ZeroDivisionError).

    Notes
    -----
    Uses the notebook-level ``nlp`` pipeline and ``text2words`` for
    tokenisation. The model is queried once per document rather than once per
    (topic, document) pair; the averages are computed from the same
    quantities as before.
    """
    topic_totals = {topic_id: 0.0 for topic_id in range(lda.num_topics)}

    num_docs = len(data)
    if num_docs == 0:
        # Nothing to average; avoid dividing by zero.
        return topic_totals

    for rawtext in data['TEXT']:
        bow = voc.doc2bow(text2words(rawtext, nlp))
        # One inference call per document; accumulate every reported topic.
        for topic_id, weight in lda.get_document_topics(bow):
            topic_totals[topic_id] += float(weight)

    return {topic_id: total / num_docs for topic_id, total in topic_totals.items()}
# Mean topic weights over the whole collection.
overall_topics = overall_topic_distribution(data=data, lda=lda, voc=voc)
overall_topics

{0: 0.017821197067943998,
 1: 0.026913130198698353,
 2: 0.08070328262044997,
 3: 0.019322453295987006,
 4: 0.027010779502809387,
 5: 0.02116953704824082,
 6: 0.07979075324369353,
 7: 0.050218198942025116,
 8: 0.02881852178607618,
 9: 0.021826257970893232,
 10: 0.03025083334574112,
 11: 0.03937601166939023,
 12: 0.01490110794377302,
 13: 0.03559428495155676,
 14: 0.10884855093894388,
 15: 0.05872894534099054,
 16: 0.033792859956895155,
 17: 0.2348402797439641,
 18: 0.021771096754764182,
 19: 0.03973472668925384}

In [11]:
# One column per period: 'Total' first, then every year in ascending order.
# Each column lists the topic weights in the order the distribution dict yields them.
yearly_topics = {'Total': list(overall_topics.values())}
yearly_topics.update(
    (str(year), list(yearly_topic_distribution(data, lda, voc, year).values()))
    for year in sorted(data['YEAR'].unique())
)
# Rows are topics; replace the numeric row index with the readable topic labels.
results = pandas.DataFrame(data=yearly_topics).rename(index=topic_labels)
results

Unnamed: 0,Total,1981,1982,1983,1985,1986,1987,1988,1989,1990,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
0: user direct technique manipulation combination control point drawing edit alternative,0.017821,0.028995,0.029791,0.029347,0.032474,0.032253,0.038121,0.0377,0.029334,0.024742,...,0.01409,0.019578,0.018024,0.016841,0.014388,0.012542,0.014453,0.013217,0.012808,0.012872
1: model eye pointing task device movement law input gaze fitts,0.026913,0.011361,0.015779,0.01646,0.039749,0.01644,0.040509,0.021396,0.023806,0.050912,...,0.038985,0.024488,0.036701,0.030667,0.022928,0.02607,0.024838,0.019683,0.022925,0.022395
2: system user computer language human error use behavior problem usability,0.080703,0.179329,0.251011,0.247499,0.217451,0.220887,0.129566,0.180947,0.161653,0.183297,...,0.069058,0.066008,0.065969,0.061561,0.050566,0.061655,0.052192,0.055951,0.049292,0.047576
3: voice usability user whisper labeling trace visualization technique annotation problem,0.019322,0.016036,0.012657,0.01766,0.01482,0.026191,0.017251,0.019865,0.025158,0.020737,...,0.028931,0.029943,0.018693,0.016106,0.02015,0.017571,0.015735,0.017681,0.015308,0.019451
4: object physical interaction user computer interface interactive application tool system,0.027011,0.023605,0.020385,0.029273,0.022223,0.025689,0.018467,0.031686,0.03564,0.023491,...,0.025704,0.031144,0.028772,0.021595,0.025161,0.018795,0.021981,0.024626,0.02316,0.02359
5: virtual focus space 3d interface environment design user navigation view,0.02117,0.007874,0.006013,0.012197,0.008508,0.008391,0.013362,0.024508,0.024881,0.010515,...,0.016789,0.019289,0.017568,0.020468,0.019542,0.02318,0.019469,0.02257,0.018278,0.020039
6: display technique study screen target interaction image present provide visual,0.079791,0.051794,0.03818,0.073331,0.069458,0.059668,0.094265,0.074544,0.055352,0.053479,...,0.097069,0.090238,0.088269,0.095269,0.084952,0.068186,0.08323,0.084911,0.082791,0.083692
7: web user search information site page tool task datum system,0.050218,0.07625,0.037414,0.042798,0.03462,0.044306,0.042613,0.062445,0.039935,0.020804,...,0.039305,0.062929,0.053236,0.06844,0.06176,0.050122,0.045481,0.049391,0.038121,0.041882
8: device touch gesture input interaction hand user sensor finger technique,0.028819,0.004128,0.006134,0.00382,0.017896,0.016126,0.01157,0.008365,0.016688,0.013184,...,0.020446,0.02387,0.02899,0.026927,0.023162,0.026206,0.039139,0.038917,0.043147,0.043931
9: child interactive computer support study blind design work base present,0.021826,0.028779,0.033907,0.026549,0.016021,0.036679,0.02851,0.029946,0.024908,0.026574,...,0.023817,0.013377,0.017394,0.021802,0.018348,0.023178,0.021341,0.023892,0.017207,0.021011


In [12]:
# Rank the topics from most to least prevalent overall (the 'Total' column).
results.sort_values(by='Total', ascending=False)

Unnamed: 0,Total,1981,1982,1983,1985,1986,1987,1988,1989,1990,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
17: design social study technology paper research people use interaction activity,0.23484,0.236423,0.177899,0.137802,0.142281,0.135478,0.18716,0.131294,0.14979,0.197271,...,0.222582,0.199769,0.218918,0.236515,0.27127,0.270272,0.278658,0.258245,0.300682,0.273909
14: user task result study participant performance time experiment effect find,0.108849,0.08653,0.116237,0.165163,0.130173,0.122202,0.152371,0.117824,0.107342,0.07184,...,0.128787,0.116979,0.126876,0.105741,0.097439,0.116417,0.107003,0.108926,0.106009,0.111505
2: system user computer language human error use behavior problem usability,0.080703,0.179329,0.251011,0.247499,0.217451,0.220887,0.129566,0.180947,0.161653,0.183297,...,0.069058,0.066008,0.065969,0.061561,0.050566,0.061655,0.052192,0.055951,0.049292,0.047576
6: display technique study screen target interaction image present provide visual,0.079791,0.051794,0.03818,0.073331,0.069458,0.059668,0.094265,0.074544,0.055352,0.053479,...,0.097069,0.090238,0.088269,0.095269,0.084952,0.068186,0.08323,0.084911,0.082791,0.083692
15: mobile user phone feedback study base use location application photo,0.058729,0.0217,0.021576,0.011424,0.021715,0.018696,0.008166,0.013789,0.018828,0.016905,...,0.067452,0.06942,0.060991,0.069449,0.071195,0.071161,0.075524,0.074494,0.069649,0.072908
7: web user search information site page tool task datum system,0.050218,0.07625,0.037414,0.042798,0.03462,0.044306,0.042613,0.062445,0.039935,0.020804,...,0.039305,0.062929,0.053236,0.06844,0.06176,0.050122,0.045481,0.049391,0.038121,0.041882
19: interface design user application base system interaction tangible context computer,0.039735,0.036455,0.059196,0.041903,0.056792,0.089134,0.087243,0.084621,0.107802,0.105145,...,0.030298,0.040368,0.035362,0.025988,0.030166,0.024602,0.028609,0.028975,0.025763,0.025428
11: design process note child new technology group support work system,0.039376,0.034277,0.041751,0.028848,0.026891,0.026591,0.040827,0.024428,0.032386,0.049496,...,0.044665,0.033571,0.030915,0.038175,0.039495,0.042225,0.036882,0.041344,0.042759,0.03655
13: user menu application task time visual system performance design model,0.035594,0.042448,0.04507,0.038319,0.044855,0.043581,0.026339,0.069624,0.046475,0.06055,...,0.034014,0.041923,0.03836,0.043062,0.032928,0.028266,0.023867,0.027282,0.027634,0.030389
16: system communication video support collaboration speaker work people remote native,0.033793,0.008841,0.018878,0.016786,0.02529,0.010377,0.020382,0.01302,0.039804,0.022559,...,0.047489,0.042363,0.045775,0.032192,0.039874,0.033209,0.036793,0.030989,0.026764,0.030098


In [13]:
# Keep only the topics whose overall weight ('Total') is above 0.05.
results.loc[results['Total'].gt(0.05)]

Unnamed: 0,Total,1981,1982,1983,1985,1986,1987,1988,1989,1990,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
2: system user computer language human error use behavior problem usability,0.080703,0.179329,0.251011,0.247499,0.217451,0.220887,0.129566,0.180947,0.161653,0.183297,...,0.069058,0.066008,0.065969,0.061561,0.050566,0.061655,0.052192,0.055951,0.049292,0.047576
6: display technique study screen target interaction image present provide visual,0.079791,0.051794,0.03818,0.073331,0.069458,0.059668,0.094265,0.074544,0.055352,0.053479,...,0.097069,0.090238,0.088269,0.095269,0.084952,0.068186,0.08323,0.084911,0.082791,0.083692
7: web user search information site page tool task datum system,0.050218,0.07625,0.037414,0.042798,0.03462,0.044306,0.042613,0.062445,0.039935,0.020804,...,0.039305,0.062929,0.053236,0.06844,0.06176,0.050122,0.045481,0.049391,0.038121,0.041882
14: user task result study participant performance time experiment effect find,0.108849,0.08653,0.116237,0.165163,0.130173,0.122202,0.152371,0.117824,0.107342,0.07184,...,0.128787,0.116979,0.126876,0.105741,0.097439,0.116417,0.107003,0.108926,0.106009,0.111505
15: mobile user phone feedback study base use location application photo,0.058729,0.0217,0.021576,0.011424,0.021715,0.018696,0.008166,0.013789,0.018828,0.016905,...,0.067452,0.06942,0.060991,0.069449,0.071195,0.071161,0.075524,0.074494,0.069649,0.072908
17: design social study technology paper research people use interaction activity,0.23484,0.236423,0.177899,0.137802,0.142281,0.135478,0.18716,0.131294,0.14979,0.197271,...,0.222582,0.199769,0.218918,0.236515,0.27127,0.270272,0.278658,0.258245,0.300682,0.273909


In [None]:
# Line chart of the topics with 'Total' above 0.05: after dropping 'Total' and
# transposing, the years run along the x-axis and each topic is one line.
(
    results[results['Total'] > 0.05]
    .drop(columns=['Total'])
    .T
    .plot.line(figsize=(15, 10))
)