In [1]:
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

import nltk
from nltk.cluster import KMeansClusterer
nltk.download('wordnet')

import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
 
from sklearn.model_selection import train_test_split 
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package wordnet to /home/teresa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Daten importieren und vorbereiten

In [2]:
happy = pd.read_csv('happy_preprocessed_onlygroundtruth.csv')
happy.head(1)

Unnamed: 0.1,Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category,...,stemmed,preprocessed,age,country,gender,marital,parenthood,age_range,relation_age,topic
0,6,40281,2053,24h,I played a game for about half an hour.,I played a game for about half an hour.,True,1,leisure,leisure,...,I play a game for about half an hour .,"['i', 'play', 'a', 'game', 'for', 'about', 'ha...",35,USA,m,single,n,30-39,single_30-39,0


In [38]:
import string
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]
    # remove commonly occurring words
    LIMIT_WORDS = ['happy', 'day', 'got', 'went', 'today', 'made', 'one', 'two', 'time', 'last', 'first', 'going', 'getting', 'took', 'found', 'lot', 'really', 'saw', 'see', 'month', 'week', 'day', 'yesterday', 'year', 'ago', 'now', 'still', 'since', 'something', 'great', 'good', 'long', 'thing', 'toi', 'without', 'yesteri', '2s', 'toand', 'ing']
    nopunc = [char for char in mess if char not in LIMIT_WORDS]
        
    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)   
    # Now just remove any stopwords

    
    return [word.lower() for word in nopunc.split()]
# Apply to entire happy dataset, column cleaned_hm
happy['preprocessed'] = happy['stemmed'].apply(text_process)
happy.head(1)

Unnamed: 0.1,Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category,length,stemmed,preprocessed,age,country,gender,marital,parenthood,age_range,topic
0,6,40281,2053,24h,I played a game for about half an hour.,I played a game for about half an hour.,True,1,leisure,leisure,9,I play a game for about half an hour .,"[i, play, a, game, for, about, half, an, hour, .]",35,USA,m,single,n,30-39,0


In [39]:
# happy_datasplit['stemmed_split'] = ''
# for i, row in happy_datasplit.iterrows():
#     happy_datasplit.at[i, 'stemmed_split'] = row.stemmed.split()
# happy_datasplit

In [40]:
cols = ['ground_truth_category', 'preprocessed']
happy_datasplit = happy[cols]
# for i, row in happy_datasplit.iterrows():
#     print(' '.join(map(str, list(row.preprocessed))))
#     row.preprocessed = ' '.join(row.preprocessed)
happy_datasplit

Unnamed: 0,ground_truth_category,preprocessed
0,leisure,"[i, play, a, game, for, about, half, an, hour, .]"
1,affection,"[when, my, famili, plan, a, abroad, tour, with..."
2,affection,"[when, my, hous, readi, to, live, with, my, fa..."
3,bonding,"[when, my, friend, meet, me, today, with, expe..."
4,affection,"[i, wa, veri, happi, when, my, son, play, with..."
...,...,...
14078,achievement,"[my, tooth, stop, ach, after, my, dentist, vis..."
14079,affection,"[i, took, a, bath, with, my, husband, .]"
14080,achievement,"[i, got, on, the, scale, in, the, morn, and, i..."
14081,affection,"[quit, dinner, with, my, wife, .]"


In [43]:
train_dict = happy_datasplit.groupby('ground_truth_category')['preprocessed'].apply(list).to_dict()
# test_dict = test.groupby('Genre1')['stemmed_split'].apply(list).to_dict()

In [44]:
train_tagged = {} # Contains clean tagged training data organized by category. To be used for the training corpus.
# test_clean = {} # Contains clean un-tagged training data organized by category.
# test_tagged = {}

offset = 0 # Used for managing IDs of tagged documents
for k, v in train_dict.items():
    train_tagged[k] = [gensim.models.doc2vec.TaggedDocument(doc, [i+offset]) for i, doc in enumerate(v)]
    offset += len(v)
    
# Eventually contains final versions of the training data to actually train the model
train_corpus = [taggeddoc for taggeddoc_list in list(train_tagged.values()) for taggeddoc in taggeddoc_list]
# test_corpus = [taggeddoc for taggeddoc_list in list(test_tagged.values()) for taggeddoc in taggeddoc_list]

### Modell vorbereiten und trainieren

https://tmthyjames.github.io/2018/january/Analyzing-Rap-Lyrics-Using-Word-Vectors/

#### Parameter-Finetuning

In [45]:
model = Doc2Vec(size=100,      # Songlängen zwischen 1000 und 5000 Wörtern
                min_count=2,
                window=5,
                epochs=40,
                dm=1)

model.build_vocab(train_corpus)
model.train(train_corpus,
            total_examples=model.corpus_count,
                epochs=model.epochs)

model.save("d2v.model")
print("Model Saved")



Model Saved


In [46]:
# Modell mit Daten ohne Rock und Pop

model=Doc2Vec.load("d2v.model")
print(len(model.docvecs))
# model.wv.vocab

14083


In [47]:
model.wv.vocab

{'when': <gensim.models.keyedvectors.Vocab at 0x7f46f0035050>,
 'i': <gensim.models.keyedvectors.Vocab at 0x7f46f0035f10>,
 'shift': <gensim.models.keyedvectors.Vocab at 0x7f46f0035610>,
 'my': <gensim.models.keyedvectors.Vocab at 0x7f46f0035790>,
 'new': <gensim.models.keyedvectors.Vocab at 0x7f46f0035e90>,
 'home': <gensim.models.keyedvectors.Vocab at 0x7f46f0035c50>,
 'bought': <gensim.models.keyedvectors.Vocab at 0x7f46f0035150>,
 'wed': <gensim.models.keyedvectors.Vocab at 0x7f46f0035110>,
 'shoe': <gensim.models.keyedvectors.Vocab at 0x7f46fc849390>,
 '!': <gensim.models.keyedvectors.Vocab at 0x7f46f0053690>,
 'watch': <gensim.models.keyedvectors.Vocab at 0x7f46f0053ad0>,
 'arriv': <gensim.models.keyedvectors.Vocab at 0x7f46f0053d50>,
 'in': <gensim.models.keyedvectors.Vocab at 0x7f46f0053990>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f46f00535d0>,
 'mail': <gensim.models.keyedvectors.Vocab at 0x7f46f0053510>,
 'start': <gensim.models.keyedvectors.Vocab at 0x7f46f0053cd0>,

### Modell evaluieren

#### "scharfes Hinsehen"

In [48]:
model.wv.most_similar('love')

[('like', 0.5175316333770752),
 ('childhood', 0.4417549669742584),
 ('cute', 0.40941429138183594),
 ('miss', 0.4085070490837097),
 ('excit', 0.4067125916481018),
 ('know', 0.4039749503135681),
 ('terrif', 0.397244393825531),
 ('sweet', 0.3965001404285431),
 ('young', 0.39443838596343994),
 ('never', 0.38973623514175415)]

In [50]:
model.wv.most_similar('brother')

[('cousin', 0.7185249924659729),
 ('sister', 0.605430006980896),
 ('father', 0.6039519309997559),
 ('dad', 0.5944823622703552),
 ('nephew', 0.5799951553344727),
 ('mom', 0.5754165649414062),
 ('son', 0.5563855171203613),
 ('girlfriend', 0.5534058809280396),
 ('friend', 0.5523203015327454),
 ('mother', 0.5451760292053223)]

In [52]:
model.wv.most_similar('famili')

[('girlfriend', 0.5021774172782898),
 ('weekend', 0.49192479252815247),
 ('time', 0.4840264320373535),
 ('wife', 0.47207337617874146),
 ('rel', 0.4620027542114258),
 ('parti', 0.46109408140182495),
 ('husband', 0.4572964608669281),
 ('cousin', 0.449217826128006),
 ('sister', 0.433441698551178),
 ('partner', 0.4314582645893097)]

In [60]:
# Compute ranks of similar documents
ranks = []
second_ranks = []
print(len(model.docvecs))
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=(len(model.docvecs)))

    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

14083


In [61]:
counter = Counter(ranks)
print(counter)

Counter({0: 11987, 1: 437, 2: 220, 3: 120, 4: 87, 5: 74, 6: 56, 7: 51, 8: 48, 10: 35, 11: 25, 9: 25, 14: 22, 18: 19, 13: 17, 20: 17, 12: 17, 17: 17, 24: 15, 16: 14, 27: 14, 31: 14, 19: 13, 15: 12, 21: 11, 26: 11, 55: 10, 28: 10, 39: 10, 29: 10, 30: 9, 25: 9, 44: 9, 32: 9, 56: 9, 33: 9, 36: 9, 34: 9, 23: 8, 54: 8, 22: 8, 41: 8, 37: 7, 61: 7, 50: 7, 42: 6, 53: 6, 43: 6, 67: 6, 52: 6, 35: 6, 84: 6, 64: 5, 70: 5, 99: 5, 71: 5, 62: 5, 46: 5, 45: 5, 98: 5, 168: 4, 60: 4, 58: 4, 94: 4, 109: 4, 195: 4, 144: 4, 130: 4, 139: 4, 119: 4, 48: 4, 77: 4, 75: 4, 88: 4, 117: 4, 76: 3, 90: 3, 59: 3, 66: 3, 105: 3, 151: 3, 57: 3, 163: 3, 91: 3, 65: 3, 72: 3, 165: 3, 145: 3, 40: 3, 38: 3, 92: 3, 69: 3, 89: 3, 102: 3, 184: 3, 82: 3, 125: 3, 150: 3, 170: 3, 95: 3, 78: 3, 253: 3, 51: 3, 103: 3, 344: 3, 296: 3, 187: 3, 68: 3, 133: 3, 80: 3, 79: 3, 113: 3, 106: 3, 202: 2, 124: 2, 300: 2, 222: 2, 204: 2, 179: 2, 406: 2, 192: 2, 47: 2, 104: 2, 116: 2, 85: 2, 132: 2, 211: 2, 256: 2, 122: 2, 73: 2, 352: 2, 161: 2,

In [62]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))


Document (14082): «last night my roommat came home from work and told me he 's up for go to the mountain for the weekend to hike and explor . i live for be outdoor with natur . i 'll be excit until it 's time to come home .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w5,mc2,s0.001,t3):

MOST (14082, 0.8840101957321167): «last night my roommat came home from work and told me he 's up for go to the mountain for the weekend to hike and explor . i live for be outdoor with natur . i 'll be excit until it 's time to come home .»

SECOND-MOST (11119, 0.6069707274436951): «i think it 's time to be happi again .»

MEDIAN (7445, 0.10064682364463806): «i met a girl and end up go on a date with her to the mall in town and after we went to eat at a nice restaur .»

LEAST (3598, -0.3655965328216553): «i sign a contract with my first photographi client that wa not a rel or friend .»



In [67]:
import random
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)
inferred_vector = model.infer_vector(train_corpus[doc_id].words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))



Test Document (4829): «my sister had success gall bladder surgeri with no issu .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w5,mc2,s0.001,t3):

MOST (4829, 0.8424397706985474): «my sister had success gall bladder surgeri with no issu .»

MEDIAN (12947, 0.4293534457683563): «there wa a new episod of my current favorit show .»

LEAST (3217, -0.2881014943122864): «i fix my older laptop and made it function again .»



#### Berechnung der Ähnlichkeit von Dokumenten desselben Genres

In [68]:
def get_doc_index(sims, train_tagged):
    result = []
    for sim in sims:
        for k,v in train_tagged.items():
            for i in range(len(v)):
                if v[i][1][0] == sim[0]:
                    result.append((k, v[i][0],sim[1]))
    return result

In [70]:
def assign_genre_combination(df):
    for i, row in df.iterrows():
        genre = df.iloc[0,7]
            # print((df.iloc[i,6]))
        df.at[i, 'genre_comb'] = str(genre)+'_'+str(df.iloc[i,7])
    return df

In [3]:
def most_similar_doc(songs, sims, train_tagged):
    match = []
    similarity = []
    for i, row in happy.iterrows():
        for res in get_doc_index(sims, train_tagged):
            if res[1]==row.stemmed:
                match.append(row)
                similarity.append(res[2])
    match_df = pd.DataFrame(match)
    match_df['similarity'] = similarity
    match_df.sort_values(by=['similarity'], inplace=True, ascending=False)
    match_df = match_df.reset_index()
    # print(match_df)
    match_df = assign_genre_combination(match_df)

    return match_df

# corpus = train_corpus + test_corpus

# match_df = most_similar_doc(happy, sims, train_tagged)
# match_df