<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Laden-aller-Datensätze" data-toc-modified-id="Laden-aller-Datensätze-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Laden aller Datensätze</a></span></li><li><span><a href="#Pipeline-zur-Exploration-von-der-verschiedenen-Versionen-des-Datensatzes-zur-Gegenüberstellung" data-toc-modified-id="Pipeline-zur-Exploration-von-der-verschiedenen-Versionen-des-Datensatzes-zur-Gegenüberstellung-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pipeline zur Exploration von der verschiedenen Versionen des Datensatzes zur Gegenüberstellung</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np

import string
import six
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

#### Laden aller Datensätze

In [2]:
happy_all = pd.read_csv('happy_preprocessed_onlygroundtruth.csv')
happy_day = pd.read_csv('happy_preprocessed_24h.csv')
happy_months = pd.read_csv('happy_preprocessed_3m.csv')
age_all = pd.read_csv('happy_combined_by_age_all.csv')
age_day = pd.read_csv('happy_combined_by_age_day.csv')
age_months = pd.read_csv('happy_combined_by_age_months.csv')
wid_all = pd.read_csv('happy_combined_by_wid_all.csv')
wid_day = pd.read_csv('happy_combined_by_wid_day.csv')
wid_months = pd.read_csv('happy_combined_by_wid_months.csv')

In [232]:
print(len(happy_all[happy_all.age_range=='20-29'][happy_all.country=='IND'][happy_all.marital=='married'])/len(happy_all[happy_all.marital=='married'][happy_all.country=='IND']))
print(len(happy_all[happy_all.age_range=='30-39'][happy_all.country=='IND'][happy_all.marital=='married'])/len(happy_all[happy_all.marital=='married'][happy_all.country=='IND']))
print(len(happy_all[happy_all.age_range=='20-29'][happy_all.country=='USA'][happy_all.marital=='married'])/len(happy_all[happy_all.marital=='married'][happy_all.country=='USA']))
print(len(happy_all[happy_all.age_range=='30-39'][happy_all.country=='USA'][happy_all.marital=='married'])/len(happy_all[happy_all.marital=='married'][happy_all.country=='USA']))
print(len(happy_all[happy_all.age_range=='30-39'][happy_all.marital=='single'])/len(happy_all[happy_all.marital=='single']))

0.5002985074626866
0.39164179104477614
0.23691790686509842
0.44791166586653863
0.24723346828609988


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


#### Pipeline zur Exploration von der verschiedenen Versionen des Datensatzes zur Gegenüberstellung

In [29]:
class Visualisations:
    def __init__(self, df, ds_name, path, column):
        self.df = df
        self.ds_name = ds_name
        self.path = path
        self.column = column
     
    # statistische Übersicht
    def prepare_df(self):
        df = self.df[self.df[self.column].notnull()]
        len_count = df[self.column].apply(lambda x: len(x.split()))
        df = pd.DataFrame(len_count.describe())
        df['statistics'] = df.index
        df = df[['statistics',self.column]]
        df.rename(columns={self.column:'values'}, inplace=True)
        df = df.round({'values': 1})
        return df, len_count

    def render_mpl_table(self, data,path, col_width=3.0, row_height=0.625, font_size=14,
                         header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                         bbox=[0, 0, 1, 1], header_columns=0,
                         ax=None, **kwargs):
        if ax is None:
            size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
            fig, ax = plt.subplots(figsize=size)
            ax.axis('off')

        mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

        mpl_table.auto_set_font_size(False)
        mpl_table.set_fontsize(font_size)

        for k, cell in six.iteritems(mpl_table._cells):
            cell.set_edgecolor(edge_color)
            if k[0] == 0 or k[1] < header_columns:
                cell.set_text_props(weight='bold', color='w')
                cell.set_facecolor(header_color)
            else:
                cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
        plt.savefig(path)
        plt.clf()
        return ax
    
    # Satzlängen
    def compute_length(self, len_count):
        length_order = ["0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", \
                        "40-44", "45-49", ">=50"]
        length_category = len_count.apply(lambda x: length_order[min(10, int(x/5))])
        length_counts = pd.DataFrame(length_category.value_counts()).reset_index()
        length_counts.columns = ['number of words per moment', 'number of moments']
        
        plt.figure(figsize=(10,6))
        sns.barplot(x='number of words per moment', y='number of moments', data=length_counts, order=length_order)
        plt.savefig(self.path + self.ds_name + '_moment_len.png')
        plt.clf()
    
    def func(self,pct, allvals):
        absolute = int(pct/100.*np.sum(allvals))
        return "{:.1f}% \n{:d} moments".format(pct, absolute)
    
    # Anzahl Momente pro Kategorie/ Altersstufe
    def moments_per_category(self):
        if 'ground_truth_category' in self.df.columns:
            explode = (0.05,0.05,0.05,0.05,0.05,0.05,0.05)
            c = Counter(self.df.ground_truth_category)

            plt.figure( figsize=(12,8) )
            plt.pie(list(c.values()), labels=list(c.keys()), autopct=lambda pct: self.func(pct, list(c.values())), pctdistance=1.2,labeldistance=0.45,explode=explode)
            plt.tight_layout()
            plt.savefig(self.path+self.ds_name+'_moments_per_cat.png', bbox_inches = "tight")
            plt.clf()

    def moments_per_age(self):
#         if 'age' in self.df.columns:
#             c = Counter(self.df.age)
#             plt.pie(list(c.values()), labels=list(c.keys()), autopct=lambda pct: self.func(pct, list(c.values())),pctdistance=1.2,labeldistance=0.45)
#             plt.tight_layout()
#             plt.savefig(self.path+self.ds_name+'_moments_per_age.png', bbox_inches = "tight")
        if 'age_range' in self.df.columns:
            c = Counter(self.df.age_range)
            plt.pie(list(c.values()), labels=list(c.keys()), autopct=lambda pct: self.func(pct, list(c.values())),pctdistance=1.2,labeldistance=0.45)
            plt.tight_layout()
            plt.savefig(self.path+self.ds_name+'_moments_per_agerange.png', bbox_inches = "tight")
        plt.clf()
    
    # Länge pro Kategorie
    def length_per_category(self):
        if 'ground_truth_category' in self.df.columns:
            plt.figure(figsize=(10,6))
            sns.boxplot(x=self.df.ground_truth_category, y=self.df.length)
            plt.savefig(self.path+self.ds_name+'_length_per_cat.png')
            plt.clf()
            
     # Länge pro Altersstufe
    def length_per_age(self):
        if 'age_range' in self.df.columns:
            plt.figure(figsize=(10,6))
            sns.boxplot(x=self.df.age_range, y=self.df.length)
            plt.savefig(self.path+self.ds_name+'_length_per_age.png')
            plt.clf()
    
    # Wordclouds
    def wordclouds(self):
        if 'cleaned_hm' in self.df.columns:
            text = ' '.join(self.df['cleaned_hm'].tolist())
            text = text.lower()
            wordcloud = WordCloud(background_color="white", height=2700, width=3600).generate(text)
            plt.figure(figsize=(14,8))
            plt.imshow(wordcloud.recolor(colormap=plt.get_cmap('Set2')), interpolation='bilinear')
            plt.axis("off")
            if self.path is not None:
                plt.savefig(self.path+self.ds_name+'_wordcloud.png')
            plt.clf()
    
    def wordclouds_limited(self):
        LIMIT_WORDS = ['happy', 'day', 'got', 'went', 'today', 'made', 'one', 'two', 'time', 'last', 'first', 'going', 'getting', 'took', 'found', 'lot', 'really', 'saw', 'see', 'month', 'week', 'day', 'yesterday', 'year', 'ago', 'now', 'still', 'since', 'something', 'great', 'good', 'long', 'thing', 'toi', 'without', 'yesteri', '2s', 'toand', 'ing']
        if 'cleaned_hm' in self.df.columns:
            text = ' '.join(self.df['cleaned_hm'].tolist())
            text = text.lower()
            for w in LIMIT_WORDS:
                text = text.replace(' ' + w, '')
                text = text.replace(w + ' ', '')
            wordcloud = WordCloud(background_color="white", height=2700, width=3600).generate(text)
            plt.figure(figsize=(14,8))
            plt.imshow(wordcloud.recolor(colormap=plt.get_cmap('Set2')), interpolation='bilinear')
            plt.axis("off")
            plt.savefig(self.path+self.ds_name+'_wordcloud_limited.png')
            plt.clf()
        

    def gender(self):
        if 'gender' in self.df.columns:
            self.df.gender.value_counts().plot(kind='bar')
            plt.savefig(self.path+self.ds_name+'_gender.png')
            plt.clf()
        
    def marital(self):
        if 'hmid' in self.df.columns:
            df = pd.pivot_table(self.df, values='hmid',index='marital', aggfunc='count')
            df['marital'] = df.index
            df = df[['marital','hmid']]
            df.rename(columns={'hmid':'values'}, inplace=True)
            self.render_mpl_table(data=df, path=self.path+self.ds_name+'_marital.png')
        
    def country(self):
        if 'hmid' in self.df.columns:
            df = pd.pivot_table(self.df, values='hmid',index='country', aggfunc='count')
            df['country'] = df.index
            df = df[['country','hmid']]
            df.rename(columns={'hmid':'values'}, inplace=True)
            df.sort_values(by='values', inplace=True, ascending=False)
            df = df[:10]
            self.render_mpl_table(data=df, path=self.path+self.ds_name+'_country.png')

    def parenthood(self):
        if 'hmid' in self.df.columns:
            df = pd.pivot_table(self.df, values='hmid',index='parenthood', aggfunc='count')
            df['parenthood'] = df.index
            df = df[['parenthood','hmid']]
            df.rename(columns={'hmid':'values'}, inplace=True)
            self.render_mpl_table(data=df, path=self.path+self.ds_name+'_parenthood.png')
        
    # prediction und ground_truth nicht immer übereinstimmend
    # Number of moments per category
    def categories(self):    
        if 'hmid' in self.df.columns:
            df = pd.pivot_table(self.df, values='hmid',index='ground_truth_category', aggfunc='count')
            df['category'] = df.index
            df = df[['category','hmid']]
            df.rename(columns={'hmid':'values'}, inplace=True)
            self.render_mpl_table(df, self.path+self.ds_name+'_mom_per_cat.png')
        
    def age(self):
        if 'age' in self.df.columns:
            sns.distplot(self.df['age'], kde=False)
            plt.savefig(self.path+self.ds_name+'_dist_age.png')
        elif 'age_range' in self.df.columns:
            c=Counter(self.df['age_range'])
            sns.barplot(x=list(c.keys()), y=list(c.values()))
            plt.savefig(self.path+self.ds_name+'_dist_agerange.png')
        plt.clf()
        
    def vis_marital(self):
        if 'ground_truth_category' in self.df.columns and 'marital' in self.df.columns:
            marital = self.df[self.df!='divorced']
            marital = marital[marital!='widowed']
            marital = marital[marital!='separated']
            g = sns.FacetGrid(data=marital, col='marital', height=5, aspect=1.5,)

            g.map(plt.hist,'ground_truth_category', bins=50)
            plt.savefig(self.path+self.ds_name+'_married_vs_single.png')
            plt.clf()
        
    def all_vis(self):
        len_df, len_count = self.prepare_df()
        self.render_mpl_table(data=len_df, header_columns=0, col_width=2.0, path=self.path + self.ds_name + '_statistics.png')
        self.compute_length(len_count)
        self.moments_per_category()
        self.moments_per_age()
        self.length_per_category()
        self.length_per_age()
        self.wordclouds()
        self.wordclouds_limited()
        self.gender()
        self.marital()
        self.country()
        self.parenthood()
        self.categories()
        self.age()
        self.vis_marital()

In [30]:
dataset_dic = {'happy_all':happy_all, 'happy_day':happy_day, 'happy_months':happy_months, 
               'age_all':age_all, 'age_day':age_day, 'age_months':age_months, 
               'wid_all':wid_all,'wid_day':wid_day, 'wid_months':wid_months}
               
               
for k,v in dataset_dic.items():
    vis = Visualisations(v, k, 'visualisations/', 'stemmed')
    print(vis.all_vis())

  res_values = method(rvalues)


None


  res_values = method(rvalues)
  fig, axes = plt.subplots(nrow, ncol, **kwargs)


None


  res_values = method(rvalues)
  fig, axes = plt.subplots(nrow, ncol, **kwargs)


None




None




None




None




None




None




None


<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 432x315 with 0 Axes>

<Figure size 432x495 with 0 Axes>

<Figure size 432x180 with 0 Axes>

<Figure size 432x360 with 0 Axes>

<Figure size 1620x360 with 0 Axes>

<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 432x315 with 0 Axes>

<Figure size 432x495 with 0 Axes>

<Figure size 432x180 with 0 Axes>

<Figure size 432x360 with 0 Axes>

<Figure size 1620x360 with 0 Axes>

<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 432x315 with 0 Axes>

<Figure size 432x495 with 0 Axes>

<Figure size 432x180 with 0 Axes>

<Figure size 432x360 with 0 Axes>

<Figure size 1620x360 with 0 Axes>

<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 288x405 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>