In [2]:
import platform
import tensorflow as tf
from tensorflow.python.client import device_lib

if tf.test.is_built_with_cuda() == False:
    print('ARBEIT OHNE GPU')
else:
    device_lib.list_local_devices()

print(platform.machine(), '\n')
print(platform.version(), '\n')
print(platform.platform(), '\n')
print(platform.uname(), '\n')
print(platform.system(), '\n')
print(platform.processor(),)

AMD64 

10.0.19041 

Windows-10-10.0.19041-SP0 

uname_result(system='Windows', node='DESKTOP-FAKSDIV', release='10', version='10.0.19041', machine='AMD64', processor='AMD64 Family 23 Model 1 Stepping 1, AuthenticAMD') 

Windows 

AMD64 Family 23 Model 1 Stepping 1, AuthenticAMD


### Consider!: Since the data is sensitive, I only can post markdowns and not code and data itself.

### df_content
contains all information collected in the first notebook. It mainly consists of content related information. Text, topic, publisher, etc.

### df_pageviews
contains all time related informations collected in BigQuery. We filtered all time informations which could not fit the following requirements:
- the articles have atleast a 3 months lifetime (based on published time)

views_for_certain_day contains all pageviews for the days past since the publication of the article.


![fig4](https://raw.githubusercontent.com/OweysMomenzada/Evergreen-Content-Classifier-for-german-Text/main/EDA/images/fig4.png)

![fig5](https://raw.githubusercontent.com/OweysMomenzada/Evergreen-Content-Classifier-for-german-Text/main/EDA/images/fig5.png)

![fig6_5](https://raw.githubusercontent.com/OweysMomenzada/Evergreen-Content-Classifier-for-german-Text/main/EDA/images/fig6_5.png)

## The Statistic Normalization Function

Since some publishers and articles are more famous than others and also that evergreens have a high decrease in views in the first days, we are normalizing based on the given days.

Thus, we can highlight the behavior in the first days and differ between evergreens and non evergreens. Note that normalization function such as Softmax or Symmetric normalization ruin structures of the evergreen article due to the high amount of nonevergreen articles

We take a normalization function. This function takes the first x days to calculate its average and than normalize the past days  based on that.


- #### days and average_of_days can be used as hyper parameters for training

In [15]:
# generiert die Vektorlänge bzw. die Anzahl der Tage
def hidden_dims(days:int, df:pd.DataFrame):
    try:
        df['days_past']
        df['article_drive_id']
        df['views_for_certain_day']
    except:
        print("The given Dataset has no column named 'days_past', 'article_drive_id' or 'views_for_certain_day'")
        raise
    return df.loc[df.days_past < days]

# take the average and normalize the past days based on that average
def normalizer(normalize_value:int, past_days:list):
    # starting with the first day by 1.0
    normalized_list = [(normalize_value/normalize_value)]
    #normalize all values in the list based on the average
    past_days_normalized = [x / normalize_value for x in past_days]   
    normalized_list = normalized_list+past_days_normalized
    days_past = list(range(0,len(normalized_list)))
    return normalized_list, days_past

# here we generate the average on the given days
def normalize_views(days:int, average_of_days:int, df:pd.DataFrame):
    
    # throw exception if averagedays are bigger than days
    if days <= average_of_days:
        raise Exception('the given values are invalid')
        
    # get the relevant days we need to consider
    relevant_days = days+average_of_days
    
    # get the relevant rows and just work with that
    df = hidden_dims((relevant_days), df)
    df_av = pd.DataFrame({'article_drive_id':[], 
                          'views_for_certain_day_normalized':[],
                          'meta.publisher':[]})
    days_past_list = []
    unique_drive_id = list(set(df['article_drive_id']))
    for i in range(len(unique_drive_id)):
        #get all the information of the i.- article
        i_article = df.loc[df.article_drive_id == unique_drive_id[i]]

        #filter all irrelevant days out
        i_article = i_article.loc[i_article.days_past < (average_of_days+days)]
        
        #calculate the views of all given days in average_days
        average_views = i_article.loc[i_article.days_past < average_of_days].views_for_certain_day.mean()
        #get all days after the average calculated days
        past_average_views = i_article.loc[i_article.days_past >= average_of_days].views_for_certain_day
        
        #normalize the values
        normalized_list, days_past = normalizer(average_views, list(past_average_views))
        days_past_list.extend(days_past)
        
        #get publisher of corresponding article
        publisher = list(df.loc[df['article_drive_id'] == unique_drive_id[i]]['meta.publisher'])[0]
        df_av = df_av.append({'views_for_certain_day_normalized': normalized_list,
                             'article_drive_id': unique_drive_id[i],
                             'meta.publisher': publisher}, ignore_index=True)
        # unnest lists
        df_av = df_av.explode(column='views_for_certain_day_normalized')
    
    #days past from new range we are considering
    df_av['days_past'] = days_past_list
    return df_av

![fig6](https://raw.githubusercontent.com/OweysMomenzada/Evergreen-Content-Classifier-for-german-Text/main/EDA/images/fig6.png)

### Evaluation of the normalization

After the normalization we see that the first three days have a high relevance for evergreen articles. Peaks are mainly caused by a low start, where the average
far below the average of the lifetime of the article. 

Looking at Evergreen articles we see that the first days are not the most relevant ones. 
The reason why it was not shown in the first graph is that some articles such as mittlebayerische
have a lot of views in the first days for their evergreens. However, if we normalize their popularity
we will see that evergreen articles can have their peaks above the first days.

### Results of the timebased EDA
#### pageviews
It is noticeable that both nonevergreen as well as evergreen articles have significant popularity differences between the first four days and the days after. However, the decrease of nonevergreen pageviews are way heavier than Evergreen articles. Thus, we can consider Evergreen articles as more consistent when it comes to pageviews over time. 
In addition to the consistency of evergreen articles, nonevergreen articles have unpredictable peaks. 

It is worth highlighting that all nonevergreens of each publisher almost behave the same. We can see that 'Mittlebayerische'
has the highest popularity. The Münster publisher is almost irrelevant for our EDA as it has not much data to offer.

#### readtime
Various studies have shown that uptodate-readers are under high pressure. Thus, they only read headlines when it comes to nonevergreen articles. Therefore, the readingtime in average (based on this assumption) needs to be significantly lower compared to the evergreen articles, since evergreen articles are timeless.

In our EDA we could successfully observe this behavior. Although nonevergreens have way more pageviews in general, they have a lower readtime (even in the first days).
Evergreen articles have a lot of high peaks. This may be caused by trends or current media interest. Eventbased evergreens also show that they lose on high interest in reading time when the event gets outdated with each day. 

#### Furthermore...
Although nonevergreens have way more pageviews (even in later days) it is worth mentioning, that evergreen readers are way more interested in reading this articles and stay more consistent. Based on pageviews in the first 3 months, we can consider evergreen readers as just a few but loyal.  

![fig7](https://raw.githubusercontent.com/OweysMomenzada/Evergreen-Content-Classifier-for-german-Text/main/EDA/images/fig7.png)

In [21]:
fig = make_subplots(rows=1, cols=1,
                    subplot_titles=("Genre for each evergreen type", "",))

fig.add_trace(
    # get a list of value_counts and convert it to percent
    go.Bar(x=list((ne_content.genre.value_counts()/sum(ne_content.genre.value_counts()))*100),
           # getting the rownames of value_counts
           y= ne_content.genre.value_counts().rename_axis('topics').reset_index(name='counts')['topics'], 
           name='Non-Evergreen',
           marker=dict(color=cols[0]),
           orientation='h'),
    row=1, col=1)

fig.add_trace(
    go.Bar(x=list((ev_content.genre.value_counts()/sum(ev_content.genre.value_counts()))*100),
           y= ev_content.genre.value_counts().rename_axis('topics').reset_index(name='counts')['topics'], 
           name='Event-Evergreen',
           marker=dict(color=cols[8]),
           orientation='h'),
    row=1, col=1)

fig.add_trace(
    # get a list of value_counts and convert it to percent
    go.Bar(x=list((zt_content.genre.value_counts()/sum(zt_content.genre.value_counts()))*100),
           y= zt_content.genre.value_counts().rename_axis('topics').reset_index(name='counts')['topics'], 
           name='Zeitlos-Evergreen',
           marker=dict(color=cols[3]),
           orientation='h'),
    row=1, col=1)

fig.update_layout(yaxis_title='Genre')

fig.write_image("fig8.png")
fig.show()

![fig8](https://raw.githubusercontent.com/OweysMomenzada/Evergreen-Content-Classifier-for-german-Text/main/EDA/images/fig8.png)

![fig9](https://raw.githubusercontent.com/OweysMomenzada/Evergreen-Content-Classifier-for-german-Text/main/EDA/images/fig9.png)

### Contentbased EDA 

A closer analysis shows that many articles are based on the current Corona situation. This causes an 'overfitting' especially in this area and could lead to a too high weighting of the pandemic. Because it is clear that Corona is only a phase (nonevergreen) and thus is seen as a stopword. However, since the body is written in the context of the pandemic, this may not help. Therefore we need other topics that lead to a better overview.