## Imports

In [1]:
import json
import re
import io

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

## Functions

In [2]:
def load_to_dataframe(sample_file):
    """Build a per-tweet feature table from a line-delimited JSON tweet dump.

    Only tweets whose ``place.country_code`` equals ``'CH'`` are kept. A tweet
    whose ``place`` or ``entities`` structure is missing or malformed is
    skipped as a whole, so the output columns always stay aligned.

    Parameters
    ----------
    sample_file : iterable of str
        Open text file (or any iterable of lines) with one JSON-encoded tweet
        per line. Blank lines are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per retained tweet, with columns ``hashtags`` (list of the
        hashtag texts), ``hashtag_count``, ``urls`` (number of URL entities)
        and ``media`` (number of items in ``extended_entities.media``, or 0
        when that structure is absent).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    columns = ['hashtags', 'hashtag_count', 'urls', 'media']
    records = []

    for raw_line in sample_file:
        if not raw_line.strip():
            continue
        tweet = json.loads(raw_line)

        # Extract every mandatory field BEFORE recording anything, so that a
        # malformed tweet can never leave the columns with unequal lengths.
        try:
            if tweet['place']['country_code'] != 'CH':
                continue
            entities = tweet['entities']
            tags = [hashtag['text'] for hashtag in entities['hashtags']]
            url_count = len(entities['urls'])
        except (KeyError, TypeError):
            # KeyError: a field is missing; TypeError: a field is None (e.g. a
            # tweet without place information) or has an unexpected type.
            continue

        # Media are optional: a tweet without them counts as zero media.
        try:
            media_count = len(tweet['extended_entities']['media'])
        except (KeyError, TypeError):
            media_count = 0

        records.append({
            'hashtags': tags,
            'hashtag_count': len(tags),
            'urls': url_count,
            'media': media_count,
        })

    # Passing `columns` keeps the schema stable even when no tweet is retained.
    return pd.DataFrame(records, columns=columns)

In [3]:
def print_stats(dfs, names):
    """Print tweet counts and hashtag / media / URL shares for several samples.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Tables holding the numeric columns ``hashtag_count``, ``media`` and
        ``urls`` (one row per tweet).
    names : sequence of str
        Display label of each table, in the same order as ``dfs``.

    Returns
    -------
    None
        The report is written to standard output. Percentages are rounded to
        two decimals; an empty sample is reported as ``0 (n/a)``.
    """
    # (section title, column, predicate selecting the tweets to count)
    sections = [
        ('Total number of Tweets with hashtags',
         'hashtag_count', lambda values: values > 0),
        ('Total number of Tweets with two or more hashtags',
         'hashtag_count', lambda values: values >= 2),
        ('Total number of Tweets that contain media',
         'media', lambda values: values > 0),
        ('Total number of Tweets that contain URLs',
         'urls', lambda values: values > 0),
    ]

    print('===== Total number of Tweets =====')
    for i, df in enumerate(dfs):
        print('Sample', names[i] + ':', len(df))

    for title, column, selects in sections:
        print('')
        print('===== ' + title + ' =====')
        for i, df in enumerate(dfs):
            total = len(df)
            if total:
                count = int(selects(df[column]).sum())
                share = '{:.2f}% of total'.format(100 * count / total)
            else:
                # A percentage of an empty sample is undefined; report it as
                # such instead of dividing by zero.
                count, share = 0, 'n/a'
            print('Sample', names[i] + ':', count, '(' + share + ')')

    print('')

In [4]:
def top_hashtags(df, n=50):
    """Rank the hashtags of a sample by number of occurrences.

    Hashtags are normalised with ``str.capitalize`` (first character upper
    case, the rest lower case) and prefixed with ``#`` before counting, so
    spellings that differ only by case are merged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``hashtags`` column holding one list of hashtag texts
        per tweet.
    n : int, optional
        Maximum number of rows to return (default 50).

    Returns
    -------
    pandas.DataFrame
        Columns ``rank`` (1-based), ``hashtag`` and ``frequency``, sorted by
        decreasing frequency. Empty, with the same columns, when the sample
        contains no hashtag.
    """
    tags = [
        '#' + hashtag.capitalize()
        for tweet_tags in df['hashtags']
        for hashtag in tweet_tags
    ]

    # dtype=object keeps an empty input from being inferred as a float Series.
    counts = pd.Series(tags, dtype=object).value_counts()

    # Build the table from the index/values directly: the column names that
    # `value_counts().reset_index()` produces differ between pandas versions.
    top = pd.DataFrame({
        'rank': np.arange(1, len(counts) + 1),
        'hashtag': counts.index.values,
        'frequency': counts.values,
    })

    return top[:n]

In [5]:
def top_binomes(df, n=50):
    """Rank the pairs of hashtags that appear together in the same tweet.

    Within each tweet the hashtags are normalised with ``str.capitalize``,
    de-duplicated and sorted, so every unordered pair is counted at most once
    per tweet and is always written in alphabetical order
    (``'#First #Second'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``hashtags`` column (list of hashtag texts per tweet)
        and a ``hashtag_count`` column; only rows with ``hashtag_count > 1``
        are considered.
    n : int, optional
        Maximum number of rows to return (default 50).

    Returns
    -------
    pandas.DataFrame
        Columns ``rank`` (1-based), ``hashtag pair`` and ``frequency``, sorted
        by decreasing frequency. Empty, with the same columns, when no tweet
        contains two distinct hashtags.
    """
    pairs = []

    multi_tag_tweets = df.loc[df['hashtag_count'] > 1, 'hashtags']
    for tweet_tags in multi_tag_tweets:
        # The set removes hashtags repeated inside one tweet, which would
        # otherwise make the same pair count several times for that tweet.
        unique_tags = sorted({tag.capitalize() for tag in tweet_tags})
        for position, first in enumerate(unique_tags):
            # Pairing only with later tags yields each unordered pair once.
            for second in unique_tags[position + 1:]:
                pairs.append('#' + first + ' #' + second)

    # dtype=object keeps an empty input from being inferred as a float Series.
    counts = pd.Series(pairs, dtype=object).value_counts()

    # Build the table from the index/values directly: the column names that
    # `value_counts().reset_index()` produces differ between pandas versions.
    binomes = pd.DataFrame({
        'rank': np.arange(1, len(counts) + 1),
        'hashtag pair': counts.index.values,
        'frequency': counts.values,
    })

    return binomes[:n]

In [6]:
def compare_hashtags(df_1, df_2, N):
    """Print how much the top-n hashtag lists of two samples overlap.

    For every ``n`` in ``N`` two lines are printed: the number of hashtags
    present in both top-n lists, and that number divided by the number of
    distinct hashtags in the two lists combined (their Jaccard index).

    Parameters
    ----------
    df_1, df_2 : pandas.DataFrame
        Samples accepted by ``top_hashtags``.
    N : iterable of int
        Cut-offs to evaluate. Each list is a slice of what ``top_hashtags``
        returns, so a cut-off beyond that length is effectively capped.

    Returns
    -------
    None
        The report is written to standard output. The ratio is shown
        truncated (not rounded) to four characters, e.g. 0.666... prints as
        ``0.66``; it is reported as ``0.0`` when both lists are empty.
    """
    # The rankings do not depend on n, so compute them once, outside the loop.
    ranking_1 = top_hashtags(df_1)
    ranking_2 = top_hashtags(df_2)

    for n in N:
        tags_1 = set(ranking_1['hashtag'].iloc[:n])
        tags_2 = set(ranking_2['hashtag'].iloc[:n])

        common = len(tags_1 & tags_2)
        combined = len(tags_1 | tags_2)
        # Guard against two empty rankings (no hashtag in either sample).
        ratio = common / combined if combined else 0.0

        print('Number of common hashtags in top', str(n) + ':', str(common)[:4])
        print('Ratio of common hashtags in top', str(n) + ':', str(ratio)[:4], '\n')
        

## Load samples

In [7]:
# Open the three tweet dumps in text mode for reading. The handles are passed
# to load_to_dataframe, which iterates over them line by line.
# NOTE(review): nothing in this cell closes the handles.
file_1 = open('sample_1.json', 'r')
file_2 = open('sample_2.json', 'r')
file_19 = open('shared_csm_2019/data.json', 'r')

In [8]:
# Parse each sample into a DataFrame, then release the file handles: once
# load_to_dataframe has iterated over a handle it is exhausted, so keeping it
# open only leaks the descriptor. Re-running this cell requires re-running
# the cell that opens the files first.
try:
    df_1 = load_to_dataframe(file_1)
    df_2 = load_to_dataframe(file_2)
    df_19 = load_to_dataframe(file_19)
finally:
    for _handle in (file_1, file_2, file_19):
        _handle.close()

## Part 1.3 & 2.3

#### Basic statistics

In [9]:
# Report the three samples side by side; each label is matched to the
# DataFrame at the same position in the first list.
print_stats([df_1, df_2, df_19], ['2016 Nr.1', '2016 Nr.2', '2019'])

===== Total number of Tweets =====
Sample 2016 Nr.1: 13636
Sample 2016 Nr.2: 13804
Sample 2019: 9782

===== Total number of Tweets with hashtags =====
Sample 2016 Nr.1: 3189 (23.38% of total)
Sample 2016 Nr.2: 3294 (23.86% of total)
Sample 2019: 1662 (16.99% of total)

===== Total number of Tweets with two or more hashtags =====
Sample 2016 Nr.1: 1508 (11.05% of total)
Sample 2016 Nr.2: 1613 (11.68% of total)
Sample 2019: 926 (9.466% of total)

===== Total number of Tweets that contain media =====
Sample 2016 Nr.1: 2239 (16.41% of total)
Sample 2016 Nr.2: 2328 (16.86% of total)
Sample 2019: 1200 (12.26% of total)

===== Total number of Tweets that contain URLs =====
Sample 2016 Nr.1: 3712 (27.22% of total)
Sample 2016 Nr.2: 3822 (27.68% of total)
Sample 2019: 3706 (37.88% of total)



##### Discussion
In order to fit the research question, among all the tweets in the dataset, I decided to keep only tweets that were actually geolocated in Switzerland (and not in any of the neighbouring countries). After this step, the 2016 dataset contains about 27k tweets, while the 2019 dataset contains 9.8k tweets. One should keep these numbers in mind: they mean first that the two datasets are quite unequal in size, and second that the analysis relies in both cases on a reduced number of tweets. Furthermore, the 2019 tweets were collected over one week, which is also a very short period of time.

The first trend one might highlight from these results is that the use of hashtags seems to decrease between 2016 and 2019. On the other hand, the rate of multiple hashtags seems to be holding relatively well, which means that people are generally less likely to use hashtags, but when they do use them, they are more likely to use several. Hashtags mainly increase the visibility of a tweet, and obviously focusing on several different hashtags allows you to cover a larger audience. The increased understanding of the hashtag concept by a group of users may have caused this evolution in practice. In addition, the format of tweets evolved between 2016 and 2019 (Nov. 2017) from the legendary 140-character limit to 280 characters today. This obviously leaves more space for several hashtags, for users who use them.

On the other hand, people tend to include fewer media in their tweets. This may be due to the rise of Instagram and Snapchat in particular, which are now the main channels for sharing images and videos. In terms of videos, Youtube is also in a strong position right now. Again, the increase in the character limit has also made it possible to create more elaborate textual tweets, perhaps at the expense of impact tweets, in which the inclusion of an image could reinforce a relatively short argument.

On the other hand, URL sharing seems to have increased between 2016 and 2019. This seems to indicate a strengthening of Twitter's position as an information-sharing network. Competition from other networks may have forced this policy of specialization, and the fact that it is now possible to develop a more complex argument may also have contributed.

#### Top 50 hashtags in sample 2016 Nr 1

In [10]:
# Ranked hashtag frequencies for the first 2016 sample.
top_hashtags(df_1)

Unnamed: 0,rank,hashtag,frequency
0,1,#Switzerland,149
1,2,#Canyoumakeit16,71
2,3,#Zurich,65
3,4,#Job,63
4,5,#Geneva,61
5,6,#Hiring,54
6,7,#Controleurs,38
7,8,#Careerarc,36
8,9,#Basel,35
9,10,#Jobs,34


#### Top 50 hashtags in sample 2016 Nr 2

In [11]:
# Ranked hashtag frequencies for the second 2016 sample.
top_hashtags(df_2)

Unnamed: 0,rank,hashtag,frequency
0,1,#Switzerland,153
1,2,#Canyoumakeit16,82
2,3,#Zurich,77
3,4,#Geneva,63
4,5,#Job,55
5,6,#Hiring,47
6,7,#Controleurs,42
7,8,#Basel,37
8,9,#Askbelieber,36
9,10,#Quiz,35


##### Discussion
The presence of city names (Geneva, Zurich, Valais, etc.) is not surprising, especially when it comes to geolocated tweets. It can be assumed that users indicating the location of their tweets are also more likely to reinforce the importance of the place from which they post in the tweet content. One can also notice the presence of hashtags linked to specific events (#Tdr2016, #Ghf16), or recurring events (#Srfarena, #Thevoice). In these datasets, the presence of promotional campaigns is also important (#Careerarc, #Ecommerce, #Businessmontres). More surprising is the presence of an Arabic hashtag, which might be part of an activist campaign or something similar. Finally, the rest of the hashtags mainly focus on more general concepts (#Beautiful, #Snow, #Spring).

The two datasets are very similar, and it is clearly visible that both were collected at the same time. However, one also sees that the collection of tweets is relatively random: even though the two samples are close, they are not identical, so the hashtag ranks vary. This is a source of variability in the results that must be taken into account.

#### Top 50 hashtags in sample 2019

In [12]:
# Ranked hashtag frequencies for the 2019 sample.
top_hashtags(df_19)

Unnamed: 0,rank,hashtag,frequency
0,1,#Switzerland,124
1,2,#Suisse,52
2,3,#Schweiz,33
3,4,#Montreux,30
4,5,#Cdn2019,26
5,6,#Cdn,25
6,7,#Coupedesnations,25
7,8,#Easter,25
8,9,#Travel,25
9,10,#Zurich,24


##### Discussion
Again, cities and mentions of Swiss cantons are very present. Several hashtags mention the UEFA League of Nations directly or indirectly (#Cdn2019, #Coupedesnations). This can also explain the very high number of hashtags about Switzerland (#Svizzera, #Switzerland, #Swiss, #Schweiz, #Suisse, #Ch), which is part of this competition. Other events are also mentioned (#Eurovision, #Orca_swiss2019, #Easter, #Ostern), as well as international news (#Srilanka) and TV shows (#Ciaodarwin, #Hercai). One can also note the number of hashtags mentioning beer (including in Russian and Japanese). A short qualitative analysis shows that all these hashtags come from the posts of a single user. This shows the limitation of focusing only on a small country, a short period of time and only on geolocated tweets.

Even if the data were collected over one week, the presence of these events has a very clear impact on the results, and somewhat confuses this top 50. It also shows the importance of events on Twitter, and suggests that it may only take a few dozen Tweets to launch a national hashtag trending in a country like Switzerland. Once again, the character of Twitter as a quick source of information is highlighted.

#### Top 50 hashtag pairs in sample 2016 Nr 1

In [13]:
# Ranked hashtag-pair frequencies for the first 2016 sample.
top_binomes(df_1)

Unnamed: 0,rank,hashtag pair,frequency
0,1,#Hiring #Job,52
1,2,#Careerarc #Job,36
2,3,#Careerarc #Hiring,36
3,4,#Job #Jobs,31
4,5,#Hiring #Jobs,26
5,6,#Geneva #Job,19
6,7,#Careerarc #Jobs,19
7,8,#Job #Sales,17
8,9,#Swiss #Switzerland,16
9,10,#Geneva #Hiring,15


#### Top 50 hashtag pairs in sample 2016 Nr 2

In [14]:
# Ranked hashtag-pair frequencies for the second 2016 sample.
top_binomes(df_2)

Unnamed: 0,rank,hashtag pair,frequency
0,1,#Hiring #Job,47
1,2,#Job #Jobs,30
2,3,#Careerarc #Job,29
3,4,#Careerarc #Hiring,29
4,5,#Hiring #Jobs,29
5,6,#Careerarc #Jobs,19
6,7,#Switzerland #Zurich,19
7,8,#Swiss #Switzerland,17
8,9,#Geneva #Switzerland,16
9,10,#Switzerland #Travel,13


#### Top 50 hashtag pairs in sample 2019

In [15]:
# Ranked hashtag-pair frequencies for the 2019 sample.
top_binomes(df_19)

Unnamed: 0,rank,hashtag pair,frequency
0,1,#Cdn #Cdn2019,25
1,2,#Cdn #Coupedesnations,25
2,3,#Cdn2019 #Coupedesnations,25
3,4,#Cdn2019 #Montreux,24
4,5,#Coupedesnations #Since1921,24
5,6,#Cdn #Since1921,24
6,7,#Cdn2019 #Since1921,24
7,8,#Cdn #Montreux,23
8,9,#Montreux #Since1921,23
9,10,#Coupedesnations #Montreux,23


##### Discussion
Hashtag couples are mainly linked to promotional campaigns and events. Whether it is campaigns of job recruiters or beer advertising, the fact that some of these tweets can sometimes include 10 or 20 hashtags multiplies their presence in top pairs of hashtags. The stated goal of this type of tweets is to create trending hashtags and capture as much attention as possible. 

However, it is important to note that Twitter's algorithms are more advanced than this simple display of the most commonly used hashtags, and therefore cannot be fooled by such a machine-gun tweeting strategy. A single user cannot create a trend on Twitter, even by posting the same tweet 50 times. These top lists therefore do not necessarily represent the hashtags that had the most impact on Twitter during the period studied, but really only the raw quantitative information.

## Part 2.4

In [16]:
# Overlap between the top hashtags of the two 2016 samples, at cut-offs 20 and 50.
compare_hashtags(df_1, df_2, [20, 50])

Number of common hashtags in top 20: 16
Ratio of common hashtags in top 20: 0.66 

Number of common hashtags in top 50: 41
Ratio of common hashtags in top 50: 0.69 



In [17]:
# Overlap between the top hashtags of the first 2016 sample and the 2019 sample, at cut-offs 20 and 50.
compare_hashtags(df_1, df_19, [20, 50])

Number of common hashtags in top 20: 7
Ratio of common hashtags in top 20: 0.21 

Number of common hashtags in top 50: 16
Ratio of common hashtags in top 50: 0.19 



In [18]:
# Overlap between the top hashtags of the second 2016 sample and the 2019 sample, at cut-offs 20 and 50.
compare_hashtags(df_2, df_19, [20, 50])

Number of common hashtags in top 20: 6
Ratio of common hashtags in top 20: 0.17 

Number of common hashtags in top 50: 16
Ratio of common hashtags in top 50: 0.19 



##### Discussion
Again, the similarity between the two 2016 samples is observed. However, the overlap is far from complete, which illustrates the bias induced by the fact that only a sample of the posted tweets can be collected through the API. In this case, only 66-69% of the top hashtags are shared, which clearly shows that this sampling effect is reflected not only in individual tweets but also in global trends.

The overlap between 2016 and 2019, however, is much lower. Most of the overlap is due to location mentions - cities, cantons and mentions of Switzerland - rather than seasonal events. The fact that one-off events (tournaments, TV shows, news) represent the bulk of top tweets is undoubtedly the cause of this relatively low overlap.

### Conclusion
This brief analysis clearly highlights the role of Twitter as an information-sharing medium. Other aspects are also emerging, such as being able to quickly comment on an event or news, interact with a TV show via Twitter, or even use it as a marketing tool. The content, focused mainly on one-off events, changes very significantly from one year to the next. We also notice this strengthening of Twitter in its role as an information-sharing medium through the evolution of the use of meta-elements in tweets. URLs are used more and more, while media have been used less and less over the last three years. The number of hashtags has also decreased, probably because this element that was Twitter's trademark has spread to other platforms.

The importance of events also shows that Twitter as a social network is based on the creation of communities that are directly based on the shared interest of users in a certain type of news, and therefore also in events. One follows users who share information one considers relevant, and at the same time seeks to be recognized by one's peers in the sharing of a certain kind of information. Hashtags are a good means of being recognized during local events (football, TV political debate, etc.) as a quality influencer on a certain kind of news. What is more, for events such as the #Coupedesnations, Twitter is a very fast and reactive source of information, and makes it possible to crowdsource a lot of data on the event in progress, such as photos, comments, etc. For this type of event, as was also the case during the Notre-Dame fire a few days earlier, even traditional media use user tweets to cover the event.

Conversely, local events are also a good opportunity to create common and focused hashtags, which will in turn facilitate impromptu interaction with other users. One could argue, however, that this mainly allows one to get in touch with other people with significant homophily. Indeed, users who post with the hashtag #srfarena are likely to be more alike than those who post with the hashtag #politics or #debate, especially because they are more likely to already share common contacts with one another.