Process:
- Determine which tweets are written in English, based on each tweet's `lang` field
- Preprocess the tweet text for topic modeling
- Collect all hashtags, mentions, and detected domains/entities from the tweets
- Filter out replies? (a tweet is treated as a reply if its text starts with `@`)

In [1]:
import json
import os
import pandas as pd

In [2]:
# Read in one example JSON payload to explore the tweet structure
path_example_json = "airline_tweets_all/KLM/KLM_1579833777777307650.json"

# Decode explicitly as UTF-8: tweet text routinely contains emoji and other
# non-ASCII characters, and the platform default codec is not UTF-8 everywhere.
with open(path_example_json, encoding="utf-8") as f:
    tweet_json = json.load(f)

In [3]:
# Preview the first rows of the per-airline tweet table.
# NOTE(review): no earlier cell visible here assigns `df_airline_tweets`; the
# saved output has an "Unnamed: 0" column, so it was presumably read back from
# a previously written CSV — confirm and load it explicitly before this cell.
df_airline_tweets.head()

Unnamed: 0,tweet_id,created_at,text,source,lang,retweet_count,reply_count,like_count,quote_count,detected_domain_entity_pairs,annots_found,hashtags,mentions
0,1585311374812811264,2022-10-26T16:44:20.000Z,@briandavids Hi there. We'd be glad to take a ...,Khoros CX,en,0,0,0,0,"{Brand_United Airlines, Brand Vertical_Transpo...",{},{},{briandavids}
1,1585292688873893888,2022-10-26T15:30:04.000Z,@KDiesel63 Hi Kevin. We'd be happy to help. Pl...,Khoros CX,en,0,1,0,0,"{Brand_United Airlines, Brand Vertical_Transpo...","{Other_JH, Person_Kevin}",{},{KDiesel63}
2,1585290601582718976,2022-10-26T15:21:47.000Z,@CakeyHayley Please DM your confirmation numbe...,Khoros CX,en,0,0,1,0,"{Brand_United Airlines, Brand Vertical_Transpo...",{},{},{CakeyHayley}
3,1585287554093711360,2022-10-26T15:09:40.000Z,"@CakeyHayley Hello, Hayley. Thank you for reac...",Khoros CX,en,0,1,2,0,"{Brand_United Airlines, Brand Vertical_Transpo...",{Person_Hayley},{},{CakeyHayley}
4,1585284494864498694,2022-10-26T14:57:31.000Z,@GregACRODUNK Hi Greg. If you need assistance ...,Khoros CX,en,0,0,0,0,"{Brand_United Airlines, Brand Vertical_Transpo...",{Person_Greg},{},{GregACRODUNK}


In [10]:
# Dimensions of the tweet table as (rows, columns).
df_airline_tweets.shape

(11129, 14)

In [2]:
# Root folder of the downloaded tweet JSON files; the extraction loop expects
# one sub-folder per airline Twitter handle inside it.
airline_tweets_dir = "airline_tweets_all_NO_REPLIES"

In [6]:
# Alphabetical listing of the entries (airline handles) under the tweet root.
sorted(os.listdir(airline_tweets_dir))

['AerLingus',
 'Aeroflot_World',
 'AeromexicoUSA',
 'AirCanada',
 'AirFranceIN',
 'AirFranceNG',
 'AirFranceUK',
 'AirFranceUS',
 'Air_Dolomiti',
 'AlaskaAir',
 'Allegiant',
 'AmericanAir',
 'AsianaAirlines',
 'AviancaNAM',
 'British_Airways',
 'CEAirglobal',
 'CebuPacificAir',
 'Condor_America',
 'Delta',
 'EGYPTAIR',
 'ELALUSA',
 'Finnair',
 'FlairAirlines',
 'FlyANA_official',
 'FlyAirNZ',
 'FlyFrontier',
 'FlySWISS',
 'GoFirstairways',
 'GulfAir',
 'HawaiianAir',
 'Iberia_en',
 'IndiGo6E',
 'IndonesiaGaruda',
 'JazeeraAirways',
 'JetBlue',
 'JetstarAirways',
 'Jetstar_Asia',
 'KLM',
 'KoreanAir_KE',
 'KuwaitAirways',
 'LATAMAirlinesUS',
 'Lufthansa_USA',
 'MAS',
 'MEAAIRLIBAN',
 'NokAirlines_ENG',
 'Qantas',
 'RAM_Maroc',
 'RoyalJordanian',
 'Ryanair',
 'SAS',
 'SaudiAirlinesEn',
 'SingaporeAir',
 'SouthwestAir',
 'SpiritAirlines',
 'ThaiAirways',
 'TurkishAirlines',
 'VietjetairCom',
 'VietnamAirlines',
 'VirginAtlantic',
 'WestJet',
 '_austrian',
 'airBaltic',
 'airarabiagroup',


In [7]:
# Read tweets and save to CSV files
#
# For each airline folder under `airline_tweets_dir`, every JSON payload is
# read, the English tweets are flattened into one row each, and the rows are
# written to `airline_tweets_csvs/<handle>.csv.gz`.

df_columns = [
    'tweet_id',
    'user',
    'created_at', 'text', 'is_reply',
    'source', 'lang',
    'retweet_count', 'reply_count', 'like_count', 'quote_count',
    'detected_domain_entity_pairs',
    'annots_found', 'hashtags', 'mentions'
]

# Loop-invariant list of the public metric names copied into each row.
metrics_lst = ['retweet_count', 'reply_count', 'like_count', 'quote_count']

# How many airlines (in alphabetical order) to process in this run.
# Set to None to process every airline folder.
max_airlines = 1

# Make sure the output folder exists before the first write.
airline_csv_dir = 'airline_tweets_csvs'
os.makedirs(airline_csv_dir, exist_ok=True)

# Only sub-directories are airline folders; stray files in the root are
# skipped instead of crashing the inner os.listdir call.
airline_handles = [
    entry for entry in sorted(os.listdir(airline_tweets_dir))
    if os.path.isdir(os.path.join(airline_tweets_dir, entry))
]

for num, airline_twitter_handle in enumerate(airline_handles[:max_airlines]):

    print(num)
    print("Current airline:", airline_twitter_handle)

    this_airline_dir = airline_tweets_dir + '/' + airline_twitter_handle

    # Rows are collected in a plain list and turned into a DataFrame once per
    # airline; concatenating a one-row frame per tweet is quadratic.
    tweet_rows = []

    # Sorted so the row order in the CSV is reproducible across runs.
    for path_tweet_json in sorted(os.listdir(this_airline_dir)):
        this_json_path = this_airline_dir + '/' + path_tweet_json

        # Skip nested directories (portable replacement for catching
        # IsADirectoryError, which is not what every platform raises).
        if not os.path.isfile(this_json_path):
            continue

        # Read tweet json (explicit UTF-8: tweet text contains emoji etc.)
        with open(this_json_path, encoding='utf-8') as f:
            tweet_json = json.load(f)

        print(f"\tReading tweets and info from {this_json_path}...")

        # Each json has up to 100 tweets
        if not tweet_json.get('data'):
            continue

        for tweet in tweet_json['data']:

            # Skip tweets that are not in English (lang = en)
            if tweet.get('lang') != 'en':
                continue

            tweet_text = tweet['text']

            tweet_info = {
                'tweet_id': tweet['id'],
                'user': airline_twitter_handle,
                'created_at': tweet['created_at'],
                'text': tweet_text,
                # startswith() is safe on an empty string, unlike text[0]
                'is_reply': 1 if tweet_text.startswith('@') else 0,
                'source': tweet.get('source'),
                'lang': tweet['lang']
            }

            # Public metrics: default to 0 both when the whole block is
            # missing and when an individual counter is missing.
            tweet_public_metrics = tweet.get('public_metrics') or {}
            for mtrc in metrics_lst:
                tweet_info[mtrc] = tweet_public_metrics.get(mtrc, 0)

            # Detected domains/entities, stored as "<domain>_<entity>" strings
            tweet_context_annotations = tweet.get('context_annotations') or []
            tweet_info['detected_domain_entity_pairs'] = {
                f"{cnt_ann['domain']['name']}_{cnt_ann['entity']['name']}"
                for cnt_ann in tweet_context_annotations
            }

            # Annotations, hashtags and mentions.
            # A tweet without an 'entities' block gets empty sets, the same
            # value as a tweet whose 'entities' block lacks the sub-list.
            entities_dict = tweet.get('entities') or {}

            # "annotations", stored as "<type>_<normalized_text>" strings
            entities_annot = entities_dict.get('annotations') or []
            tweet_info['annots_found'] = {
                f"{annot['type']}_{annot['normalized_text']}"
                for annot in entities_annot
            }

            # "hashtags"
            entities_hashtags = entities_dict.get('hashtags') or []
            tweet_info['hashtags'] = {hsh['tag'] for hsh in entities_hashtags}

            # "mentions"
            entities_mentions = entities_dict.get('mentions') or []
            tweet_info['mentions'] = {mntn['username'] for mntn in entities_mentions}

            tweet_rows.append(tweet_info)

        # Fall back to the payload length if the 'meta' block is absent.
        result_count = tweet_json.get('meta', {}).get('result_count', len(tweet_json['data']))
        print(f"\t\tFinished reading {result_count} tweets and info from {this_json_path}.")
        # Running (rows, columns) total, same format as DataFrame.shape
        print((len(tweet_rows), len(df_columns)))

    # Create dataframe per airline to keep tweet info
    df_airline_tweets = pd.DataFrame(tweet_rows, columns=df_columns)

    path_save_airline_df_csv = f'{airline_csv_dir}/{airline_twitter_handle}.csv.gz'
    print(f"\tSaving dataframe for @{airline_twitter_handle} tweets to {path_save_airline_df_csv}...")

    df_airline_tweets.to_csv(path_save_airline_df_csv, compression='gzip', index=False)
    print("\tSaved.")
 
            
        
        
        

0
Current airline: AerLingus
	Reading tweets and info from airline_tweets_all_NO_REPLIES/AerLingus/AerLingus_1068186029801316352.json...
		Finished reading 100 tweets and info from airline_tweets_all_NO_REPLIES/AerLingus/AerLingus_1068186029801316352.json.
(94, 15)
	Reading tweets and info from airline_tweets_all_NO_REPLIES/AerLingus/AerLingus_1204136917035618305.json...
		Finished reading 100 tweets and info from airline_tweets_all_NO_REPLIES/AerLingus/AerLingus_1204136917035618305.json.
(192, 15)
	Reading tweets and info from airline_tweets_all_NO_REPLIES/AerLingus/AerLingus_1425398078803222528.json...
		Finished reading 100 tweets and info from airline_tweets_all_NO_REPLIES/AerLingus/AerLingus_1425398078803222528.json.
(291, 15)
	Reading tweets and info from airline_tweets_all_NO_REPLIES/AerLingus/AerLingus_1481241180042059781.json...
		Finished reading 100 tweets and info from airline_tweets_all_NO_REPLIES/AerLingus/AerLingus_1481241180042059781.json.
(391, 15)
	Reading tweets and 

In [8]:
# Scratch check: list() over dict.values() yields the values as a plain list.
list({'a': 1, 'b': 2}.values())

[1, 2]

In [32]:
# Scratch check: show the `entities_mentions` list left in the kernel,
# presumably from the last tweet handled by the extraction loop — verify.
entities_mentions

[{'start': 6, 'end': 16, 'username': 'igairport', 'id': '3989295160'}]

In [22]:
# Full record of the first tweet in the loaded payload.
tweet_json['data'][0]

{'entities': {'mentions': [{'start': 0,
    'end': 12,
    'username': 'lukecasserl',
    'id': '71780810'}],
  'annotations': [{'start': 16,
    'end': 19,
    'probability': 0.7521,
    'type': 'Person',
    'normalized_text': 'Luke'}]},
 'edit_history_tweet_ids': ['1579833777777307650'],
 'context_annotations': [{'domain': {'id': '45',
    'name': 'Brand Vertical',
    'description': 'Top level entities that describe a Brands industry'},
   'entity': {'id': '781974597302226944', 'name': 'Transportation'}},
  {'domain': {'id': '30',
    'name': 'Entities [Entity Service]',
    'description': 'Entity Service top level domain, every item that is in Entity Service should be in this domain'},
   'entity': {'id': '781974596144668673', 'name': 'Airline - Transportation'}},
  {'domain': {'id': '46',
    'name': 'Business Taxonomy',
    'description': 'Categories within Brand Verticals that narrow down the scope of Brands'},
   'entity': {'id': '1557696420500541440',
    'name': 'Automotive,

In [25]:
# 'entities' block of the first tweet (mentions, annotations, ...).
tweet_json['data'][0]['entities']

{'mentions': [{'start': 0,
   'end': 12,
   'username': 'lukecasserl',
   'id': '71780810'}],
 'annotations': [{'start': 16,
   'end': 19,
   'probability': 0.7521,
   'type': 'Person',
   'normalized_text': 'Luke'}]}

In [9]:
# Language code of the first tweet; the extraction loop keeps only 'en'.
tweet_json['data'][0]['lang']

'en'

In [10]:
# Tweet id of the first tweet (stored as a string in the payload).
tweet_json['data'][0]['id']

'1579833777777307650'

In [11]:
# Top-level fields available on a tweet record.
tweet_json['data'][0].keys()

dict_keys(['entities', 'edit_history_tweet_ids', 'context_annotations', 'source', 'created_at', 'public_metrics', 'lang', 'in_reply_to_user_id', 'text', 'id', 'conversation_id', 'author_id'])

In [12]:
# 'source' field of the first tweet (the client that posted it).
tweet_json['data'][0]['source']

'Salesforce - Social Studio'

In [13]:
# Creation timestamp of the first tweet, as the raw string from the payload.
tweet_json['data'][0]['created_at']

'2022-10-11T13:58:19.000Z'

In [14]:
# Engagement counters of the first tweet (retweet/reply/like/quote counts).
tweet_json['data'][0]['public_metrics']

{'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}

In [20]:
# Domain name of the first context annotation on the first tweet.
tweet_json['data'][0]['context_annotations'][0]['domain']['name']

'Brand Vertical'

In [21]:
# Human-readable description of that same annotation domain.
tweet_json['data'][0]['context_annotations'][0]['domain']['description']

'Top level entities that describe a Brands industry'

In [19]:
# Entity name paired with the domain in the first context annotation.
tweet_json['data'][0]['context_annotations'][0]['entity']['name']

'Transportation'

In [6]:
# Dump the whole loaded payload (large output; top-level key is 'data').
tweet_json

{'data': [{'entities': {'mentions': [{'start': 0,
      'end': 12,
      'username': 'lukecasserl',
      'id': '71780810'}],
    'annotations': [{'start': 16,
      'end': 19,
      'probability': 0.7521,
      'type': 'Person',
      'normalized_text': 'Luke'}]},
   'edit_history_tweet_ids': ['1579833777777307650'],
   'context_annotations': [{'domain': {'id': '45',
      'name': 'Brand Vertical',
      'description': 'Top level entities that describe a Brands industry'},
     'entity': {'id': '781974597302226944', 'name': 'Transportation'}},
    {'domain': {'id': '30',
      'name': 'Entities [Entity Service]',
      'description': 'Entity Service top level domain, every item that is in Entity Service should be in this domain'},
     'entity': {'id': '781974596144668673',
      'name': 'Airline - Transportation'}},
    {'domain': {'id': '46',
      'name': 'Business Taxonomy',
      'description': 'Categories within Brand Verticals that narrow down the scope of Brands'},
     'entit