In [1]:
import pandas as pd 
import numpy as np 
from typing import Dict, List, Tuple
import os 
import sqlite3
import time 
from tools.cleaning import remove_duplicated_text, drop_invalid_reply, filter_non_airlines_conversation, add_airlines_id, change_type
from definitions.airlines import airlines_id, airlines_name
import re

In [2]:
# Input locations, written relative to the notebook's working directory.
# path_database is the SQLite file opened below; path_pickle is the pickled
# DataFrame used as the alternative data source.
path_database = '../database/database_18.db'
path_pickle = '../pickle_files/everything_tweets.p'

In [3]:
# Option 1: open the SQLite database and create a cursor on it.
# NOTE(review): `cur` is not used by any cell visible here, and the connection
# is never closed in the visible part of the notebook.
con = sqlite3.connect(path_database)
cur = con.cursor()

In [4]:
# Option 1 (continued): read the tweet id, reply links, timestamp, author and
# text of every row in the `general_tweets` table into a DataFrame.
query = '''
    SELECT id, 
    in_reply_to_status_id,
    in_reply_to_user_id,
    timestamp_ms,
    user_id,
    text
    FROM general_tweets
'''
df = pd.read_sql_query(query, con)

In [30]:
# Convert the timestamp column to pandas datetimes (overwrites the column in place).
# NOTE(review): the column name suggests epoch milliseconds; if that is what is
# stored, pd.to_datetime needs unit='ms', because plain integers are otherwise
# read as nanoseconds — confirm the stored format before relying on these dates.
df['timestamp_ms'] = pd.to_datetime(df.timestamp_ms)

In [4]:
# Option 2: load the tweets from the pickle file instead. This rebinds `df`,
# replacing whatever frame was built from the database above.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# come from a trusted source.
df = pd.read_pickle(path_pickle)

#### 2. Define the `Conversation` class

In [7]:
class Conversation:
    """A conversation between two users, attributed to one airline."""

    def __init__(
        self,
        person1_id: int,
        person2_id: int,
        lst_tweets_id: List[int],
        airline: str
    ):
        """Store the participants, tweet ids and airline of a conversation.

        :param person1_id: user id of the first participant
        :param person2_id: user id of the second participant
        :param lst_tweets_id: ids of the tweets that make up the conversation
            (the list object is stored as given, not copied)
        :param airline: name of the airline this conversation belongs to
        """
        self.person1_id, self.person2_id = person1_id, person2_id
        self.lst_tweets_id, self.airline = lst_tweets_id, airline
        

#### 3. Extract conversation

In [27]:
# Keep only the identifier columns needed to link each reply to its parent tweet.
df_data = df.loc[
    :,
    ['id', 'user_id', 'in_reply_to_status_id', 'in_reply_to_user_id'],
]

In [21]:
# Give the linking columns explicit dtypes before they are used as join keys.
# NOTE(review): the tweet ids shown in the output further down are ~1.13e18,
# which is above 2**53, so float64 cannot hold them exactly and distinct ids
# may collapse to the same value. A nullable integer dtype would avoid that,
# but only if the reply columns still carry exact values at this point —
# confirm how they were loaded before changing the dtypes.
df_data = df_data.astype({
    'id' : 'float64',
    'user_id' : 'int64',
    'in_reply_to_status_id' : 'float64',
    'in_reply_to_user_id' : 'float64'
})

In [13]:
# Drop invalid replies using the project helper from tools.cleaning.
# NOTE(review): what counts as "invalid" is defined inside drop_invalid_reply,
# which is not visible here. The result overwrites df_data, so re-running this
# cell applies the helper to already-filtered data.
df_data = drop_invalid_reply(df_data)

In [14]:
def extract_conversation(df: pd.DataFrame, max_level: int = 7, verbose: bool = True) -> pd.DataFrame:
    """Chain reply tweets into conversations, one row per conversation.

    Every tweet is paired with its direct replies (levels 1 and 2). Each further
    level ``n`` is attached with a left join on "tweet replies to the level
    ``n - 1`` tweet and is written by the level ``n - 2`` user", so the two
    participants alternate. Levels that do not exist for a conversation are NaN.

    :param df: tweets with at least the columns ``id``, ``user_id``,
        ``in_reply_to_status_id`` and ``in_reply_to_user_id``; other columns
        are ignored and the frame is not modified
    :param max_level: highest conversation level (number of tweets) to attach;
        must be at least 4 because shorter conversations are discarded
    :param verbose: when True (default), print the next level number after each
        join, as the original implementation did
    :return: a frame with ``id_1..id_<max_level>``, ``user_id_1..user_id_<max_level>``,
        ``reply`` (True when the first tweet is itself a reply),
        ``conversation_opener`` ('airline' or 'customer') and the columns added
        by ``add_airlines_id``, with the columns sorted by name
    :raises ValueError: if ``max_level`` is smaller than 4
    """
    min_tweets = 4  # conversations with fewer tweets than this are dropped below
    if max_level < min_tweets:
        raise ValueError(
            'max_level must be at least {} because conversations with fewer '
            'tweets are discarded; got {}'.format(min_tweets, max_level)
        )

    link_columns = ['in_reply_to_status_id', 'in_reply_to_user_id']
    tweets = df[['id', 'user_id'] + link_columns].copy()

    # Levels 1 and 2: pair every tweet with its direct replies. The join uses the
    # `df` argument on both sides rather than a notebook-level global.
    out = tweets.merge(
        tweets,
        left_on='id',
        right_on='in_reply_to_status_id',
        suffixes=('_1', '_2'),
    )
    # The reply's own link columns repeat the level-1 id/user, so drop them by name.
    out = out.drop(columns=[name + '_2' for name in link_columns])

    # Filter out non-airline conversations (project helper from tools.cleaning).
    out = filter_non_airlines_conversation(['user_id_1', 'user_id_2'], out, airlines_id)

    # Keep conversations whose first tweet is either an original tweet (not a
    # reply) or a reply addressed to the very user who then answers it. The
    # filter works on tweet ids, so every row sharing a qualifying first tweet stays.
    starts_from_original = out['in_reply_to_status_id_1'].isna()
    addressed_to_replier = out['in_reply_to_user_id_1'] == out['user_id_2']
    starter_ids = out.loc[starts_from_original | addressed_to_replier, 'id_1']
    out = out[out['id_1'].isin(starter_ids)]

    # Candidate replies: one per (parent tweet, author) pair.
    replies = tweets.dropna(subset=link_columns)
    replies = replies.drop_duplicates(subset=['in_reply_to_status_id', 'user_id'])

    # Attach one more tweet per pass until max_level is reached.
    for level in range(3, max_level + 1):
        out = out.merge(
            replies,
            left_on=['id_' + str(level - 1), 'user_id_' + str(level - 2)],
            right_on=['in_reply_to_status_id', 'user_id'],
            how='left',
        )
        out = out.drop(columns=link_columns)
        out = out.rename(columns={
            'id': 'id_' + str(level),
            'user_id': 'user_id_' + str(level),
        })
        if verbose:
            print(level + 1)

    # Clean the final table: record whether the first tweet was a reply, then
    # drop the level-1 link columns and add the airline columns.
    out = out.assign(reply=out['in_reply_to_status_id_1'].notna())
    out = out.drop(columns=['in_reply_to_status_id_1', 'in_reply_to_user_id_1'])
    out = add_airlines_id(out, lst=['user_id_1', 'user_id_2'])

    # Keep only conversations that reach at least four tweets.
    out = out.dropna(subset=['id_' + str(min_tweets)])

    # Label who wrote the first tweet of the conversation.
    opened_by_airline = out['user_id_1'].isin(list(airlines_id.values()))
    out['conversation_opener'] = opened_by_airline.map({True: 'airline', False: 'customer'})

    return out[sorted(out.columns)]


In [8]:
# Extract conversations of up to 7 tweets from the cleaned linking columns.
# The numbers printed below come from the progress print inside the function.
conversation = extract_conversation(df_data, max_level=7)

4
5
6
7
8


In [9]:
# Show the extracted conversations (pandas elides the middle rows of a long frame).
conversation

Unnamed: 0,airline_id,airline_name,conversation_opener,id_1,id_2,id_3,id_4,id_5,id_6,id_7,reply,user_id_1,user_id_2,user_id_3,user_id_4,user_id_5,user_id_6,user_id_7
0,18332190,British_Airways,customer,1.131173e+18,1.131176e+18,1.131180e+18,1.131316e+18,1.131320e+18,1.131322e+18,,False,1662186764,18332190,1.662187e+09,1.833219e+07,1.662187e+09,18332190.0,
11,18332190,British_Airways,airline,1.131176e+18,1.131180e+18,1.131316e+18,1.131320e+18,1.131322e+18,,,True,18332190,1662186764,1.833219e+07,1.662187e+09,1.833219e+07,,
19,218730857,Qantas,customer,1.131177e+18,1.131180e+18,1.131187e+18,1.131187e+18,1.131188e+18,1.131192e+18,1.131194e+18,True,226833772,218730857,2.268338e+08,2.187309e+08,2.268338e+08,218730857.0,226833772.0
20,20626359,VirginAtlantic,customer,1.131177e+18,1.131183e+18,1.131185e+18,1.131193e+18,,,,False,22672731,20626359,2.267273e+07,2.062636e+07,,,
23,22536055,AmericanAir,customer,1.131178e+18,1.131186e+18,1.131384e+18,1.131387e+18,1.131571e+18,,,False,901094791093653504,22536055,9.010948e+17,2.253606e+07,9.010948e+17,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16326,18332190,British_Airways,customer,1.135087e+18,1.135091e+18,1.135092e+18,1.135097e+18,1.135098e+18,,,True,1143830054,18332190,1.143830e+09,1.833219e+07,1.143830e+09,,
16330,18332190,British_Airways,airline,1.135089e+18,1.135089e+18,1.135092e+18,1.135094e+18,,,,True,18332190,949689445539831808,1.833219e+07,9.496894e+17,,,
16333,38676903,easyJet,customer,1.135090e+18,1.135097e+18,1.135098e+18,1.135100e+18,,,,True,143842056,38676903,1.438421e+08,3.867690e+07,,,
16336,18332190,British_Airways,airline,1.135091e+18,1.135092e+18,1.135097e+18,1.135098e+18,,,,True,18332190,1143830054,1.833219e+07,1.143830e+09,,,
