In [56]:
import pandas as pd 
import numpy as np 
from typing import Dict, List, Tuple
import os 
import sqlite3
import time 
from Tools.Cleaning import remove_duplicated_text, drop_invalid_reply, filter_non_airlines_conversation, add_airlines_id, change_type
from definitions.airlines import airlines_id, airlines_name
import re

In [57]:
# Input locations, given relative to the notebook's working directory.
# path_database is the SQLite file opened with sqlite3.connect;
# path_pickle is the pickled DataFrame read as the alternative data source.
path_database = '../database/database.db'
path_pickle = '../pickle_files/everything_tweets.p'

In [58]:
# Open the SQLite database at path_database and create a cursor on that connection.
# NOTE(review): the connection is never closed in the visible cells — close it
# (con.close()) once the data has been loaded.
con = sqlite3.connect(path_database)
cur = con.cursor()

In [104]:
# Option 1: pull the data from the database.
# Selects the tweet id, the two reply pointers (replied-to tweet and replied-to user),
# the timestamp, the author id and the text of every row in general_tweets,
# and loads the result into the DataFrame `df`.
query = '''
    SELECT id, 
    in_reply_to_status_id,
    in_reply_to_user_id,
    timestamp_ms,
    user_id,
    text
    FROM general_tweets
'''
df = pd.read_sql_query(query, con)

In [105]:
# Convert the timestamp_ms column to pandas datetimes (overwrites the column in place).
# NOTE(review): pd.to_datetime interprets bare integers as nanoseconds since the epoch.
# If this column holds epoch milliseconds, as its name suggests, the call needs
# unit='ms' — confirm against the dtype actually stored in the database.
df['timestamp_ms'] = pd.to_datetime(df.timestamp_ms)

In [None]:
# Option 2: load the tweets from the pickle file instead of the database.
# Running this cell rebinds `df`, replacing any frame built from the SQL query.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
df = pd.read_pickle(path_pickle)

#### 2. Define the Conversation class

In [7]:
class Conversation:
    """A conversation between two users, attributed to one airline.

    Plain data holder: the constructor arguments are stored as attributes
    without validation or copying.
    """

    def __init__(
        self,
        person1_id: int,
        person2_id: int,
        lst_tweets_id: List[int],
        airline: str,
    ):
        """Initialize a conversation.

        :parameter person1_id: user id of the first participant
        :parameter person2_id: user id of the second participant
        :parameter lst_tweets_id: ids of the tweets in the conversation
            (the list is stored by reference, not copied)
        :parameter airline: name of the airline this conversation belongs to
        """
        self.person1_id, self.person2_id = person1_id, person2_id
        self.lst_tweets_id, self.airline = lst_tweets_id, airline
        
#     def extract_conversation(df : pd.DataFrame) -> List[Conversation]:
        

#### 3. Extract conversations

In [101]:
# Turn empty strings into missing values so that absent reply pointers are
# recognised by isna()/dropna() further down.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace(to_replace='', value=np.nan)

In [102]:
# Keep only the four columns needed to link tweets into conversations:
# the tweet id, its author, and the tweet/user it replies to.
df_data = df[[
    'id',
    'user_id',
    'in_reply_to_status_id',
    'in_reply_to_user_id',
]]

In [109]:
# Cast all four id columns to float64 (a dtype that can hold NaN for missing
# reply pointers), then convert user_id on to the nullable integer dtype Int64.
# NOTE(review): float64 has a 53-bit mantissa, so ids around 1.2e18 (as shown in
# the outputs of this notebook) cannot all be represented exactly and distinct
# ids may collapse to the same value. Consider casting to 'Int64' without the
# float64 detour — confirm the source dtypes allow it.
df_data = df_data.astype({
    'id' : 'float64',
    'user_id' : 'float64',
    'in_reply_to_status_id' : 'float64',
    'in_reply_to_user_id' : 'float64'
})

df_data = df_data.astype({
    'user_id' : 'Int64'
})

In [97]:
# Drop invalid replies using the project helper Tools.Cleaning.drop_invalid_reply.
# NOTE(review): the helper's exact criteria are not visible in this notebook —
# see Tools/Cleaning for what counts as an invalid reply.
df_data = drop_invalid_reply(df_data)

In [103]:
# Sanity check: display the rows whose user_id is missing.
df_data[df_data.user_id.isna()]

Unnamed: 0,id,user_id,in_reply_to_status_id,in_reply_to_user_id
2583570,1.2387358628437443e+18,,1.2387354695481303e+18,132903712.0
2583571,1.2387358903332618e+18,,,
2583572,1.2387359183511634e+18,,1.2387355800344576e+18,17626165.0
2583573,1.2387359259428536e+18,,1.238719154703696e+18,200816053.0
2583574,1.2387359272095744e+18,,1.238735200093438e+18,1662186764.0
...,...,...,...,...
2918235,1.244696588255146e+18,,,
2918236,1.2446966311671235e+18,,,
2918237,1.2446966414011638e+18,,1.24467730459861e+18,396021583.0
2918238,1.2446966829793034e+18,,1.2446944531908977e+18,521835883.0


In [79]:
def extract_conversation(df: pd.DataFrame, max_level=7) -> pd.DataFrame:
    """Chain reply tweets into one wide row per conversation.

    Every tweet (level 1) is paired with each direct reply to it (level 2).
    The chain is then extended one level at a time: the tweet at level k must
    reply to the tweet at level k-1 AND be written by the author of the tweet
    at level k-2, so a conversation strictly alternates between two users.

    :parameter df: frame with at least the columns 'id', 'user_id',
        'in_reply_to_status_id' and 'in_reply_to_user_id'; any other columns
        are ignored. The frame itself is not modified.
    :parameter max_level: highest conversation level (number of tweets) to
        chain. Must be at least 4, because rows without an 'id_4' value are
        dropped at the end.
    :return: frame with columns sorted alphabetically: 'id_<k>' and
        'user_id_<k>' for k = 1..max_level (NaN where the conversation is
        shorter), 'reply' (True when tweet 1 is itself a reply),
        'conversation_opener' ('airline' or 'customer'), plus whatever
        columns add_airlines_id contributes.
    :raises KeyError: if a required column is missing from df, or if no
        'id_4' column was produced (max_level < 4, or df has no replies).
    """
    reply_cols = ['in_reply_to_status_id', 'in_reply_to_user_id']
    df_copy = df[['id', 'user_id'] + reply_cols].copy()

    # Initialize the conversation frame: tweet (suffix _1) joined with each
    # direct reply to it (suffix _2). The frame is joined with itself rather
    # than with a notebook-level global, so the result depends only on `df`.
    out = df_copy.merge(
        df_copy,
        left_on=['id'],
        right_on=['in_reply_to_status_id'],
        suffixes=('_1', '_2'),
    )
    # The reply pointers of tweet 2 are implied by the join; drop them by name.
    out = out.drop(columns=[col + '_2' for col in reply_cols])
    conversation_level = 3

    # Filter out non-airline conversations (project helper).
    out = filter_non_airlines_conversation(['user_id_1', 'user_id_2'], out, airlines_id)

    # Keep only conversations that start from an original tweet (tweet 1 is
    # not a reply) or from an "airlines-crossroads" tweet (tweet 1 replies to
    # the very user who wrote tweet 2). Membership is tested on id_1, so every
    # row sharing a qualifying first tweet is kept.
    is_original = out.in_reply_to_status_id_1.isna()
    is_crossroad = out.in_reply_to_user_id_1 == out.user_id_2
    start_ids = set(out[is_original].id_1).union(out[is_crossroad].id_1)
    out = out[out['id_1'].isin(start_ids)]

    # Reply tweets only, at most one per (replied-to tweet, author) pair so
    # the left joins below cannot multiply rows.
    df_reply = df_copy.dropna(subset=reply_cols)
    df_reply = df_reply.drop_duplicates(subset=['in_reply_to_status_id', 'user_id'])

    # Nothing can be chained when the frame contains no reply at all. The
    # check is loop-invariant, so it is evaluated once, and it names the
    # column explicitly instead of relying on the column order of `df`.
    has_replies = not df_copy['in_reply_to_user_id'].isnull().all()

    # Map tweets together into a conversation, one level per iteration.
    while has_replies and conversation_level <= max_level:
        previous_tweet = f'id_{conversation_level - 1}'
        expected_author = f'user_id_{conversation_level - 2}'
        out = out.merge(
            df_reply,
            left_on=[previous_tweet, expected_author],
            right_on=['in_reply_to_status_id', 'user_id'],
            how='left',
        )
        out = out.drop(columns=reply_cols)
        out = out.rename(columns={
            'id': f'id_{conversation_level}',
            'user_id': f'user_id_{conversation_level}',
        })
        conversation_level += 1
        # Progress output: prints the level that would be processed next.
        print(conversation_level)

    # Clean the final table.
    out = out.assign(reply=out['in_reply_to_status_id_1'].notna())
    out = out.drop(columns=['in_reply_to_status_id_1', 'in_reply_to_user_id_1'])
    out = add_airlines_id(out, lst=['user_id_1', 'user_id_2'])

    # Keep only conversations that reach a 4th tweet.
    # NOTE(review): the previous comment here said conversations with "only 2
    # tweets" are filtered out, but the code also drops 3-tweet conversations —
    # confirm which threshold is intended.
    out = out.dropna(subset=['id_4'])

    # Add a conversation_opener attribute: who wrote the first tweet.
    opened_by_airline = out.user_id_1.isin(list(airlines_id.values()))
    out = out.assign(conversation_opener=[
        'airline' if is_airline else 'customer' for is_airline in opened_by_airline
    ])

    return out[sorted(out.columns)]


In [80]:
# Extract conversations of up to 7 tweets from the cleaned reply data.
# The function prints one number per chained level as progress output.
conversation = extract_conversation(df_data, max_level=7)

4
5
6
7
8


In [81]:
# Display the extracted conversations (one row per conversation).
conversation

Unnamed: 0,airline_id,airline_name,conversation_opener,id_1,id_2,id_3,id_4,id_5,id_6,id_7,reply,user_id_1,user_id_2,user_id_3,user_id_4,user_id_5,user_id_6,user_id_7
36669,22536055,AmericanAir,airline,1.188868e+18,1.188869e+18,1.18887e+18,1.18887e+18,,,,False,22536055,34345607,22536055,34345607,,,
