In [255]:
import pandas as pd 
import numpy as np 
from typing import Dict, List, Tuple
import os 
import sqlite3
import time 
from tools.cleaning import remove_duplicated_text, drop_invalid_reply, filter_non_airlines_conversation
from definitions.airlines import airlines_id, airlines_name
import re

In [84]:
# Input locations, given relative to the notebook's working directory.
# path_database: SQLite file opened with sqlite3.connect in the next cell.
# path_pickle: pickled DataFrame loaded with pd.read_pickle further down.
path_database = '../database/database_18.db'
path_pickle = '../pickle_files/everything_tweets.p'

In [3]:
# Open a connection to the SQLite database at path_database.
# NOTE(review): `cur` is not used by any cell visible in this notebook
# (pd.read_sql_query takes `con` directly), and the connection is never
# closed in the visible cells.
con = sqlite3.connect(path_database)
cur = con.cursor()

In [4]:
# Option 1: pull the tweet attributes from the general_tweets table into `df`.
query = '''
    SELECT id, 
    in_reply_to_status_id,
    in_reply_to_user_id,
    timestamp_ms,
    user_id,
    text
    FROM general_tweets
'''
df = pd.read_sql_query(query, con)

In [5]:
# Convert the timestamp_ms column to pandas datetimes.
# NOTE(review): no `unit` is passed. If the column holds integer epoch
# milliseconds (as its name suggests), pd.to_datetime would read them as
# nanoseconds — confirm the stored type before relying on these values.
df['timestamp_ms'] = pd.to_datetime(df.timestamp_ms)

In [85]:
# Option 2: load the tweets from a pickle file instead of the database.
# This rebinds `df`, so it replaces the frame built from the SQL query when
# both cells are run.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle(path_pickle)

#### 2. Define a new conversation

In [7]:
class Conversation:
    """Represent a conversation between two users.

    Plain data holder: it stores the constructor arguments as attributes and
    defines no other behaviour.
    """
    def __init__(
        self, 
        person1_id : int,
        person2_id : int,
        lst_tweets_id: List[int],
        airline : str
    ):
        """Initialize a conversation.

        :param person1_id: user id of person 1
        :param person2_id: user id of person 2
        :param lst_tweets_id: list of the ids of the tweets in the conversation
        :param airline: name of the airline this conversation belongs to
        """
        self.person1_id = person1_id
        self.person2_id = person2_id
        self.lst_tweets_id = lst_tweets_id
        self.airline = airline 
        
#     def extract_conversation(df : pd.DataFrame) -> List[Conversation]:
        

In [8]:
# Keep only the attributes needed in the rest of the notebook.
# `df` is rebound to the narrowed frame.
df = df[['id', 
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'timestamp_ms',
    'user_id',
    'text']]

In [9]:
# Drop invalid replies using the project helper imported from tools.cleaning.
# NOTE(review): the helper's definition is not visible here; what counts as
# "invalid" should be checked in tools/cleaning.
df = drop_invalid_reply(df)

In [10]:
# Work on an independent copy that holds only the id / reply columns.
df_copy = df[['id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'user_id']].copy()
# Original tweets: rows whose in_reply_to_status_id is missing, i.e. tweets
# that do not reply to any status.
df_ori_tweet = df_copy[df_copy['in_reply_to_status_id'].isna()]
# Leftover experiment on a smaller sample (first 10000 tweets):
# df_test = df_copy.iloc[0:10000].copy()
# df_ori_test = df_test[df_test['in_reply_to_status_id'].isna()]

In [11]:
# Pair every original tweet (left) with each tweet that replies to it (right).
# merge defaults to an inner join, so originals without any reply are dropped.
out = df_ori_tweet.merge(df_copy, left_on='id', right_on='in_reply_to_status_id')

In [12]:
# Rename the columns positionally: the first four names belong to the original
# tweet (left side of the merge), the last four to the tweet replying to it.
# This relies on the column order produced by the merge in the previous cell.
out.columns = [
    'ori_tweet_id',
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'first_person_id',
    'second_level_tw_id',
    'in_reply_to_status_id_2',
    'in_reply_to_user_id_2',
    'second_person_id'
]

In [13]:
# Keep only the necessary attributes: the two tweet ids and their authors.
df_out = out[[
    'ori_tweet_id',
    'first_person_id',
    'second_level_tw_id',
    'second_person_id'
]]

In [15]:
#filter only one that have the at least one person in 2 persons is an airline
def filter_non_airlines_conversation(lst : List[str], df : pd.DataFrame, airlines_id : Dict) -> pd.DataFrame:
    """
    Keep only the conversations in which at least one participant is an airline account.

    Parameters
    ----------
    lst : names of the 2 columns holding the participants' user ids,
        for example ['first_person_id', 'second_person_id']
    df : DataFrame of conversations; it is left untouched
    airlines_id : dictionary whose values are the airline user ids,
        for example {'KLM' : 12231421}

    Returns
    ----------
    a new DataFrame with the rows where either of the two columns holds an airline id
    """
    first_col, second_col = lst[0], lst[1]
    airline_ids = airlines_id.values()

    # Filter a copy so the caller's frame is never touched.
    conversations = df.copy()
    first_is_airline = conversations[first_col].isin(airline_ids)
    second_is_airline = conversations[second_col].isin(airline_ids)
    return conversations[first_is_airline | second_is_airline]

In [None]:
# Keep only the tweet/reply pairs in which at least one side is an airline account.
# All three arguments are passed positionally, in the order (lst, df, airlines_id):
# a positional argument may not follow a keyword argument, and airlines_id is required.
filter_non_airlines_conversation(['first_person_id', 'second_person_id'], df_out, airlines_id)

In [222]:
#build 2 new attributes to identify conversations below to which airlines 
def add_airlines_id(df : pd.DataFrame, lst : List[str]) -> pd.DataFrame:
    """
    Add the id and the name of the airline involved in each conversation.

    For every row the id in the first column named in `lst` is looked up in the
    module-level `airlines_name` mapping; when it is not a key there, the id in
    the second column is used instead.

    Parameters:
    -----------
    df : dataframe of conversations; it is not modified
    lst : list with the names of the 2 columns holding the participants' user ids

    Returns:
    ----------
    copy of df with two extra attributes: 'airline_id' (the id that was found)
    and 'airline_name' (the value `airlines_name` maps that id to)

    Raises:
    ----------
    KeyError : if neither of the two ids of a row is a key of `airlines_name`
    """
    df_out = df.copy()
    airline_lst = []
    airline_name_lst = []
    for first_id, second_id in zip(df_out[lst[0]], df_out[lst[1]]):
        # Explicit membership test instead of a bare `except:`, so unrelated
        # errors are no longer silently turned into "use the second column".
        airline_id = first_id if first_id in airlines_name else second_id
        airline_name_lst.append(airlines_name[airline_id])
        airline_lst.append(airline_id)
    df_out['airline_id'] = airline_lst
    df_out['airline_name'] = airline_name_lst

    return df_out

In [16]:
#recursive function
def recur(looking_tweet_id : int, looking_user_id : int, df):
    """
    Follow the chain of replies that starts at one tweet.

    A reply is a row of `df` whose 'in_reply_to_status_id' equals
    `looking_tweet_id` and whose 'user_id' equals `looking_user_id`. For each
    reply the search continues with the reply's own id and the user it
    replied to ('in_reply_to_user_id').

    Returns the string 'none' when no reply is found, otherwise a dictionary
    {looking_tweet_id: <result for the reply>}. When several replies match,
    they all write to the same key, so only the last one is kept.
    """
    is_reply_to_tweet = df['in_reply_to_status_id'] == looking_tweet_id
    is_written_by_user = df['user_id'] == looking_user_id
    replies = df[is_reply_to_tweet & is_written_by_user].copy()

    # No matching reply: the chain ends here.
    if replies.empty:
        return 'none'

    chain = {}
    for _, reply in replies.iterrows():
        chain[looking_tweet_id] = recur(reply['id'], reply['in_reply_to_user_id'], df)
    return chain

In [17]:
# Build one nested reply chain per (original tweet, replier) pair of df_out and
# print how long it takes (in seconds).
# Each call scans df_copy again at every level of the chain, which is why this
# cell is slow (see the elapsed time recorded below).
start = time.time()
lst = []
for index in range(len(df_out)):
    lst.append(recur(df_out.iloc[index]['ori_tweet_id'], df_out.iloc[index]['second_person_id'], df_copy))

end = time.time()
print(end - start)

84.49556064605713


In [18]:
# Inspect the first chain: print what is nested under its top-level key.
# NOTE(review): recur returns the string 'none' when it finds no reply, in
# which case .values() would fail — this assumes lst[0] is a dictionary.
a = lst[0]
for x in a.values():
    print(x)

{1.1311759953215201e+18: {1.131180010230952e+18: {1.1313164357483602e+18: {1.1313199658549658e+18: 'none'}}}}


#### 3. Another solution

In [86]:
# Second approach: start again from `df`, keeping only the id / reply columns.
df_data = df[[
    'id',
    'user_id',
    'in_reply_to_status_id',
    'in_reply_to_user_id',
]]

In [95]:
# Drop invalid replies using the project helper imported from tools.cleaning
# (its definition is not visible in this notebook).
df_data = drop_invalid_reply(df_data)

In [96]:
# Self-join: pair every tweet (columns suffixed _first) with each tweet that
# replies to it (columns suffixed _second).
test = df_data.merge(df_data, left_on=['id'], right_on=['in_reply_to_status_id'], suffixes=['_first', '_second'])

In [97]:
# Remove the reply columns of the second tweet from `test`.
# NOTE: this mutates `test` in place, so running the cell a second time raises
# because the columns are already gone.
test.drop(columns=['in_reply_to_status_id_second', 'in_reply_to_user_id_second'], inplace=True)

In [90]:
# Keep only the pairs in which at least one of the two authors is an airline account.
# NOTE(review): the function defined earlier in this notebook shadows the one
# of the same name imported from tools.cleaning.
test = filter_non_airlines_conversation(['user_id_first', 'user_id_second'], test, airlines_id)

In [91]:
# Reorder the columns alphabetically.
# NOTE(review): the result of .head() is assigned back, so from here on `test`
# holds only its first 5 rows — confirm this truncation is intended.
test = test[sorted(test.columns)].head()

In [92]:
# Third level: attach the tweets that reply to the second tweet and are
# addressed to its author. Left join, so pairs without such a reply are kept
# with missing values in the new columns.
test_2 = test.merge(df_data,
           left_on=['id_second', 'user_id_second'],
           right_on=['in_reply_to_status_id', 'in_reply_to_user_id'],
           how='left')

In [93]:
# Preview of test_2 with alphabetically ordered columns; the trailing
# semicolon suppresses the output.
test_2[sorted(test_2.columns)].head();

In [94]:
# Join df_data onto test_2 once more (colliding columns get the _x / _y suffixes).
# NOTE(review): the join keys are the same as the ones used to build test_2
# ('id_second', 'user_id_second'), so this attaches the same replies again —
# the recorded output shows identical _x and _y columns. A fourth level would
# presumably have to join on the third-level tweet instead; confirm the intent.
test_3 = test_2.merge(df_data,
           left_on=['id_second', 'user_id_second'],
           right_on=['in_reply_to_status_id', 'in_reply_to_user_id'],
           how='left')

In [138]:
# Preview of test_3 with alphabetically ordered columns.
test_3[sorted(test_3.columns)].head()

Unnamed: 0,id_first,id_second,id_x,id_y,in_reply_to_status_id_first,in_reply_to_status_id_x,in_reply_to_status_id_y,in_reply_to_user_id_first,in_reply_to_user_id_x,in_reply_to_user_id_y,user_id_first,user_id_second,user_id_x,user_id_y
0,1.131173e+18,1.131176e+18,1.13118e+18,1.13118e+18,,1.131176e+18,1.131176e+18,,18332190.0,18332190.0,1662186764,18332190,1662187000.0,1662187000.0
1,1.131173e+18,1.131174e+18,,,1.131101e+18,,,38676900.0,,,19610469,38676903,,
2,1.131173e+18,1.131181e+18,,,,,,1542863000.0,,,21404541,1542862735,,
3,1.131173e+18,1.131191e+18,1.131195e+18,1.131195e+18,1.131113e+18,1.131191e+18,1.131191e+18,40830700.0,40830700.0,40830700.0,18332190,40830697,18332190.0,18332190.0
4,1.131174e+18,1.131183e+18,1.131185e+18,1.131185e+18,1.131171e+18,1.131183e+18,1.131183e+18,3110269000.0,3110269000.0,3110269000.0,56377143,3110268790,56377140.0,56377140.0


#### 3.1 Testing

In [13]:
# Sanity check: is the author of the first row of df_data one of the airline ids?
df_data.iloc[0].user_id in airlines_id.values()

False

In [14]:
# Ids of the first-level tweets that are not replies themselves
# (their in_reply_to_status_id is missing).
lst = [x for x in test[test.in_reply_to_status_id_first.isna()].id_first]

In [15]:
# Preview of the tweet/reply pairs.
test.head()

Unnamed: 0,id_first,user_id_first,in_reply_to_status_id_first,in_reply_to_user_id_first,id_second,user_id_second
0,1.131173e+18,1662186764,,,1.131176e+18,18332190
1,1.131173e+18,19610469,1.131101e+18,38676900.0,1.131174e+18,38676903
5,1.131173e+18,18332190,1.131113e+18,40830700.0,1.131191e+18,40830697
6,1.131174e+18,56377143,1.131171e+18,3110269000.0,1.131183e+18,3110268790
7,1.131174e+18,18332190,1.131129e+18,302493700.0,1.131298e+18,302493679


In [16]:
# Ids of the first-level tweets that were themselves addressed to the author
# of the second-level tweet (in_reply_to_user_id_first equals user_id_second),
# followed by a preview of the rows of `test` that start with one of those tweets.
lst = [x for x in test[test.in_reply_to_user_id_first == test.user_id_second].id_first]
test[test['id_first'].isin(lst)].head()

Unnamed: 0,id_first,user_id_first,in_reply_to_status_id_first,in_reply_to_user_id_first,id_second,user_id_second
1,1.131173e+18,19610469,1.131101e+18,38676900.0,1.131174e+18,38676903
5,1.131173e+18,18332190,1.131113e+18,40830700.0,1.131191e+18,40830697
6,1.131174e+18,56377143,1.131171e+18,3110269000.0,1.131183e+18,3110268790
7,1.131174e+18,18332190,1.131129e+18,302493700.0,1.131298e+18,302493679
8,1.131174e+18,131494378,1.131088e+18,38676900.0,1.131175e+18,38676903


In [95]:
# Count the second-level tweets per first-level tweet, then list the
# first-level ids that have more than one.
# NOTE(review): the second condition compares id_first, a tweet id, with the
# airline ids in airlines_id.values(), which elsewhere in this notebook are
# matched against user ids — that may be why the recorded result is empty.
# Confirm whether the author's user id was meant here.
a = test[['id_first', 'id_second']].groupby('id_first').count().reset_index()
list(a[
    (a['id_second'] > 1) &
    (a['id_first'].isin(airlines_id.values()))
].id_first)


[]

#### 3.2 Final implementation (no longer testing)

In [251]:
def extract_conversation(df: pd.DataFrame, max_level=7, min_tweets=4) -> pd.DataFrame:
    """
    Build a table with one conversation per row by chaining replies.

    Levels 1 and 2 come from joining every tweet with the tweets replying to
    it. Each further level n is the reply to tweet n-1 written by the author of
    tweet n-2, so a conversation alternates between the same two users.

    Relies on the module-level `airlines_id` mapping and on the notebook
    functions `filter_non_airlines_conversation` and `add_airlines_id`.

    Parameters
    ----------
    df : tweets with at least the columns 'id', 'user_id',
        'in_reply_to_status_id' and 'in_reply_to_user_id'; other columns are
        ignored and df itself is not modified
    max_level : deepest conversation level to look for. Columns id_n and
        user_id_n are produced for levels 1 and 2 and for every level from 3
        up to max_level, whether or not any conversation reaches that depth
    min_tweets : minimum number of tweets a conversation must contain to be kept

    Returns
    ----------
    DataFrame with alphabetically sorted columns: id_n / user_id_n per level,
    'reply' (whether the first tweet is itself a reply), 'airline_id',
    'airline_name' and 'conversation_opener' ('airline' or 'customer')

    Raises
    ----------
    ValueError : if min_tweets is not between 1 and the deepest level produced
    """
    deepest_level = max(max_level, 2)
    if not 1 <= min_tweets <= deepest_level:
        raise ValueError(
            'min_tweets must be between 1 and {}, got {}'.format(deepest_level, min_tweets)
        )

    tweet_cols = ['id', 'user_id', 'in_reply_to_status_id', 'in_reply_to_user_id']
    reply_cols = ['in_reply_to_status_id', 'in_reply_to_user_id']

    # Work on the argument only (not on a notebook global) and on exactly the
    # columns needed, so the result does not depend on extra columns or their order.
    tweets = df[tweet_cols].copy()

    # initialize the conversation dataframe: tweet (level 1) joined with its replies (level 2)
    out = tweets.merge(tweets, left_on=['id'], right_on=['in_reply_to_status_id'], suffixes=('_1', '_2'))
    out = out.drop(columns=[col + '_2' for col in reply_cols])

    # filter out non airlines conversation
    out = filter_non_airlines_conversation(['user_id_1', 'user_id_2'], out, airlines_id)

    # keep only conversations starting from an original tweet or from a tweet
    # that was itself addressed to the user who then replies to it
    is_original = out['in_reply_to_status_id_1'].isna()
    is_crossroad = out['in_reply_to_user_id_1'] == out['user_id_2']
    # Selection is by starting tweet id: if one row of a starting tweet
    # qualifies, every row that starts with that tweet is kept.
    starter_ids = out.loc[is_original | is_crossroad, 'id_1'].unique()
    out = out[out['id_1'].isin(starter_ids)]

    # extract only the reply tweets; at most one reply per (replied tweet, author)
    replies = tweets.dropna(subset=reply_cols)
    replies = replies.drop_duplicates(subset=['in_reply_to_status_id', 'user_id'])

    # map tweets together into a conversation, one level per iteration
    for level in range(3, max_level + 1):
        out = out.merge(
            replies,
            left_on=['id_' + str(level - 1), 'user_id_' + str(level - 2)],
            right_on=['in_reply_to_status_id', 'user_id'],
            how='left',
        )
        out = out.drop(columns=reply_cols)
        out = out.rename(columns={'id': 'id_' + str(level), 'user_id': 'user_id_' + str(level)})

    # clean the final table
    out = out.assign(reply=out['in_reply_to_status_id_1'].notna())
    out = out.drop(columns=['in_reply_to_status_id_1', 'in_reply_to_user_id_1'])
    out = add_airlines_id(out, lst=['user_id_1', 'user_id_2'])

    # filter out the conversations with fewer than min_tweets tweets
    out = out.dropna(subset=['id_' + str(min_tweets)])

    # add a conversation_opener attribute: who wrote the first tweet
    opened_by_airline = out['user_id_1'].isin(airlines_id.values())
    out = out.assign(conversation_opener=np.where(opened_by_airline, 'airline', 'customer'))

    return out[sorted(out.columns)]


In [252]:
# Shape of `a`.
# NOTE(review): in file order this cell comes before the cell that assigns the
# conversation table to `a`; on a fresh top-to-bottom run `a` would still be
# the groupby result from section 3.1, not the table whose shape is recorded below.
a.shape

(2462, 17)

In [254]:
# Build the conversation table from df_data, following reply chains up to
# level 7, and display it.
a = extract_conversation(df_data, max_level=7)
# a[a.user_id_7.notna()]
a

4
5
6
7
8


Unnamed: 0,airline_id,airline_name,conversation_opener,id_1,id_2,id_3,id_4,id_5,id_6,id_7,reply,user_id_1,user_id_2,user_id_3,user_id_4,user_id_5,user_id_6,user_id_7
0,18332190,British_Airways,customer,1.131173e+18,1.131176e+18,1.131180e+18,1.131316e+18,1.131320e+18,1.131322e+18,,False,1662186764,18332190,1.662187e+09,1.833219e+07,1.662187e+09,18332190.0,
11,18332190,British_Airways,airline,1.131176e+18,1.131180e+18,1.131316e+18,1.131320e+18,1.131322e+18,,,True,18332190,1662186764,1.833219e+07,1.662187e+09,1.833219e+07,,
19,218730857,Qantas,customer,1.131177e+18,1.131180e+18,1.131187e+18,1.131187e+18,1.131188e+18,1.131192e+18,1.131194e+18,True,226833772,218730857,2.268338e+08,2.187309e+08,2.268338e+08,218730857.0,226833772.0
20,20626359,VirginAtlantic,customer,1.131177e+18,1.131183e+18,1.131185e+18,1.131193e+18,,,,False,22672731,20626359,2.267273e+07,2.062636e+07,,,
23,22536055,AmericanAir,customer,1.131178e+18,1.131186e+18,1.131384e+18,1.131387e+18,1.131571e+18,,,False,901094791093653504,22536055,9.010948e+17,2.253606e+07,9.010948e+17,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16326,18332190,British_Airways,customer,1.135087e+18,1.135091e+18,1.135092e+18,1.135097e+18,1.135098e+18,,,True,1143830054,18332190,1.143830e+09,1.833219e+07,1.143830e+09,,
16330,18332190,British_Airways,airline,1.135089e+18,1.135089e+18,1.135092e+18,1.135094e+18,,,,True,18332190,949689445539831808,1.833219e+07,9.496894e+17,,,
16333,38676903,easyJet,customer,1.135090e+18,1.135097e+18,1.135098e+18,1.135100e+18,,,,True,143842056,38676903,1.438421e+08,3.867690e+07,,,
16336,18332190,British_Airways,airline,1.135091e+18,1.135092e+18,1.135097e+18,1.135098e+18,,,,True,18332190,1143830054,1.833219e+07,1.143830e+09,,,


In [249]:
# Label each conversation by who wrote its first tweet: 'airline' when
# user_id_1 is one of the airline ids, 'customer' otherwise.
# NOTE(review): this repeats the conversation_opener loop at the end of
# extract_conversation, and `lst` is not used by any later cell visible here.
lst = []
for x in a.user_id_1.isin(airlines_id.values()):
    if x:
        lst.append('airline')
    else:
        lst.append('customer')

In [None]:
#TODO: remove the tweet id that is present in previous columns already

In [1]:
# Two small toy frames, presumably scratch data for trying out merges.
# NOTE(review): this rebinds `df`, which the earlier cells use for the tweet
# data. The recorded NameError shows the cell was run (execution count 1)
# before the import cell.
df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                   'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
other = pd.DataFrame({'key': ['a', 'b', 'c', 'd'],
                      'B': ['B0', 'B1', 'B2', 'B3']})

NameError: name 'pd' is not defined

In [None]:
# NOTE(review): unfinished cell. `left_on` is not defined as a variable
# anywhere in the visible notebook, so this raises NameError as written; it
# was presumably meant to become a keyword argument (left_on=...). Complete or
# delete this cell.
df.merge(df_data, left_on)