In [52]:
import pandas as pd 
import numpy as np 
from typing import Dict, List, Tuple
import os 
import sqlite3
import time 
from Tools.Cleaning import remove_duplicated_text, drop_invalid_reply, filter_non_airlines_conversation, add_airlines_id, change_type
from definitions.airlines import airlines_id, airlines_name
import re

In [53]:
#path
path_database = '../database/database.db'
path_pickle = '../pickle_files/everything_tweets.p'

In [54]:
#connect to the database 
con = sqlite3.connect(path_database)
cur = con.cursor()

In [None]:
#pull data
query = '''
    SELECT id, 
    in_reply_to_status_id,
    in_reply_to_user_id,
    timestamp_ms,
    user_id,
    text
    FROM general_tweets
'''
df = pd.read_sql_query(query, con)

In [55]:
#fix type for timestamp object
df['timestamp_ms'] = pd.to_datetime(df.timestamp_ms)

In [56]:
#option 2: pull data from pickle file
df = pd.read_pickle(path_pickle)

#### 2. Define new conversation  

In [57]:
class Conversation:
    """ Represent a conversation
    """
    def __init__(
        self, 
        person1_id : int,
        person2_id : int,
        lst_tweets_id: List[int],
        airline : str
    ):
        """initialize a person
        :parameter user id of person 1, user id of person 2, a list of id a tweets in the conversation,
        name of the airline this conversation belong to
        """
        self.person1_id = person1_id
        self.person2_id = person2_id
        self.lst_tweets_id = lst_tweets_id
        self.airline = airline 
        
#     def extract_conversation(df : pd.DataFrame) -> List[Conversation]:
        

#### 3. Extract conversation

In [58]:
df = df.replace(to_replace='', value=np.NaN)

In [59]:
df_data = df[[
    'id',
    'user_id',
    'in_reply_to_status_id',
    'in_reply_to_user_id',
]]

In [60]:
df_data = df_data.astype({
    'id' : 'float64',
    'user_id' : 'float64',
    'in_reply_to_status_id' : 'float64',
    'in_reply_to_user_id' : 'float64'
})

df_data = df_data.astype({
    'user_id' : 'Int64'
})

In [61]:
#drop invalid reply
df_data = drop_invalid_reply(df_data)

In [62]:
df_data[df_data.user_id.isna()]

Unnamed: 0,id,user_id,in_reply_to_status_id,in_reply_to_user_id


In [63]:
def extract_conversation(df: pd.DataFrame, max_level=7) -> pd.DataFrame:
    df_copy = df[[
        'id',
        'user_id',
        'in_reply_to_status_id',
        'in_reply_to_user_id',
    ]].copy()
    
    #initialize the conversation dataframe
    out = df_copy.merge(df_data, left_on=['id'], right_on=['in_reply_to_status_id'], suffixes=['_1', '_2'])
    out = out.drop(columns=[x for x in out.columns[-2:]])
    conversation_level = 3
    
    #filter out non airlines conversation 
    out = filter_non_airlines_conversation(['user_id_1', 'user_id_2'], out, airlines_id)
    
    #filter out only conversations start from an original tweet or an airlines-crossroads tweet
    or_lst = [tw_id for tw_id in out[out.in_reply_to_status_id_1.isna()].id_1]  # get tweets id of original one
    cr_lst = [tw_id for tw_id in out[out.in_reply_to_user_id_1 == out.user_id_2].id_1] # get tweet airlines-crossroads tweet
    out = out[out['id_1'].isin(set(or_lst).union(set(cr_lst)))]
    
    # extract only the reply tweets to map to original
    df_reply = df_copy.dropna(subset=['in_reply_to_status_id', 'in_reply_to_user_id']) 
    df_reply = df_reply.drop_duplicates(subset=['in_reply_to_status_id', 'user_id'])
    
    #map tweets together into a conversation 
    while not df_copy[df.columns[-1]].isnull().all() and conversation_level <= max_level:
        out = out.merge(df_reply,
           left_on=['id_' + str(conversation_level - 1), 'user_id_' + str(conversation_level - 2)],
           right_on=['in_reply_to_status_id', 'user_id'],
           how='left')
        out = out.drop(columns=[x for x in out.columns[-2:]])
        out = out.rename(columns={'id' : 'id_' + str(conversation_level), 'user_id' : 'user_id_' + str(conversation_level)})
        conversation_level += 1
        print(conversation_level)
    
    #clean the final table 
    out['reply'] = out['in_reply_to_status_id_1'].notna()
    out = out.drop(columns=['in_reply_to_status_id_1', 'in_reply_to_user_id_1'])
    out = add_airlines_id(out, lst=['user_id_1', 'user_id_2'])
    
    #filter out the ones have only 2 tweets involve
    out = out.dropna(subset=['id_4'])
    
    # add a conversation_opener attributes
    lst = []
    for x in out.user_id_1.isin(airlines_id.values()):
        if x:
            lst.append('airline')
        else:
            lst.append('customer')
    out['conversation_opener'] = lst
    
    return out[sorted(out.columns)]


In [64]:
# Extract conversations
conversation = extract_conversation(df_data, max_level=7)

4
5
6
7
8


In [65]:
conversation

Unnamed: 0,airline_id,airline_name,conversation_opener,id_1,id_2,id_3,id_4,id_5,id_6,id_7,reply,user_id_1,user_id_2,user_id_3,user_id_4,user_id_5,user_id_6,user_id_7
0,18332190,British_Airways,customer,1.131173e+18,1.131176e+18,1.131180e+18,1.131316e+18,1.131320e+18,1.131322e+18,,False,1662186764,18332190,1662186764,18332190,1662186764,18332190,
11,18332190,British_Airways,airline,1.131176e+18,1.131180e+18,1.131316e+18,1.131320e+18,1.131322e+18,,,True,18332190,1662186764,18332190,1662186764,18332190,,
19,218730857,Qantas,customer,1.131177e+18,1.131180e+18,1.131187e+18,1.131187e+18,1.131188e+18,1.131192e+18,1.131194e+18,True,226833772,218730857,226833772,218730857,226833772,218730857,226833772
20,20626359,VirginAtlantic,customer,1.131177e+18,1.131183e+18,1.131185e+18,1.131193e+18,,,,False,22672731,20626359,22672731,20626359,,,
23,22536055,AmericanAir,customer,1.131178e+18,1.131186e+18,1.131384e+18,1.131387e+18,1.131571e+18,,,False,901094791093653504,22536055,901094791093653504,22536055,901094791093653504,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16326,18332190,British_Airways,customer,1.135087e+18,1.135091e+18,1.135092e+18,1.135097e+18,1.135098e+18,,,True,1143830054,18332190,1143830054,18332190,1143830054,,
16330,18332190,British_Airways,airline,1.135089e+18,1.135089e+18,1.135092e+18,1.135094e+18,,,,True,18332190,949689445539831808,18332190,949689445539831808,,,
16333,38676903,easyJet,customer,1.135090e+18,1.135097e+18,1.135098e+18,1.135100e+18,,,,True,143842056,38676903,143842056,38676903,,,
16336,18332190,British_Airways,airline,1.135091e+18,1.135092e+18,1.135097e+18,1.135098e+18,,,,True,18332190,1143830054,18332190,1143830054,,,


In [66]:
#extracting KLM only conversations
convo_KLM = conversation['airline_name'] == "KLM"
convo_snip1 = conversation[convo_KLM]
#counting rows of the extracted df
count_row_KLM = convo_snip1.shape[0]

#British Airways
convo_Brit = conversation['airline_name'] == "British_Airways"
convo_snip2 = conversation[convo_Brit]
count_row_Brit = convo_snip2.shape[0]

#Qantas
convo_Qantas = conversation['airline_name'] == "Qantas"
convo_snip3 = conversation[convo_Qantas]
count_row_Qantas = convo_snip3.shape[0]

#VirginAtlantic
convo_Virgin = conversation['airline_name'] == "VirginAtlantic"
convo_snip4 = conversation[convo_Virgin]
count_row_Virgin = convo_snip4.shape[0]

#AirFrance: 106062176
convo_France = conversation['airline_name'] == "AirFrance"
convo_snip5 = conversation[convo_France]
count_row_France = convo_snip5.shape[0]

#AmericanAir: 22536055
convo_American = conversation['airline_name'] == "AmericanAir"
convo_snip6 = conversation[convo_American]
count_row_American = convo_snip6.shape[0]

#Lufthansa: 124476322
convo_Luft = conversation['airline_name'] == "Lufthansa"
convo_snip7 = conversation[convo_Luft]
count_row_Luft = convo_snip7.shape[0]

#AirBerlin: 26223583
convo_Berlin = conversation['airline_name'] == "AirBerlin"
convo_snip8 = conversation[convo_Berlin]
count_row_Berlin = convo_snip8.shape[0]

#AirBerlin assist: 2182373406
convo_BerlinAss = conversation['airline_name'] == "AirBerlin assist"
convo_snip9 = conversation[convo_BerlinAss]
count_row_BerlinAss = convo_snip9.shape[0]

#easyJet: 38676903
convo_easy = conversation['airline_name'] == "easyJet"
convo_snip10 = conversation[convo_easy]
count_row_easy = convo_snip10.shape[0]

#RyanAir: 1542862735
convo_Ryan = conversation['airline_name'] == "RyanAir"
convo_snip11 = conversation[convo_Ryan]
count_row_Ryan = convo_snip11.shape[0]

#SingaporeAir: 253340062
convo_Sing = conversation['airline_name'] == "SingaporeAir"
convo_snip12 = conversation[convo_Sing]
count_row_Sing = convo_snip12.shape[0]

#EtihadAirways: 45621423
convo_Etihad = conversation['airline_name'] == "EtihadAirways"
convo_snip13 = conversation[convo_Etihad]
count_row_Etihad = convo_snip13.shape[0]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# normal plot
airlines = ['KLM','British Airways','AirFrance','AmericanAir','Lufthansa',
    'AirBerlin','AirBerlin assist','easyJet','RyanAir','SingaporeAir',
    'Qantas','EtihadAirways','VirginAtlantic']
conversations = [count_row_KLM,count_row_Brit,count_row_France,count_row_American,count_row_Luft,
                 count_row_Berlin,count_row_BerlinAss,count_row_easy,count_row_Ryan,count_row_Sing,
                 count_row_Qantas,count_row_Etihad,count_row_Virgin]
plt.bar(airlines, conversations, width = 0.8)
plt.xlabel("Airline")
plt.xticks(rotation='vertical')
plt.ylabel("nr. of conversations");

In [None]:
airlines = ['KLM','British Airways','AirFrance','AmericanAir','Lufthansa',
    'AirBerlin','AirBerlin assist','easyJet','RyanAir','SingaporeAir',
    'Qantas','EtihadAirways','VirginAtlantic']
conversations = [count_row_KLM,count_row_Brit,count_row_France,count_row_American,count_row_Luft,
                 count_row_Berlin,count_row_BerlinAss,count_row_easy,count_row_Ryan,count_row_Sing,
                 count_row_Qantas,count_row_Etihad,count_row_Virgin]

#normal plot

bars = plt.barh(airlines,conversations)

#turn

for  bar in bars:
    width = bar.get_width()
    label_y = bar.get_y() + bar.get_height() / 2
    plt.text(width, label_y, s=f'{width}')

plt.xlabel("Nr. of conversations")
plt.ylabel("Airlines")
plt.title("Nr. of conversations per airline")
    
plt.show()

In [67]:
convo_snip=convo_snip1
convo_snip.reset_index()

Unnamed: 0,index,airline_id,airline_name,conversation_opener,id_1,id_2,id_3,id_4,id_5,id_6,id_7,reply,user_id_1,user_id_2,user_id_3,user_id_4,user_id_5,user_id_6,user_id_7
0,73,56377143,KLM,airline,1.131188e+18,1.131191e+18,1.131194e+18,1.131287e+18,1.131293e+18,1.131305e+18,,True,56377143,824207173425262592,56377143,824207173425262592,56377143,824207173425262592,
1,85,56377143,KLM,customer,1.131191e+18,1.131194e+18,1.131287e+18,1.131293e+18,1.131305e+18,,,True,824207173425262592,56377143,824207173425262592,56377143,824207173425262592,,
2,105,56377143,KLM,airline,1.131194e+18,1.131287e+18,1.131293e+18,1.131305e+18,,,,True,56377143,824207173425262592,56377143,824207173425262592,,,
3,454,56377143,KLM,customer,1.131277e+18,1.131283e+18,1.131284e+18,1.131296e+18,,,,False,2186639444,56377143,2186639444,56377143,,,
4,464,56377143,KLM,customer,1.131282e+18,1.131283e+18,1.131284e+18,1.131285e+18,1.131286e+18,1.131288e+18,,False,977096243204804608,56377143,977096243204804608,56377143,977096243204804608,56377143,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,15840,56377143,KLM,customer,1.134903e+18,1.134908e+18,1.134910e+18,1.134911e+18,1.134912e+18,,,False,19294325,56377143,19294325,56377143,19294325,,
170,15856,56377143,KLM,airline,1.134908e+18,1.134910e+18,1.134911e+18,1.134912e+18,,,,True,56377143,19294325,56377143,19294325,,,
171,16324,56377143,KLM,customer,1.135087e+18,1.135090e+18,1.135091e+18,1.135093e+18,,,,False,737654755,56377143,737654755,56377143,,,
172,16325,56377143,KLM,customer,1.135087e+18,1.135089e+18,1.135095e+18,1.135097e+18,,,,False,3337513277,56377143,3337513277,56377143,,,


In [157]:
#empty list
list = []

In [163]:
for row, column in convo_ids.iteritems():
    list.append(column)

In [164]:
list

[73       1.131188e+18
 85       1.131191e+18
 105      1.131194e+18
 454      1.131277e+18
 464      1.131282e+18
              ...     
 15840    1.134903e+18
 15856    1.134908e+18
 16324    1.135087e+18
 16325    1.135087e+18
 16349    1.135095e+18
 Name: id_1, Length: 174, dtype: float64,
 73       1.131191e+18
 85       1.131194e+18
 105      1.131287e+18
 454      1.131283e+18
 464      1.131283e+18
              ...     
 15840    1.134908e+18
 15856    1.134910e+18
 16324    1.135090e+18
 16325    1.135089e+18
 16349    1.135097e+18
 Name: id_2, Length: 174, dtype: float64,
 73       1.131194e+18
 85       1.131287e+18
 105      1.131293e+18
 454      1.131284e+18
 464      1.131284e+18
              ...     
 15840    1.134910e+18
 15856    1.134911e+18
 16324    1.135091e+18
 16325    1.135095e+18
 16349    1.135098e+18
 Name: id_3, Length: 174, dtype: float64,
 73       1.131287e+18
 85       1.131293e+18
 105      1.131305e+18
 454      1.131296e+18
 464      1.131285e+18


In [150]:
list = []

convo_ids = convo_snip.iloc[:,3:10]
for column in convo_ids:
    list.append(convo_ids[column].notnull().sum())

In [151]:
list, count_row_KLM

([174, 174, 174, 174, 70, 26, 9], 174)

In [138]:
from statistics import mean, mode

count_1 = count_row_KLM - list[0]
count_2 = count_row_KLM - list[1]
count_3 = count_row_KLM - list[2]
count_4 = count_row_KLM - list[3] - count_5 - count_6 - count_7
count_5 = count_row_KLM - list[4] - count_6 - count_7
count_6 = count_row_KLM - list[5] - count_7
count_7 = count_row_KLM - list[6]

length_convo = [count_4, count_5, count_6, count_7]
length_convo, mode(length_convo)

([69, 35, 17, 9], 69)

In [152]:
list = []

for i in convo_ids.itertuples():
    list.append(i)
    
list


[Pandas(Index=73, id_1=1.1311882878031421e+18, id_2=1.131191297312723e+18, id_3=1.1311944886034637e+18, id_4=1.13128715853756e+18, id_5=1.1312928137763021e+18, id_6=1.1313054572373197e+18, id_7=nan),
 Pandas(Index=85, id_1=1.131191297312723e+18, id_2=1.1311944886034637e+18, id_3=1.13128715853756e+18, id_4=1.1312928137763021e+18, id_5=1.1313054572373197e+18, id_6=nan, id_7=nan),
 Pandas(Index=105, id_1=1.1311944886034637e+18, id_2=1.13128715853756e+18, id_3=1.1312928137763021e+18, id_4=1.1313054572373197e+18, id_5=nan, id_6=nan, id_7=nan),
 Pandas(Index=454, id_1=1.131277493674156e+18, id_2=1.1312826816926106e+18, id_3=1.1312837116795085e+18, id_4=1.1312963914211533e+18, id_5=nan, id_6=nan, id_7=nan),
 Pandas(Index=464, id_1=1.1312816890518282e+18, id_2=1.1312833202669937e+18, id_3=1.1312839552301056e+18, id_4=1.1312853263858852e+18, id_5=1.1312858794509844e+18, id_6=1.1312878382160814e+18, id_7=nan),
 Pandas(Index=470, id_1=1.1312825512245862e+18, id_2=1.1318708414472888e+18, id_3=1.13