In [1]:
import numpy as np
import pandas as pd 
import sqlite3 
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from Tools.conversation import start_end_customers_filter

#### 1. Load the DataFrames from the SQLite database

In [2]:
# Location of the SQLite database, relative to the notebook's working directory.
DB_PATH = Path('../../database/database(620).db')

# sqlite3.connect() silently creates a new, empty database when the file does not
# exist, which would only surface later as a confusing "no such table" error.
# Fail fast with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH.resolve()}")

con = sqlite3.connect(str(DB_PATH))

In [3]:
# Conversation table: the seven tweet-id slots of each conversation plus airline
# metadata, joined to general_tweets on the first tweet (id_1) to pick up that
# tweet's timestamp.
query = """
    SELECT c.id_1, c.id_2, c.id_3, c.id_4, c.id_5, c.id_6, c.id_7, c.conversation_opener, c.airline_id, c.airline_name,
    g.timestamp_ms
    FROM conversation as c, general_tweets as g
    WHERE c.id_1 == g.id
"""
df = pd.read_sql_query(sql=query, con=con)

In [4]:
# Empty strings become NaN, and the timestamp column is parsed into datetimes.
df = (
    df
    .replace('', np.nan)
    .assign(timestamp_ms=lambda frame: pd.to_datetime(frame['timestamp_ms']))
)

In [5]:
# Lookup table: tweet id -> tweet text.
query2 = "SELECT id, text FROM general_tweets"
df_text = pd.read_sql_query(sql=query2, con=con)

In [6]:
# Lookup table: tweet id -> sentiment score.
query3 = "SELECT id, sentiment_score FROM sentiment_table"
df_sentiment = pd.read_sql_query(sql=query3, con=con)

#### 2. Mapping text to tweets in conversation

In [7]:
# Look up the text of every tweet slot (id_1 ... id_7) and attach it to the
# conversation row via a left join on the tweet id.
#
# pandas only applies `suffixes` when both frames already share a column name, so
# the suffix pairs below take effect on every second merge. The resulting text
# columns are named text_1, text_2, text_3, text_4, text_5, text_6 and -- for
# id_7 -- plain `text`.
lst = [f'id_{slot}' for slot in range(1, 8)]
suffixes = [f'_{slot}' for slot in range(8)]
for id_col, suffix_pair in zip(lst, zip(suffixes, suffixes[1:])):
    merged = df.merge(df_text, how='left', left_on=id_col, right_on='id', suffixes=suffix_pair)
    # The lookup key from df_text is redundant once the text is attached.
    df = merged.drop(columns='id')

In [8]:
# Sanity check: first two tweets of the first conversation, joined with a space.
" ".join(df.iloc[0][['text_1', 'text_2']])

"Thanks @British_Airways I really needed the extra 2 hour delay on the flight today with everyone sat on board... @sunriseblade That certainly is annoying. Hopefully, you're on your way now and we're sorry for any inconvenience caused. We hope you enjoy the rest of your day! Liz"

In [9]:
# Build, per conversation (row), one string containing only the customer's tweets.
# When the customer opened the conversation their tweets sit in the odd slots;
# otherwise they sit in the even slots. The column named plain `text` is the
# text for id_7 (that is how the text merge names it).
customer_opened_cols = ['text_1', 'text_3', 'text_5', 'text']
airline_opened_cols = ['text_2', 'text_4', 'text_6']

lst_full_text = [
    " ".join(
        row[customer_opened_cols if row.conversation_opener == 'customer' else airline_opened_cols].dropna()
    )
    for _, row in df.iterrows()
]

In [10]:
# Store the concatenated customer text alongside each conversation.
df = df.assign(all_text=lst_full_text)

#### 3. Compute the sentiment score change

In [11]:
# Put the output of start_end_customers_filter side by side with the
# conversation metadata.
# NOTE(review): concat with axis=1 aligns on index labels, so this assumes the
# helper returns one row per conversation, indexed 0..n-1 in the same order as
# df -- confirm against Tools.conversation.
open_close = start_end_customers_filter(df)
conversation_meta = df[
    ['conversation_opener', 'timestamp_ms', 'all_text', 'airline_name', 'airline_id']
].reset_index(drop=True)
df = pd.concat([open_close, conversation_meta], axis=1)

In [12]:
# Normalise dtypes before joining: timestamps as datetimes, scores as float64.
df['timestamp_ms'] = pd.to_datetime(df['timestamp_ms'])
df_sentiment = df_sentiment.astype({'sentiment_score': 'float64'})

In [13]:
# Attach the sentiment score of each conversation's `open` tweet.
# This is an inner join (pandas' default, spelled out here), so conversations
# whose `open` id has no row in df_sentiment are dropped.
# NOTE(review): the recorded output shows `open`/`close` as floats (~1.13e18);
# float64 cannot represent every id of that size exactly -- confirm the join keys
# are not being rounded.
df = pd.merge(df, df_sentiment, how='inner', left_on='open', right_on='id').drop(columns='id')
df.shape

(63707, 8)

In [14]:
# Attach the sentiment score of each conversation's `close` tweet.
# `sentiment_score` now exists on both sides, so pandas' default suffixes rename
# the existing (open) score to `sentiment_score_x` and the new (close) score to
# `sentiment_score_y`. Inner join: rows without a score for `close` are dropped.
df = pd.merge(df, df_sentiment, how='inner', left_on='close', right_on='id').drop(columns='id')
df.shape

(62015, 9)

In [15]:
# Change in sentiment over the conversation: score of the `close` tweet (_y)
# minus score of the `open` tweet (_x).
df['sentiment_change'] = df['sentiment_score_y'].sub(df['sentiment_score_x'])

In [16]:
# Preview the first rows of the result. The original cell held the truncated
# expression `df.as`, which raises AttributeError on a fresh run.
df.head()

Unnamed: 0,open,close,conversation_opener,timestamp_ms,all_text,airline_name,airline_id,sentiment_score_x,sentiment_score_y,sentiment_change
0,1.1311767275755356e+18,1.1311937151442656e+18,customer,2019-05-22 12:35:22.758,@Qantas On The qantas website @Qantas How do I...,Qantas,218730857,-0.080096,-0.976197,-0.896101
1,1.131186501868372e+18,1.1311937151442656e+18,airline,2019-05-22 12:47:00.229,@Qantas How do I know I won't be ignored again...,Qantas,218730857,-0.617716,-0.976197,-0.358481
2,1.131186501868372e+18,1.1311937151442656e+18,customer,2019-05-22 13:14:13.131,@Qantas How do I know I won't be ignored again...,Qantas,218730857,-0.617716,-0.976197,-0.358481
3,1.131187900576518e+18,1.1311937151442656e+18,airline,2019-05-22 13:17:17.506,@Qantas And last time I gave this info... noth...,Qantas,218730857,-0.890473,-0.976197,-0.085724
4,1.1311768612104316e+18,1.1311854191634472e+18,customer,2019-05-22 12:35:54.619,When you pay for bubble seats 11m in advance o...,VirginAtlantic,20626359,-0.454536,-0.382206,0.07233
