In [135]:
import numpy as np 
import pandas as pd 
import sqlite3
import os 
import ast
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from Tools.conversation import start_end_customers_filter

In [136]:
# Open the project's SQLite database; the path is relative to the working directory.
DB_PATH = '../../database/database(620).db'
con = sqlite3.connect(DB_PATH)

#### 1. Get geo, time, and sentiment score tables

In [137]:
# Pull each tweet's id together with its raw place payload and its timestamp.
query1 = "SELECT id, place, timestamp_ms FROM general_tweets"
df1 = pd.read_sql_query(sql=query1, con=con)

In [139]:
# Split the raw pull into a geo lookup and a time lookup, both keyed by tweet id.
# Empty strings are converted to NaN so they count as missing values.
geo_columns = ['id', 'place']
time_columns = ['id', 'timestamp_ms']
df_geo = df1.loc[:, geo_columns].replace('', np.nan)
df_time = df1.loc[:, time_columns].replace('', np.nan)
# Rebind df1 so the full query result is no longer referenced.
df1 = []

In [140]:
# Sentiment scores keyed by id; joined onto conversation tweet ids further down.
query2 = "SELECT id, sentiment_score FROM sentiment_table"
df_sentiment = pd.read_sql_query(sql=query2, con=con)

#### 2. Get conversation table 

In [141]:
# Each conversation row (up to seven tweet ids plus airline metadata) is joined
# with the timestamp of its first tweet (g.id matches c.id_1).
query3 = """
    SELECT c.id_1, c.id_2, c.id_3, c.id_4, c.id_5, c.id_6, c.id_7, 
    c.airline_id, c.airline_name, c.conversation_opener, g.timestamp_ms
    FROM conversation as c, general_tweets as g
    WHERE g.id == c.id_1
"""
df3 = pd.read_sql_query(sql=query3, con=con)

In [143]:
# Keep only KLM conversations and treat empty strings as missing values.
is_klm = df3['airline_name'] == 'KLM'
df_KLM = df3.loc[is_klm].copy().replace('', np.nan)

In [144]:
# Attach the place of every tweet in the conversation (id_1 ... id_7) to df_KLM.
#
# Resulting columns (exactly the names the following cells read):
#   id_1 .. id_6 -> place_1 .. place_6
#   id_7         -> place
#
# The incoming column is named explicitly before each merge. Relying on merge
# suffixes is fragile because pandas only applies them when column names
# collide, so the naming would depend on how many merges have already run.
lst = ['id_1', 'id_2', 'id_3', 'id_4', 'id_5', 'id_6', 'id_7']
for position, id_col in enumerate(lst, start=1):
    # The last tweet position keeps the plain name 'place'.
    place_col = 'place' if position == len(lst) else f'place_{position}'
    geo_for_position = df_geo.rename(columns={'place': place_col})
    df_KLM = (
        df_KLM
        # Left join keeps every conversation, even when the tweet has no geo row.
        .merge(right=geo_for_position, left_on=id_col, right_on='id', how='left')
        # The lookup key from df_geo is redundant once the place is attached.
        .drop(columns='id')
    )

#### 3. Extract general geo for a conversation 

In [145]:
# Sanity check: in conversations opened by the airline, the tweets at positions
# 1 and 3 never carry geo information.
airline_opened = df_KLM[df_KLM['conversation_opener'] == 'airline']
(airline_opened['place_1'].isna().all(),
 airline_opened['place_3'].isna().all())

(True, True)

In [146]:
# For every conversation, take the first available place among its tweets
# (in conversation order) and parse it from its string form into a Python
# object. Conversations without any place get NaN.
#
# Plain tuples are iterated instead of using iterrows() and building a new
# Series per row; the result is the same but far cheaper to compute.
place_cols = ['place_1', 'place_2', 'place_3', 'place_4', 'place_5', 'place_6', 'place']

lst_geo = []
for places in df_KLM[place_cols].itertuples(index=False, name=None):
    # First non-missing place of the conversation, or None when all are missing.
    first_place = next((p for p in places if pd.notna(p)), None)
    if first_place is None:
        lst_geo.append(np.nan)
    else:
        # literal_eval only accepts Python literals, so it cannot execute code.
        lst_geo.append(ast.literal_eval(first_place))

In [147]:
# Drop the unnecessary per-tweet place columns and add the single geo attribute.
# The columns are selected by name rather than by position, so the step keeps
# working if the conversation query gains or loses a column.
per_tweet_place_cols = [col for col in df_KLM.columns if col.startswith('place')]
df_KLM = df_KLM.drop(columns=per_tweet_place_cols)
# lst_geo holds one entry per row of df_KLM, in row order.
df_KLM['place'] = lst_geo

In [148]:
# Keep only the conversations for which a place could be extracted.
df_KLM = df_KLM.dropna(subset=['place'])
df_KLM.shape  # only 460 conversations carry geo information

(460, 12)

#### 4. Get sentiment score change 

In [175]:
# Put the conversation endpoints next to the conversation metadata.
# NOTE(review): this side-by-side concat assumes start_end_customers_filter
# returns one row per df_KLM row, in the same order, with a fresh 0..n-1 index
# (later cells read its 'open' and 'close' columns) — confirm against Tools.conversation.
conversation_ends = start_end_customers_filter(df_KLM)
conversation_meta = df_KLM[['conversation_opener', 'timestamp_ms', 'place']].reset_index(drop=True)
df = pd.concat([conversation_ends, conversation_meta], axis=1)

In [176]:
# Normalise dtypes before joining: parse the conversation timestamps and make
# sure the sentiment scores are 64-bit floats.
df['timestamp_ms'] = pd.to_datetime(df['timestamp_ms'])
df_sentiment = df_sentiment.assign(
    sentiment_score=lambda scores: scores['sentiment_score'].astype('float64')
)

In [177]:
# Attach the sentiment score of the opening tweet. The default inner join drops
# conversations whose 'open' id has no match in df_sentiment.
open_scored = pd.merge(df, df_sentiment, left_on='open', right_on='id')
df = open_scored.drop(columns='id')
df.shape

(312, 6)

In [178]:
# Attach the sentiment score of the closing tweet. Both frames now have a
# 'sentiment_score' column, so pandas' default suffixes rename them:
# '_x' is the existing (opening) score and '_y' the newly joined (closing) score.
close_scored = pd.merge(df, df_sentiment, left_on='close', right_on='id')
df = close_scored.drop(columns='id')
df.shape

(294, 7)

In [179]:
# Sentiment change over the conversation: closing score (_y) minus opening score (_x).
opening_score = df['sentiment_score_x']
closing_score = df['sentiment_score_y']
df['sentiment_change'] = closing_score - opening_score

In [181]:
# Final view: timestamp of the conversation's first tweet, its place, and the sentiment change.
result_columns = ['timestamp_ms', 'place', 'sentiment_change']
df.loc[:, result_columns]

Unnamed: 0,timestamp_ms,place,sentiment_change
0,2019-05-27 09:47:13.179,"{'id': '9f659d51e5c5deae', 'url': 'https://api...",-0.830383
1,2019-05-27 09:57:58.071,"{'id': '9f659d51e5c5deae', 'url': 'https://api...",-0.303868
2,2019-05-27 10:04:50.567,"{'id': '9f659d51e5c5deae', 'url': 'https://api...",-0.303868
3,2019-05-30 07:34:03.209,"{'id': 'cd003ebe3a96fcc6', 'url': 'https://api...",1.060274
4,2019-05-30 13:01:59.978,"{'id': '178a87b8e2eaa375', 'url': 'https://api...",-0.238914
...,...,...,...
289,2020-03-03 07:30:42.909,"{'id': 'cd003ebe3a96fcc6', 'url': 'https://api...",0.584146
290,2020-03-05 07:26:29.002,"{'id': 'cd003ebe3a96fcc6', 'url': 'https://api...",0.584146
291,2020-03-05 10:46:08.073,"{'id': '97bcdfca1a2dca59', 'url': 'https://api...",1.788668
292,2020-03-10 14:59:16.494,"{'id': '521b6f591c4b3ce8', 'url': 'https://api...",-0.405916
