In [1]:
import pandas as pd 
import numpy as np 
from typing import Dict, List, Tuple
import os 
import sqlite3
import time 
from tools.cleaning import remove_duplicated_text, drop_invalid_reply

In [4]:
#path
path_database = '../database/database_18.db'
path_pickle = '../pickle_files/everything_tweets.p'

In [3]:
#connect to the database 
con = sqlite3.connect(path_database)
cur = con.cursor()

In [4]:
#pull data
query = '''
    SELECT id, 
    in_reply_to_status_id,
    in_reply_to_user_id,
    timestamp_ms,
    user_id,
    text
    FROM general_tweets
'''
df = pd.read_sql_query(query, con)

In [5]:
#fix type for timestamp object
df['timestamp_ms'] = pd.to_datetime(df.timestamp_ms)

In [5]:
#option 2: pull data from pickle file
df = pd.read_pickle(path_pickle)

#### 2. Define new conversation  

In [4]:
class Conversation:
    """ Represent a conversation
    """
    def __init__(
        self, 
        person1_id : int,
        person2_id : int,
        lst_tweets_id: List[int],
        airline : str
    ):
        """initialize a person
        :parameter user id of person 1, user id of person 2, a list of id a tweets in the conversation,
        name of the airline this conversation belong to
        """
        self.person1_id = person1_id
        self.person2_id = person2_id
        self.lst_tweets_id = lst_tweets_id
        self.airline = airline 
        
#     def extract_conversation(df : pd.DataFrame) -> List[Conversation]:
        

In [6]:
#get only attributes we need
df = df[['id', 
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'timestamp_ms',
    'user_id',
    'text']]

In [7]:
#drop invalid reply
df = drop_invalid_reply(df)

In [10]:
df_copy = df[['id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'user_id']].copy()
df_ori_tweet = df_copy[df_copy['in_reply_to_status_id'].isna()]
#df_test: first 10000 tweets
# df_test = df_copy.iloc[0:10000].copy()
# df_ori_test = df_test[df_test['in_reply_to_status_id'].isna()]

In [42]:
out = df_ori_tweet.merge(df_copy, left_on='id', right_on='in_reply_to_status_id')

In [43]:
out.columns = [
    'ori_tweet_id',
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'first_person_id',
    'second_level_tw_id',
    'in_reply_to_status_id_2',
    'in_reply_to_user_id_2',
    'second_person_id'
]

In [40]:
out = out.set_index(['ori_tweet_id', 'second_person_id', 'second_level_tw_id'])

In [34]:
out_2 = out.merge(df_copy, left_on=['second_level_tw_id', 'first_person_id'], right_on=['in_reply_to_status_id', 'user_id'])

In [39]:
out_2.set_index(['ori_tweet_id', 'second_level_tw_id', 'id']);

In [44]:
def recur(looking_tweet_id : int, looking_user_id : int, df):
    dct = {}
    df_copy = df[
        (df['in_reply_to_status_id'] == looking_tweet_id) & 
        (df['user_id'] == looking_user_id)
    ].copy()
    
    if df_copy.empty:
        return dct
    
    for index, row in df_copy.iterrows():
        dct[looking_tweet_id] = recur(row['id'], row['in_reply_to_status_id'], df)
    return dct

In [45]:
start = time.time()
lst = []
for index in range(len(out)):
    lst.append(recur(out.iloc[index]['second_level_tw_id'], out.iloc[index]['in_reply_to_user_id_2'], df_copy))

end = time.time()
print(end - start)
lst

90.66311836242676


[{1.1311759953215201e+18: {}},
 {1.1311761586477179e+18: {}},
 {},
 {},
 {},
 {},
 {},
 {1.1311825841662689e+18: {}},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {1.1311862518586327e+18: {}},
 {},
 {1.1311839361120707e+18: {}},
 {},
 {},
 {},
 {},
 {},
 {1.1311795259440865e+18: {}},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {1.1311828604786852e+18: {}},
 {},
 {1.1311824664824914e+18: {}},
 {},
 {},
 {},
 {1.131184130937512e+18: {}},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {1.1311882878031421e+18: {}},
 {},
 {1.1314788227156337e+18: {}},
 {},
 {1.1311895348788183e+18: {}},
 {1.1311899270001254e+18: {}},
 {1.1311929144727757e+18: {}},
 {},
 {},
 {},
 {},
 {1.1312100893717094e+18: {}},
 {1.1312350274324972e+18: {}},
 {},
 {},
 {},
 {1.1311971641122202e+18: {}},
 {1.1312135897575178e+18: {}},
 {},
 {},
 {},
 {},
 {},
 {},
 {1.1312332540764979e+18: {}},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {

In [46]:
pd.Series(lst)

0        {1.1311759953215201e+18: {}}
1        {1.1311761586477179e+18: {}}
2                                  {}
3                                  {}
4                                  {}
                     ...             
15215    {1.1350974558534124e+18: {}}
15216                              {}
15217                              {}
15218                              {}
15219                              {}
Length: 15220, dtype: object