## Conversations (2)

In [16]:
import sqlite3
import pandas as pd
import sys 
from datetime import datetime

In [17]:
# Show full cell contents when displaying frames (no column-width truncation).
pd.set_option('display.max_colwidth', None)  

---

In [28]:
# Open the SQLite database that holds the TWEETS and USERS tables queried below.
conn_tweets = sqlite3.connect('tweets_airlines.db')

---

In [29]:
def calculate_runtime(start, stop):
    """
    Describe the time elapsed between two timestamps.
    :param start: moment the measured work began
    :param stop: moment the measured work ended
    :return: sentence reporting the elapsed hours, minutes and seconds
    """
    elapsed = stop - start

    # Only the day and second fields are used, so microseconds are ignored.
    whole_seconds = elapsed.days * 86400 + elapsed.seconds

    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    template = 'Runtime of creating Conversation Structure was: {} h, {} m {} s'
    return template.format(hours, minutes, seconds)

In [30]:
# Timestamp taken before the conversation-structure work; later passed to calculate_runtime.
start = datetime.now()

---

### Conversation Structure

In [31]:
# Sample of the columns needed to build the reply structure.
# NOTE(review): LIMIT is used without ORDER BY, so which 10000 rows come back
# depends on the database's scan order — confirm this matches the df_tweets sample.
query_conv = '''SELECT tweet_id, user_id, reply_to_tweet, created_at 
                FROM TWEETS
                LIMIT 10000
                '''
df_conversations = pd.read_sql(query_conv, conn_tweets)

In [32]:
# Sample of full tweet records; the computed level/id columns are mapped onto this frame later.
# NOTE(review): this frame holds at most 10000 rows because of the LIMIT below.
query_tweets = '''SELECT *
                  FROM TWEETS
                  LIMIT 10000
                  '''
df_tweets = pd.read_sql(query_tweets, conn_tweets)

In [33]:
# Sample of user records.
# NOTE(review): df_users is not read by any other cell visible in this notebook — confirm it is needed.
query_users = '''SELECT *
                 FROM USERS
                 LIMIT 10000'''

df_users = pd.read_sql(query_users, conn_tweets)

In [None]:
        df_conversations.at[index, 'conversation_level'] = df_conversations.iloc[lambda x: x.index == row['reply_to_tweet'], 2].values[0] + 1


---

#### Giving all non roots the correct level and conversation id

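Every tweet starts at conversation level 1; root tweets (tweets that are not a reply, `reply_to_tweet == 0`) are then set to level 0. For every non-root, the conversation level is the conversation level of the tweet it replies to, + 1. A reply whose parent is not in the data keeps level 1.
The conversation ID is the tweet_id of the root of the conversation.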

In [34]:
# Order tweets by created_at, then seed both bookkeeping columns:
# every tweet starts at level 1 and with its own tweet_id (as text) as conversation id.
# NOTE(review): assumes created_at sorts chronologically in its stored form — confirm.
df_conversations = (
    df_conversations
    .sort_values(by='created_at')
    .assign(
        conversation_level=1,
        conversation_id=lambda frame: frame['tweet_id'].astype(str),
    )
)

In [35]:
# Number of tweet rows loaded.
len(df_tweets)

10000

In [36]:
# A root is a tweet that replies to nothing, stored as reply_to_tweet == 0.
roots = df_conversations['reply_to_tweet'].eq(0)
df_conversations.loc[roots, 'conversation_level'] = 0

In [37]:
# Only replies need resolving; roots already carry level 0 and their own id.
replies = df_conversations[~roots]
total = replies.shape[0]  # loop-invariant, so computed once instead of once per row

# tweet_id -> index label of the first row carrying that id. This replaces a
# boolean scan of the whole tweet_id column per reply with a dictionary lookup,
# while still picking the first match as the original iloc[0] did.
label_of_tweet = {}
for label, tweet_id in zip(df_conversations.index, df_conversations['tweet_id']):
    label_of_tweet.setdefault(tweet_id, label)

count = 0

for index, row in replies.iterrows():

    count = count + 1

    parent_label = label_of_tweet.get(row['reply_to_tweet'])

    # Replies whose parent is not in the loaded data keep level 1 and their own id.
    if parent_label is not None:
        # Read the parent's current values from the frame (not from a snapshot) so
        # that levels and ids set earlier in this loop propagate down reply chains.
        row_id = df_conversations.at[parent_label, 'conversation_id']
        row_level = df_conversations.at[parent_label, 'conversation_level'] + 1
        df_conversations.at[index, 'conversation_level'] = row_level
        df_conversations.at[index, 'conversation_id'] = row_id

    progress = '{count} / {total}'.format(count=count, total=total)
    sys.stdout.write('\r' + progress)

3330 / 3330

In [38]:
# Row count after the level pass.
len(df_conversations)

10000

---

#### Defining all conversation ID's correctly for level 0s and 1s

For root tweets, the conversation ID is their tweet id with `.0` appended.

For first replies (level 1), a dictionary counts how many level-1 tweets appear under each conversation ID. Each level-1 tweet gets the suffix `.X` appended to its conversation_id, where X is the number of level-1 tweets under that conversation ID that have not yet received their final conversation_id. The branches dictionary is updated every time a level-1 conversation_id is assigned.

In [39]:
# conversation_id -> number of level-1 tweets currently filed under it.
level_one_rows = df_conversations[df_conversations['conversation_level'] == 1]
branches_dict = (
    level_one_rows
    .groupby('conversation_id')['conversation_level']
    .count()
    .to_dict()
)

In [40]:
def conv_id(conversation_id, level):
    
    if level == 0: 
        return conversation_id + '.0'

    else:
        if level == 1:
            try:
                level_ones = branches_dict.get(conversation_id)

                update_levels = level_ones - 1

                if update_levels == 0:
                    branches_dict.pop(conversation_id)

                else: 
                    branches_dict[conversation_id] = update_levels

                if level_ones > 1: 
                    return conversation_id + '.' + str(level_ones)

                else:
                    return conversation_id + '.1'
            except:
                pass
        else:
            return 'level > 1'

In [41]:
# conv_id mutates the branch counters, so it must run exactly once per row, in
# row order. An explicit loop guarantees that; DataFrame.apply in older pandas
# versions may call the function twice on the first row, and it also builds a
# Series per row.
df_conversations['conversation_id'] = [
    conv_id(current_id, level)
    for current_id, level in zip(
        df_conversations['conversation_id'],
        df_conversations['conversation_level'],
    )
]

---

#### Correcting conversation ID of tweets with level > 1 

For every tweet above level 1, the conversation ID is copied from the tweet it replies to. 

In [42]:
# Tweets deeper than level 1 still hold the 'level > 1' placeholder.
deep_replies = df_conversations[df_conversations['conversation_level'] > 1]
total = deep_replies.shape[0]  # levels do not change in this loop, so compute once

# tweet_id -> index label of the first row carrying that id (dictionary lookup
# instead of scanning the whole tweet_id column for every reply).
label_of_tweet = {}
for label, tweet_id in zip(df_conversations.index, df_conversations['tweet_id']):
    label_of_tweet.setdefault(tweet_id, label)

count = 0

for index, row in deep_replies.iterrows():

    count = count + 1

    parent_label = label_of_tweet.get(row['reply_to_tweet'])

    # "Parent found" is tested directly; the former `parent_tweets.size > 1`
    # counted cells rather than rows and only worked because the frame has
    # more than one column.
    if parent_label is not None:
        # Read the parent's current id so ids assigned earlier in this loop
        # propagate further down the chain.
        row_id = df_conversations.at[parent_label, 'conversation_id']
        df_conversations.at[index, 'conversation_id'] = row_id

    progress = '{count} / {total}'.format(count=count, total=total)
    sys.stdout.write('\r' + progress)

1122 / 1122

---

In [43]:
# tweet_id -> conversation_level lookup table.
dict_levels = dict(zip(df_conversations['tweet_id'], df_conversations['conversation_level']))

In [44]:
# tweet_id -> conversation_id lookup table.
dict_id = dict(zip(df_conversations['tweet_id'], df_conversations['conversation_id']))

In [45]:
# Attach the computed level to each full tweet record by tweet_id.
df_tweets['conversation_level'] = df_tweets['tweet_id'].map(dict_levels)

In [46]:
# Attach the computed conversation id to each full tweet record by tweet_id.
df_tweets['conversation_id'] = df_tweets['tweet_id'].map(dict_id)

In [None]:
# if_exists="replace" drops the existing TWEETS table before writing, so a
# truncated frame (e.g. one loaded with LIMIT) would silently discard every
# other stored row. Refuse to overwrite unless the row counts agree.
rows_in_table = conn_tweets.execute('SELECT COUNT(*) FROM TWEETS').fetchone()[0]
if len(df_tweets) != rows_in_table:
    raise ValueError(
        'df_tweets has {} rows but TWEETS holds {}; refusing to replace the table'
        .format(len(df_tweets), rows_in_table)
    )
df_tweets.to_sql('TWEETS', conn_tweets, if_exists="replace", index=False)

In [217]:
# Release the database connection; later cells reconnect before querying again.
conn_tweets.close()

---

In [148]:
# Timestamp taken after the work; paired with `start` in calculate_runtime.
stop = datetime.now()

In [149]:
# NOTE(review): the saved output below does not match the format string in
# calculate_runtime as defined in this notebook — it looks stale; re-run to refresh.
calculate_runtime(start, stop)

'Runtime of creating Conversation Structure for  was: 0 h, 2 m 16 s'

---

In [3]:
# Reconnect for the full-table run (the connection above was closed).
conn_tweets = sqlite3.connect('tweets_airlines.db')

---

### Conversation Structure

In [3]:
# Full-table load of the columns needed to build the reply structure (no LIMIT here).
query_conv = '''SELECT tweet_id, reply_to_tweet, created_at 
                FROM TWEETS
                '''
df_conversations = pd.read_sql(query_conv, conn_tweets)

In [4]:
# Full tweet records; the computed level/id columns are mapped onto this frame later.
query_tweets = '''SELECT *
                  FROM TWEETS
                  '''
df_tweets = pd.read_sql(query_tweets, conn_tweets)

In [5]:
# Full user table.
# NOTE(review): df_users is not read by any other cell visible in this notebook — confirm it is needed.
query_users = '''SELECT *
                 FROM USERS
                 '''
df_users = pd.read_sql(query_users, conn_tweets)

---

#### Giving all non roots the correct level and conversation id

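Every tweet starts at conversation level 1; root tweets (tweets that are not a reply, `reply_to_tweet == 0`) are then set to level 0. For every non-root, the conversation level is the conversation level of the tweet it replies to, + 1. A reply whose parent is not found keeps level 1.
The conversation ID is the tweet_id of the root of the conversation.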

---

In [6]:
# Order tweets by created_at, then seed both bookkeeping columns:
# every tweet starts at level 1 and with its own tweet_id (as text) as conversation id.
# NOTE(review): assumes created_at sorts chronologically in its stored form — confirm.
df_conversations = (
    df_conversations
    .sort_values(by='created_at')
    .assign(
        conversation_level=1,
        conversation_id=lambda frame: frame['tweet_id'].astype(str),
    )
)

In [7]:
# A root is a tweet that replies to nothing, stored as reply_to_tweet == 0.
roots = df_conversations['reply_to_tweet'].eq(0)
df_conversations.loc[roots, 'conversation_level'] = 0

In [8]:
# Index by tweet_id so later cells can address rows by tweet id; tweet_id stops being a column here.
df_conversations = df_conversations.set_index('tweet_id')

In [9]:
# Recompute the root mask on the tweet_id-indexed frame so it aligns with the new index,
# and re-apply level 0 to the roots.
roots = df_conversations['reply_to_tweet'].eq(0)
df_conversations.loc[roots, 'conversation_level'] = 0

---

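Splitting the dataframe into three chunks to reduce the runtime. Note that each chunk is processed on its own, so a reply is only linked to its parent when both fall in the same chunk.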

In [10]:
# Positional slices always partition the frame into three non-overlapping
# chunks that together cover every row. The former head/tail arithmetic with a
# hard-coded tail(2094135) only did so for one exact row count; for any other
# size the chunks overlapped or dropped rows. This also avoids the intermediate
# 4,000,000-row copy (df_conversations_2_1).
df_conversations_1 = df_conversations.iloc[:2000000].copy()
df_conversations_2 = df_conversations.iloc[2000000:4000000].copy()
df_conversations_3 = df_conversations.iloc[4000000:].copy()

In [11]:
total = df_conversations_1.shape[0]
level_col = df_conversations_1.columns.get_loc('conversation_level')
id_col = df_conversations_1.columns.get_loc('conversation_id')

# tweet_id (the index) -> position of the first row with that id. One dictionary
# lookup per row replaces a boolean mask over the whole chunk per row.
first_pos = {}
for pos, tweet_id in enumerate(df_conversations_1.index):
    first_pos.setdefault(tweet_id, pos)

count = 0

for index, row in df_conversations_1.iterrows():

    count = count + 1

    parent_pos = first_pos.get(row['reply_to_tweet'])

    # Roots and replies whose parent is outside this chunk are left unchanged.
    # This replaces the bare `except: pass`, which also hid real errors and
    # swallowed KeyboardInterrupt.
    if parent_pos is not None:
        # Read the parent's current values so results set earlier in this loop
        # propagate down reply chains.
        parent_level = df_conversations_1.iat[parent_pos, level_col]
        parent_id = df_conversations_1.iat[parent_pos, id_col]
        df_conversations_1.at[index, 'conversation_level'] = parent_level + 1
        df_conversations_1.at[index, 'conversation_id'] = parent_id

    progress = '{count} / {total}'.format(count=count, total=total)
    sys.stdout.write('\r' + progress)

2000000 / 2000000

In [12]:
total = df_conversations_2.shape[0]
level_col = df_conversations_2.columns.get_loc('conversation_level')
id_col = df_conversations_2.columns.get_loc('conversation_id')

# tweet_id (the index) -> position of the first row with that id. One dictionary
# lookup per row replaces a boolean mask over the whole chunk per row.
first_pos = {}
for pos, tweet_id in enumerate(df_conversations_2.index):
    first_pos.setdefault(tweet_id, pos)

count = 0

for index, row in df_conversations_2.iterrows():

    count = count + 1

    parent_pos = first_pos.get(row['reply_to_tweet'])

    # Roots and replies whose parent is outside this chunk are left unchanged.
    # This replaces the bare `except: pass`, which also hid real errors and
    # swallowed KeyboardInterrupt.
    if parent_pos is not None:
        # Read the parent's current values so results set earlier in this loop
        # propagate down reply chains.
        parent_level = df_conversations_2.iat[parent_pos, level_col]
        parent_id = df_conversations_2.iat[parent_pos, id_col]
        df_conversations_2.at[index, 'conversation_level'] = parent_level + 1
        df_conversations_2.at[index, 'conversation_id'] = parent_id

    progress = '{count} / {total}'.format(count=count, total=total)
    sys.stdout.write('\r' + progress)

2000000 / 2000000

In [13]:
total = df_conversations_3.shape[0]
level_col = df_conversations_3.columns.get_loc('conversation_level')
id_col = df_conversations_3.columns.get_loc('conversation_id')

# tweet_id (the index) -> position of the first row with that id. One dictionary
# lookup per row replaces a boolean mask over the whole chunk per row.
first_pos = {}
for pos, tweet_id in enumerate(df_conversations_3.index):
    first_pos.setdefault(tweet_id, pos)

count = 0

for index, row in df_conversations_3.iterrows():

    count = count + 1

    parent_pos = first_pos.get(row['reply_to_tweet'])

    # Roots and replies whose parent is outside this chunk are left unchanged.
    # This replaces the bare `except: pass`, which also hid real errors and
    # swallowed KeyboardInterrupt.
    if parent_pos is not None:
        parent_level = df_conversations_3.iat[parent_pos, level_col]
        parent_id = df_conversations_3.iat[parent_pos, id_col]
        df_conversations_3.at[index, 'conversation_level'] = parent_level + 1
        # Fix: the conversation id is now read from and written to chunk 3. The
        # previous code used df_conversations_2 here, so the lookup failed
        # silently and chunk-3 replies never received their parent's id.
        df_conversations_3.at[index, 'conversation_id'] = parent_id

    progress = '{count} / {total}'.format(count=count, total=total)
    sys.stdout.write('\r' + progress)

2094135 / 2094135

In [14]:
# Stitch the three processed chunks back into one frame, in chunk order.
df_conversations = pd.concat([df_conversations_1, df_conversations_2, df_conversations_3])

---

#### Defining all conversation ID's correctly for level 0s and 1s

For root tweets, the conversation ID is their tweet id with `.0` appended.

For first replies (level 1), a dictionary counts how many level-1 tweets appear under each conversation ID. Each level-1 tweet gets the suffix `.X` appended to its conversation_id, where X is the number of level-1 tweets under that conversation ID that have not yet received their final conversation_id. The branches dictionary is updated every time a level-1 conversation_id is assigned.

In [15]:
# conversation_id -> number of level-1 tweets currently filed under it.
level_one_rows = df_conversations[df_conversations['conversation_level'] == 1]
branches_dict = (
    level_one_rows
    .groupby('conversation_id')['conversation_level']
    .count()
    .to_dict()
)

In [16]:
def conv_id(conversation_id, level):
    """
    Updates conversation_id for every record. Takes the amount of level 1's per conversation_id and creates a unique code for every of them
    :param conversation_id: current conversation_id
    :param level: conversation_level 
    :return: updated conversation_id 
    """
    
    # All level 0 tweets' conversation_id get the addition .0
    if level == 0: 
        return conversation_id + '.0'

    else:
        # All level 1 tweets' conversation_id get a unique addition
        if level == 1:
            try:
                level_ones = branches_dict.get(conversation_id)

                update_levels = level_ones - 1

                if update_levels == 0:
                    branches_dict.pop(conversation_id)

                else: 
                    branches_dict[conversation_id] = update_levels

                if level_ones > 1: 
                    return conversation_id + '.' + str(level_ones)

                else:
                    return conversation_id + '.1'
            except:
                pass
        # All tweets above level 1 get no conversation_id, this is corrected later 
        else:
            return 'level > 1'

In [17]:
# conv_id mutates the branch counters, so it must run exactly once per row, in
# row order. An explicit loop guarantees that; DataFrame.apply in older pandas
# versions may call the function twice on the first row, and it also builds a
# Series per row.
df_conversations['conversation_id'] = [
    conv_id(current_id, level)
    for current_id, level in zip(
        df_conversations['conversation_id'],
        df_conversations['conversation_level'],
    )
]

---

#### Correcting conversation ID of tweets with level > 1 

For every tweet above level 1, the conversation ID is copied from the tweet it replies to. 

In [21]:
# Positional slices always partition the frame into three non-overlapping
# chunks that together cover every row. The former head/tail arithmetic with a
# hard-coded tail(2094135) only did so for one exact row count; for any other
# size the chunks overlapped or dropped rows. This also avoids the intermediate
# 4,000,000-row copy (df_conversations_2_1).
df_conversations_1 = df_conversations.iloc[:2000000].copy()
df_conversations_2 = df_conversations.iloc[2000000:4000000].copy()
df_conversations_3 = df_conversations.iloc[4000000:].copy()

In [23]:
# Tweets deeper than level 1 still hold the 'level > 1' placeholder.
deep_replies = df_conversations_1[df_conversations_1['conversation_level'] > 1]
total = deep_replies.shape[0]  # levels do not change in this loop, so compute once
id_col = df_conversations_1.columns.get_loc('conversation_id')

# tweet_id (the index) -> position of the first row with that id. One dictionary
# lookup per row replaces a boolean mask over the whole chunk per row.
first_pos = {}
for pos, tweet_id in enumerate(df_conversations_1.index):
    first_pos.setdefault(tweet_id, pos)

count = 0

for index, row in deep_replies.iterrows():

    count = count + 1

    parent_pos = first_pos.get(row['reply_to_tweet'])

    # A parent outside this chunk leaves the row unchanged. This replaces the
    # bare `except: pass`, which also hid real errors and swallowed KeyboardInterrupt.
    if parent_pos is not None:
        # Read the parent's current id so ids assigned earlier in this loop propagate.
        df_conversations_1.at[index, 'conversation_id'] = df_conversations_1.iat[parent_pos, id_col]

    progress = '{count} / {total}'.format(count=count, total=total)
    sys.stdout.write('\r' + progress)

253843 / 253843

In [24]:
# Tweets deeper than level 1 still hold the 'level > 1' placeholder.
deep_replies = df_conversations_2[df_conversations_2['conversation_level'] > 1]
total = deep_replies.shape[0]  # levels do not change in this loop, so compute once
id_col = df_conversations_2.columns.get_loc('conversation_id')

# tweet_id (the index) -> position of the first row with that id. One dictionary
# lookup per row replaces a boolean mask over the whole chunk per row.
first_pos = {}
for pos, tweet_id in enumerate(df_conversations_2.index):
    first_pos.setdefault(tweet_id, pos)

count = 0

for index, row in deep_replies.iterrows():

    count = count + 1

    parent_pos = first_pos.get(row['reply_to_tweet'])

    # A parent outside this chunk leaves the row unchanged. This replaces the
    # bare `except: pass`, which also hid real errors and swallowed KeyboardInterrupt.
    if parent_pos is not None:
        # Read the parent's current id so ids assigned earlier in this loop propagate.
        df_conversations_2.at[index, 'conversation_id'] = df_conversations_2.iat[parent_pos, id_col]

    progress = '{count} / {total}'.format(count=count, total=total)
    sys.stdout.write('\r' + progress)

237324 / 237324

In [25]:
# Tweets deeper than level 1 still hold the 'level > 1' placeholder.
deep_replies = df_conversations_3[df_conversations_3['conversation_level'] > 1]
total = deep_replies.shape[0]  # levels do not change in this loop, so compute once
id_col = df_conversations_3.columns.get_loc('conversation_id')

# tweet_id (the index) -> position of the first row with that id. One dictionary
# lookup per row replaces a boolean mask over the whole chunk per row.
first_pos = {}
for pos, tweet_id in enumerate(df_conversations_3.index):
    first_pos.setdefault(tweet_id, pos)

count = 0

for index, row in deep_replies.iterrows():

    count = count + 1

    parent_pos = first_pos.get(row['reply_to_tweet'])

    # A parent outside this chunk leaves the row unchanged. This replaces the
    # bare `except: pass`, which also hid real errors and swallowed KeyboardInterrupt.
    if parent_pos is not None:
        # Read the parent's current id so ids assigned earlier in this loop propagate.
        df_conversations_3.at[index, 'conversation_id'] = df_conversations_3.iat[parent_pos, id_col]

    progress = '{count} / {total}'.format(count=count, total=total)
    sys.stdout.write('\r' + progress)

252094 / 252094

In [26]:
# Stitch the three processed chunks back into one frame, in chunk order.
df_conversations = pd.concat([df_conversations_1, df_conversations_2, df_conversations_3])

---

In [27]:
# tweet_id (the frame's index) -> conversation_level lookup table.
dict_levels = dict(df_conversations['conversation_level'].items())

In [28]:
# tweet_id (the frame's index) -> conversation_id lookup table.
dict_id = dict(df_conversations['conversation_id'].items())

In [29]:
# Attach the computed level to each full tweet record by tweet_id.
df_tweets['conversation_level'] = df_tweets['tweet_id'].map(dict_levels)

In [30]:
# Attach the computed conversation id to each full tweet record by tweet_id.
df_tweets['conversation_id'] = df_tweets['tweet_id'].map(dict_id)

In [34]:
# if_exists="replace" drops the existing TWEETS table before writing. Guard
# against overwriting it with a stale or truncated df_tweets (for example one
# left in the kernel from a LIMIT-ed query): the row counts must agree.
rows_in_table = conn_tweets.execute('SELECT COUNT(*) FROM TWEETS').fetchone()[0]
if len(df_tweets) != rows_in_table:
    raise ValueError(
        'df_tweets has {} rows but TWEETS holds {}; refusing to replace the table'
        .format(len(df_tweets), rows_in_table)
    )
df_tweets.to_sql('TWEETS', conn_tweets, if_exists="replace", index=False)

In [31]:
# Release the database connection.
# NOTE(review): the execution counts show this cell was run before the to_sql cell above — confirm the order on a fresh run.
conn_tweets.close()

---

In [15]:
# Timestamp taken after the work; paired with `start` in calculate_runtime.
# NOTE(review): `start` is not reassigned anywhere in this full-run section, so the
# measured interval begins at the earlier `start = datetime.now()` cell.
stop = datetime.now()

In [17]:
# NOTE(review): the saved output below does not match the format string in
# calculate_runtime as defined in this notebook — it looks stale; re-run to refresh.
calculate_runtime(start, stop)

'Runtime of Cleaning process for 753283 records was: 0 h, 1 m 4 s'