## Preprocessing (1)

In [1]:
import pandas as pd
import sqlite3
import re
from datetime import datetime

# Do not truncate wide column values when frames are displayed (None = no width limit).
pd.set_option('display.max_colwidth', None)

---

In [6]:
def calculate_runtime(start, stop, n_records=None):
    """
    Build the summary line reporting how long preprocessing took.

    This cell is repeated in every notebook.

    :param start: datetime taken when processing began
    :param stop: datetime taken when processing finished
    :param n_records: number of records that were processed; when omitted,
        the notebook-level variable ``records`` is used (original behaviour)
    :return: string of the form
        'Runtime preprocessing <n> records was: <h> h, <m> m <s> s'
    :raises NameError: if ``n_records`` is omitted and ``records`` has not
        been defined in the notebook yet
    """
    if n_records is None:
        # Backward-compatible fallback: the count used to be read implicitly
        # from the notebook namespace, which only works once that cell has run.
        n_records = records

    elapsed = stop - start

    # A timedelta keeps whole days apart from the within-day seconds, so the
    # days are folded into the hour count; microseconds are ignored.
    hours, remainder = divmod(elapsed.seconds, 3600)
    hours += elapsed.days * 24
    minutes, seconds = divmod(remainder, 60)

    return 'Runtime preprocessing {} records was: {} h, {} m {} s'.format(n_records, hours, minutes, seconds)

---

In [3]:
# Wall-clock timestamp for the start of the run; paired with `stop` at the end
# of the notebook and passed to calculate_runtime.
start = datetime.now()

---

In [2]:
# Open the SQLite database that the TWEETS and USERS tables are read from and
# written back to. The path is relative to the working directory.
# NOTE: sqlite3.connect creates an empty database file if none exists, so a
# wrong working directory only shows up later as a "no such table" error.
conn_tweets = sqlite3.connect('tweets_airlines.db')

---

### Cleaning tables: loading and sorting on the creation-time columns (`created_at`, `tweet_created_at`)

In [4]:
# Load the complete TWEETS table into a DataFrame (all rows, all columns).
query_tweets = 'SELECT * FROM TWEETS'
df_tweets = pd.read_sql_query(query_tweets, conn_tweets)

In [5]:
# Order tweets by their created_at value; this order decides which row the
# later drop_duplicates step keeps for a repeated tweet_id.
# NOTE(review): if created_at is stored as text, this sort is lexicographic
# rather than chronological -- confirm the column format.
df_tweets = df_tweets.sort_values(by='created_at')

In [6]:
# Load the complete USERS table into a DataFrame, reusing the same connection.
query_users = 'SELECT * FROM USERS'
df_users = pd.read_sql_query(query_users, conn_tweets)

In [7]:
# Order user rows by tweet_created_at, mirroring the ordering applied to the
# tweets frame (same caveat: a text column sorts lexicographically).
df_users = df_users.sort_values(by='tweet_created_at')

---

In [8]:
# Number of tweet rows loaded, counted before duplicates are removed.
# calculate_runtime reads this notebook-level name for its summary string.
records = df_tweets.shape[0]

---

### Removing duplicates

In [9]:
# Keep a single row per tweet_id in each table. drop_duplicates retains the
# first occurrence, i.e. the first row in the sort order established earlier.
df_tweets = df_tweets.drop_duplicates(subset=['tweet_id'])
df_users = df_users.drop_duplicates(subset=['tweet_id'])

---

### Cleaning text

In [10]:
def clean_text(text, mentioned_users):
    """
    Strip leading @-mentions and links from a tweet, token by token.

    The text is split on whitespace and each token is cleaned with regex
    substitutions. Tokens that become empty are kept, so the joined result
    can contain leading, trailing or doubled spaces where something was removed.

    :param text: string with text of the tweet
    :param mentioned_users: string describing the users mentioned in the
        tweet; its comma count is used to infer how many leading tokens may
        be mentions (presumably a comma-separated list). A non-string value
        such as None is treated like an empty string.
    :return: string of cleaned tokens joined by single spaces; '' when the
        text is empty or whitespace-only
    """
    tokens = text.split()

    # Empty or whitespace-only text has no first token to inspect.
    if not tokens:
        return ''

    # Number of leading tokens that are candidates for mention removal:
    # one more than the number of commas in mentioned_users.
    mentions = mentioned_users if isinstance(mentioned_users, str) else ''
    n_leading = mentions.count(',') + 1

    if tokens[0] == 'RT':
        # Retweet: drop the "@user:" handle wherever it occurs, then treat the
        # leading tokens ('RT' included in the count) like a reply prefix.
        tokens = [re.sub(r'@\w+:', '', token) for token in tokens]
        strip_leading_mentions = True
    else:
        # Mentions are left alone unless the tweet starts with one.
        strip_leading_mentions = tokens[0].startswith('@')

    if strip_leading_mentions:
        tokens[:n_leading] = [re.sub(r'@\w+', '', token) for token in tokens[:n_leading]]

    # Remove all links, both http and https.
    return ' '.join(re.sub(r'https?:\S+', '', token) for token in tokens)

In [11]:
# Clean every tweet. Walking the two columns in parallel avoids the per-row
# Series that DataFrame.apply(axis=1) constructs, which is slow on large tables.
df_tweets['tokenized_text'] = [
    clean_text(full_text, mentioned_users)
    for full_text, mentioned_users in zip(df_tweets['full_text'], df_tweets['mentioned_users'])
]

---

In [12]:
# Write the sorted, de-duplicated users back to the same database.
# if_exists="replace" drops the existing USERS table and recreates it from the
# frame; index=False keeps the DataFrame index out of the table.
df_users.to_sql('USERS', conn_tweets, if_exists="replace", index=False)

In [13]:
# Write the cleaned tweets (now including the tokenized_text column) over the
# TWEETS table they were loaded from.
# NOTE: this replaces the source table in place, so re-running the notebook
# reads the already-cleaned data rather than the original rows.
df_tweets.to_sql('TWEETS', conn_tweets, if_exists="replace", index=False)

---

In [14]:
# Both tables have been written back; close the database connection.
conn_tweets.close()

---

In [15]:
# Wall-clock timestamp for the end of the run (counterpart of `start`).
stop = datetime.now()

In [17]:
# Last expression of the cell, so the returned summary string (record count
# plus elapsed hours, minutes and seconds) is shown as the cell output.
calculate_runtime(start, stop)

'Runtime of Cleaning process for 753283 records was: 0 h, 1 m 4 s'