#### 1. Import library

In [2]:
import os
import pandas as pd 
import numpy as np 
from typing import Dict, List, Tuple 
import sqlite3
import json
from tools.cleaning import remove_duplicated_text, drop_invalid_reply
import pickle
import time 

#### 2. Load the data onto DataFrame

In [4]:
def get_lst_json(path) -> List[str]:
    """
    get list of path to each json file in the folder
    
    :parameter path: path the the folder 
    """
    import os
    file_list = [f'{path}/' + i for i in os.listdir(f'{path}/')]
    return file_list

lst = get_lst_json('../data')

In [6]:
# load the json file of one month onto dataframe 
start = time.time()
def load_json(lst_json: List[str]) -> pd.DataFrame:
    """load and concat all dataframe from json files
    """
    dfs = [] # an empty list to store the data frames
    for file in lst_json:
        try:
            data = pd.read_json(file, lines=True) # read data frame from json file 
            print(file) 
            dfs.append(data) # append the data frame to the list
        except:
            print('error')
            pass
    df_data = pd.concat(dfs, ignore_index=True) # concatenate all the data frames in the list.
    return df_data

df_data = load_json(lst)
end = time.time()
print(end - start)

../data/airlines-1558527599826.json
../data/airlines-1558546003827.json
error
error
../data/airlines-1558678330070.json
../data/airlines-1558697205154.json
../data/airlines-1558744391657.json
../data/airlines-1558779687636.json
../data/airlines-1558863520888.json
../data/airlines-1558888297881.json
../data/airlines-1558974571041.json
../data/airlines-1558998029487.json
../data/airlines-1559069822287.json
../data/airlines-1559097885748.json
../data/airlines-1559142170524.json
../data/airlines-1559156713030.json
../data/airlines-1559231904332.json
../data/airlines-1559256655329.json
../data/airlines-1559323725358.json
../data/airlines-1559352498975.json


TypeError: unsupported operand type(s) for -: 'builtin_function_or_method' and 'builtin_function_or_method'

In [30]:
#drop invalid datapoints and reset index
df_data = df_data.dropna(subset = ["user"]).reset_index(drop=True)

In [31]:
#drop duplicated tweets
df_data = df_data.drop_duplicates(subset='id') #subset for duplication check is tweet id

In [32]:
#drop invalid tweets
df_data = drop_invalid_reply(df_data)
df_data.shape

(81614, 38)

In [22]:
#Add the user_id attribute

def build_attribute(df : pd.DataFrame, attr=['user_id']) -> pd.DataFrame:
    """
    build attribute for dataframe. build user_id column
    
    :parameter
    """
    if 'user_id' in attr:
        df_user = [df.user.iloc[i].get('id') for i in range(len(df))]
        df['user_id'] = pd.Series(df_user)
    
    return df

df_data = build_attribute(df_data)

In [51]:
#create pickle files
#@Tian please make use of pickle files so next time u don't have to load everything again - @Phat
df_data.to_pickle('../pickle_files/everything_tweets.p')

In [24]:
#load the pickle files back 
#@Tian don't run this one when u load data for the first time
start = time.time()
df_data = pd.read_pickle('../pickle_files/everything_tweets.p')
end = time.time()
print ("Time elapsed:", end - start)

Time elapsed: 8.11237120628357


In [33]:
#drop duplicated tweets with identical text from same users to the same original tweet
df_data = remove_duplicated_text(df_data)
df_data.shape

(81614, 38)

In [34]:
#drop all the tweets that are retweet
df_data = df_data[df_data.retweeted_status.isna()]
df_data.shape

(81614, 38)

#### 3. Store the data on .db file 

In [7]:
# create connection and cursor 
con = sqlite3.connect('../database/sqlite-tools-win32-x86-3380300/database.db') #path to the database.db
cur = con.cursor()

#### 4. KLM table 

In [35]:
#Extract only tweets made by KLM (including posting +replying to other):
def extract_tweet_by_airlines(id : int, df_data : pd.DataFrame) -> pd.DataFrame:
    """
    extract tweets that made by an airline
    :parameter id: the user_id of the airline
    :return DataFrame with one tweets made by that airline
    """
    
    ind = []
    for index, row in df_data.iterrows():
        if row['user']['id'] == id:
            ind.append(index)
    airline_tweet = df_data.loc[ind]
    return airline_tweet

klm_tweet = extract_tweet_by_airlines(56377143, df_data)

In [36]:
#Extract only tweets mention KLM (@KLM etc...):
def extract_tweet_mention_airlines(airline_name : str, df_data : pd.DataFrame) -> pd.DataFrame:
    """
    extract tweets that made by an airline
    :parameter airline_name: the string name of the airline
    :return DataFrame with one tweets mention that airline
    """

    ind2 = []
    for index, row in df_data.iterrows():
        if airline_name in row['text']:
            ind2.append(index)
    airline_tweet_men = df_data.loc[ind2]
    return airline_tweet_men


klm_tweet_men = extract_tweet_mention_airlines("@KLM", df_data)

In [37]:
#Concat 2 tables:
klm_table = pd.concat([klm_tweet, klm_tweet_men])
klm_table.drop_duplicates(subset=['id'], inplace=True)
klm_table.reset_index(inplace=True, drop=True)

In [38]:
#Add the user_id attribute
klm_table = build_attribute(klm_table, attr=['user_id'])

In [39]:
klm_table.shape

(3274, 38)

#### 5. Bristish Airways table

In [42]:
#Extract only tweets made by BA (including posting +replying to other):
british_tweet = extract_tweet_by_airlines(18332190, df_data)

In [43]:
#Extract only tweets mention BA (@British_Airways etc...):
british_tweet_men = extract_tweet_mention_airlines("@British_Airways", df_data)

In [44]:
#Concat 2 tables:
british_table = pd.concat([british_tweet, british_tweet_men])
british_table.drop_duplicates(subset=['id'], inplace=True)
british_table.reset_index(drop=True, inplace=True)

In [45]:
#Add the user_id attribute
british_table['user_id'] = pd.Series(british_user)
british_table.shape

NameError: name 'british_user' is not defined

#### 6. Users table 

In [None]:
#Get a list of dictionary of user information and turn them to dataframe
user_table = [df_data.user.iloc[i] for i in range(len(df_data))]

In [None]:
#Turn them to DataFrame and drop the duplicate
user_table = pd.DataFrame(user_table).drop_duplicates(subset='id')

In [None]:
#create pickle files
#@Tian please make use of pickle files so next time u don't have to load everything again - @Phat
user_table.to_pickle('../pickle_files/users.p')

#### 7. Drop attributes 

In [46]:
attr_to_dropped = ['user']
df_data.drop(columns=attr_to_dropped, inplace=True)

#### 8. Export to csv files

In [None]:
user_table.to_csv('database/CSV files/user_table.csv')

In [6]:
df_data.to_csv('database/CSV files/new_general_tweets.csv')

In [None]:
#Export the table as CSV file to import to the database
klm_table.to_csv('database/CSV files/klm_table_diff.csv')

In [None]:
#Export the table as CSV file to import to the database
british_table.to_csv('database/CSV files/british_table.csv')

#### 9 Export to pickle files 

In [None]:
#create pickle files
#@Tian please make use of pickle files so next time u don't have to load everything again - @Phat
user_table.to_pickle('../pickle_files/users.p')
df_data.to_pickle('../pickle_files/cleaned_everything_tweets.p')