# Exporting dataframe for R

Part of the analytical process (the generation of the CTM and STM models) has to be done in R. To keep the analysis consistent, the processing of the tweets needs to remain unchanged, so the processed tweets and the relevant metadata are collected into a dataframe that can easily be read by R. 

Here, the processed tokens of each tweet are also joined into a single string, with spaces separating the tokens. This is because strings are easier to read into an R environment than a large number of lists of strings. Each string is then split into tokens again in the R environment. 

In [1]:
#LOADING INDIVIDUAL TWEETS
from os import listdir
import json
import logging
import pandas as pd
from nltk.corpus import stopwords

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Directory that holds the JSON files (placeholder value).
path = 'PATH'
# One row per tweet:
# [identifier, created_at, text, followers_count, monthly_tweets, monthly_relevant_tweets]
tweets = []
stops = set(stopwords.words('dutch'))

for month in ['08']:  # controls for month
    for file in listdir(path):
        # Only files whose name starts with "<month>_" (or equals the month) are read.
        if file.split('_')[0] != month:
            continue

        # JSON text is UTF-8 by specification; stating the encoding keeps the
        # result independent of the platform's default encoding.
        with open(path + '/' + file, 'r', encoding='utf-8') as infile:
            data = json.load(infile)

        for identifier, record in data.items():
            # Keep the lowercased lemma of every token that is neither
            # punctuation nor a stop word.
            # NOTE(review): the stop-word test is applied to the lemma as stored,
            # i.e. before lowercasing, so a lemma containing capitals is kept even
            # when its lowercase form is a stop word. This is left unchanged on
            # purpose so the tokens stay identical to the rest of the analysis;
            # confirm whether that is intended.
            lemmas = [
                token['lemma'].lower()
                for token in record['full_frog']
                if token['dep'] != 'punct' and token['lemma'] not in stops
            ]
            # Tokens are joined into one space-separated string for the export.
            tweet = ' '.join(lemmas)

            user = record['user']
            tweets.append([
                identifier,
                record['created_at'],
                tweet,
                user['followers_count'],
                user['monthly_tweets'],
                user['monthly_relevant_tweets'],
            ])
                


In [2]:
from datetime import datetime
import feather

# Creating dataframe that can be easily read in R
df = pd.DataFrame(tweets, columns=["Identifier", "Timestamp", "Text", "Followers", "Frequency", "Relevant Frequency"])

# Layout of the timestamp strings, e.g. 'Wed Aug 1 00:00:00 +0000 2018'.
# The time of day is spelled out as %H:%M:%S instead of %X, because %X is the
# locale's time representation and therefore depends on the active locale.
# %a and %b still expect English day and month abbreviations.
timestamp_format = '%a %b %d %H:%M:%S %z %Y'

# Reference point for the day count: midnight UTC on 1 August 2018.
start_time = datetime.strptime('Wed Aug 1 00:00:00 +0000 2018', timestamp_format)

# Day number per tweet, counted in whole days since start_time and shifted by
# one so that a tweet within the first 24 hours gets day 1.
indays = [
    (datetime.strptime(timestamp, timestamp_format) - start_time).days + 1
    for timestamp in df['Timestamp']
]

df['Days'] = indays

# Saving the dataframe as a feather file that can be read by R
feather.write_dataframe(df, 'C:/dataframe.feather')