# Data Exporter

### 1. Let's first load all the data

In [1]:
import json
import datetime
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Stub ``__iter__`` that is bound onto the downloaded object body.

    It exists only so that ``hasattr(body, "__iter__")`` becomes true for
    the object it is attached to; the stub itself always returns 0.
    """
    return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
# Builds the S3-style client that the cells below use to download the CSV objects.
# It is configured for OAuth signing against the IAM token endpoint.
# NOTE(review): 'XXX' looks like a redacted placeholder -- a real API key is needed to run this.
client_2bfc56ae673c4e3aad4c3da569258436 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='XXX',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

def fetcher(dfs, date):
    """Download one ``data_<date>.csv`` object and append it to ``dfs``.

    Args:
        dfs: List that receives the loaded DataFrame (mutated in place).
        date: Date string inserted into the object key ``data_<date>.csv``.

    Returns:
        None. The result is delivered by appending to ``dfs``.
    """
    response = client_2bfc56ae673c4e3aad4c3da569258436.get_object(
        Bucket='sentimentanalysisproject-donotdelete-pr-pkfzekostvak36',
        Key='data_' + date + '.csv',
    )
    body = response['Body']

    # pandas needs a file-like object; attach the stub __iter__ when the
    # body does not already provide one.
    if not hasattr(body, "__iter__"):
        body.__iter__ = types.MethodType(__iter__, body)

    wanted_columns = ['Tweet Posted Time (UTC)', 'Tweet Content', 'Tweet Location']
    frame = pd.read_csv(body, usecols=wanted_columns)
    dfs.append(frame)
    
# Load 101 consecutive daily files, starting with 2020-03-01, and stack them
# into a single DataFrame.
date = datetime.datetime(2020, 3, 1)
dfs = []
for _ in range(101):
    fetcher(dfs, date.strftime('%Y-%m-%d'))
    date += datetime.timedelta(days=1)

# Rename by source column name instead of assigning a positional list:
# read_csv(usecols=...) keeps the column order of the file, not the order of
# the usecols list, so a positional rename could silently mislabel columns.
df = pd.concat(dfs, ignore_index=True).rename(columns={
    'Tweet Posted Time (UTC)': 'Date',
    'Tweet Content': 'Text',
    'Tweet Location': 'Location',
})

ModuleNotFoundError: No module named 'botocore'

In [None]:
# Preview the first rows of the combined DataFrame.
df.head()

In [None]:
# Show the (rows, columns) dimensions of the combined DataFrame.
df.shape

### Format the data and make it presentable

In [None]:
def dateFormater(date):
    """Convert a ``YYYY-MM-DD`` date string into ``DD/MM/YYYY``.

    Args:
        date: Date string in ``%Y-%m-%d`` format.

    Returns:
        str: The same calendar date formatted as ``%d/%m/%Y``.

    Raises:
        ValueError: If ``date`` does not match ``%Y-%m-%d``.
    """
    parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
    day_first = parsed.strftime('%d/%m/%Y')
    return day_first

# Unix epoch as a naive datetime. Written as a literal because
# datetime.utcfromtimestamp() is deprecated since Python 3.12; the value is
# identical to utcfromtimestamp(0).
epoch = datetime.datetime(1970, 1, 1)

def timeStamp(date):
    """Return the milliseconds between the Unix epoch and midnight of ``date``.

    Args:
        date: Date string in ``%d/%m/%Y`` format.

    Returns:
        float: Milliseconds elapsed since 1970-01-01 00:00:00 (naive).

    Raises:
        ValueError: If ``date`` does not match ``%d/%m/%Y``.
    """
    date = datetime.datetime.strptime(date, '%d/%m/%Y')
    return (date - epoch).total_seconds() * 1000.0

In [None]:
# Rewrite every 'Date' value through dateFormater (element-wise).
df['Date'] = df['Date'].apply(dateFormater)

In [None]:
# Preview the first rows after re-formatting the 'Date' column.
df.head()

In [None]:
# Derive the 'TimeStamp' column from 'Date' alone. Applying timeStamp to the
# single column avoids building a Series per row (df.apply with axis=1) and
# matches how the other single-column transforms in this notebook are written.
df['TimeStamp'] = df['Date'].apply(timeStamp)

In [None]:
# Preview the first rows with the new 'TimeStamp' column.
df.head()

### Extracting some more data

In [None]:
import re
def hash_extractor(text):
    """Return every hashtag found in ``text``.

    A hashtag is a ``#`` followed by one or more non-whitespace characters,
    so a lone ``#`` is not reported. Matching is whitespace-delimited, which
    means trailing punctuation stays part of the tag.

    Args:
        text: Tweet text to scan.

    Returns:
        list[str]: Hashtags (including the leading ``#``) in order of
        appearance; empty when there are none.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.findall(r'#\S+', text)

def mention_extractor(text):
    """Return every user mention found in ``text``.

    A mention is an ``@`` followed by one or more non-whitespace characters,
    so a lone ``@`` is not reported. Matching is whitespace-delimited, which
    means trailing punctuation stays part of the mention.

    Args:
        text: Tweet text to scan.

    Returns:
        list[str]: Mentions (including the leading ``@``) in order of
        appearance; empty when there are none.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.findall(r'@\S+', text)

In [None]:
# Store each tweet's hashtags as a JSON-encoded list. Only 'Text' is needed,
# so apply over that column instead of building a Series per row with axis=1.
df['HashTags'] = df['Text'].apply(lambda text: json.dumps(hash_extractor(text)))

In [None]:
# Preview the first rows with the new 'HashTags' column.
df.head()

In [None]:
# Store each tweet's user mentions as a JSON-encoded list. Only 'Text' is
# needed, so apply over that column instead of building a Series per row.
df['UserMentions'] = df['Text'].apply(lambda text: json.dumps(mention_extractor(text)))

In [None]:
# Preview the first rows with the new 'UserMentions' column.
df.head()

### Let's add a field with the sentiment score of each tweet

In [None]:
#!pip install -U textblob

In [None]:
from textblob import TextBlob

In [None]:
# Score every tweet with the polarity that TextBlob reports for its text.
df['SentimentScore'] = df['Text'].apply(lambda text: TextBlob(text).sentiment.polarity)

In [None]:
# Preview the first rows with the new 'SentimentScore' column.
df.head()

### Finally package the data and send to Cloudant

In [None]:
#!pip install cloudant

In [None]:
from cloudant.client import Cloudant
from cloudant.error import CloudantException
from cloudant.result import Result, ResultByKey
from cloudant.database import CloudantDatabase

# NOTE(review): 'XXX' looks like a redacted placeholder -- real values are needed to connect.
username = "XXX"
apikey = "XXX"

# Build the Cloudant client using IAM authentication, then open the session
# that the database cells below rely on.
client = Cloudant.iam(username, apikey)
client.connect()

In [None]:
# Bind `db` to the 'sentimentdb' database, creating it on first use.
db = CloudantDatabase(client, 'sentimentdb')
if not db.exists():
    db.create()
    print('DB created')
else:
    print('DB already exists')

In [None]:
def uploadData(db, l):
    """Bulk-upload ``l`` to ``db`` and report which documents were stored.

    Args:
        db: Database object exposing ``bulk_docs(docs)``.
        l: List of documents to upload.

    Returns:
        tuple: ``(failed, stored)`` where ``failed`` is the list of documents
        whose result entry was not marked ok (in input order) and ``stored``
        is the number of documents that were accepted.
    """
    result = db.bulk_docs(l)
    failed = []
    stored = 0
    # Result entries are paired with the submitted documents by position.
    for outcome, doc in zip(result, l):
        # Rejected documents come back with 'error'/'reason' and no 'ok' key,
        # so use .get() rather than indexing to avoid a KeyError.
        if outcome.get('ok'):
            stored += 1
        else:
            failed.append(doc)
    return (failed, stored)

In [None]:
from time import sleep
# Upload the DataFrame in batches of 10,000 documents. `l` is rebound to the
# first element returned by uploadData, so those documents stay queued and go
# out again with the next batch. `num` counts documents stored so far.
num = 0
l = []
for index, row in df.iterrows():
    json_doc = dict(
        id=index,
        date=row['Date'],
        text=row['Text'],
        timestamp=int(row['TimeStamp']),
        location=row['Location'],
        hash_tags=row['HashTags'],
        users_mentioned=row['UserMentions'],
        sentiment_score=row['SentimentScore'],
    )
    l.append(json_doc)
    if len(l) < 10000:
        continue
    l, i = uploadData(db, l)
    num += i
    print('\rObject : {}/{}'.format(num, df.shape[0]), end='')

# Send whatever is left over after the last full batch.
if l:
    l, i = uploadData(db, l)
    num += i
    print('\rObject : {}/{}'.format(num, df.shape[0]), end='')
print('Done')

In [None]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
from project_lib import Project
# NOTE(review): 'XXX' looks like a redacted placeholder -- real project id/token are needed.
project = Project(project_id='XXX', project_access_token='XXX')
pc = project.project_context  # NOTE(review): not used anywhere in the visible notebook
# Serialise the enriched DataFrame to CSV text (index column omitted) and save
# it to the project as 'final_data.csv', with overwrite enabled.
project.save_data('final_data.csv', df.to_csv(index=False), overwrite=True)

In [None]:
import json
import datetime
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Stub ``__iter__`` that is bound onto the downloaded object body.

    It exists only so that ``hasattr(body, "__iter__")`` becomes true for
    the object it is attached to; the stub itself always returns 0.
    """
    return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
# Builds the S3-style client used just below to download 'final_data.csv'.
# It is configured for OAuth signing against the IAM token endpoint.
# NOTE(review): 'XXX' looks like a redacted placeholder -- a real API key is needed to run this.
client_2bfc56ae673c4e3aad4c3da569258436 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='XXX',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

# Fetch 'final_data.csv' from the bucket and load it into `df`.
body = client_2bfc56ae673c4e3aad4c3da569258436.get_object(
    Bucket='sentimentanalysisproject-donotdelete-pr-pkfzekostvak36',
    Key='final_data.csv',
)['Body']

# pandas needs a file-like object; attach the stub __iter__ when the body
# does not already provide one.
if not hasattr(body, "__iter__"):
    body.__iter__ = types.MethodType(__iter__, body)

df = pd.read_csv(body)

In [None]:
# Preview the first rows of the reloaded DataFrame.
df.head()

In [None]:
# Rebind `db` to the 'resultdb' database, creating it on first use.
db = CloudantDatabase(client, 'resultdb')
if not db.exists():
    db.create()
    print('DB created')
else:
    print('DB already exists')

In [None]:
def _new_counts(with_total=False):
    """Return a zeroed sentiment tally, optionally with a trailing 'total' key."""
    # The 'negetive' spelling is the key the stored documents use; keep it.
    counts = {'positive': 0, 'negetive': 0, 'neutral': 0}
    if with_total:
        counts['total'] = 0
    return counts


def _sentiment_label(score):
    """Map a sentiment score to the tally key it increments."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negetive'
    return 'neutral'


# Aggregate tweet sentiment four ways in a single pass over the DataFrame:
#   dates[ts]           -> tally per timestamp
#   location[loc][ts]   -> tally per location and timestamp,
#   location[loc]['total'] -> tally per location over all timestamps
#   hashtags[tag]       -> tally per hashtag, plus 'total' occurrences
#   usermention[user]   -> tally per mention, plus 'total' occurrences
dates = {}
location = {}
hashtags = {}
usermention = {}
for _, row in df.iterrows():
    ts = str(int(row['TimeStamp']))
    label = _sentiment_label(row['SentimentScore'])

    # Per-date tally.
    if ts not in dates:
        dates[ts] = _new_counts()
    dates[ts][label] += 1

    # Per-location tally (overall and per date).
    # NOTE(review): if 'Location' can be missing (NaN) it is used as a dict
    # key as-is -- confirm that is intended.
    loc = row['Location']
    if loc not in location:
        location[loc] = {'total': _new_counts()}
    if ts not in location[loc]:
        location[loc][ts] = _new_counts()
    location[loc][ts][label] += 1
    location[loc]['total'][label] += 1

    # Per-hashtag tally; 'HashTags' holds a JSON-encoded list.
    for ht in json.loads(row['HashTags']):
        if ht not in hashtags:
            hashtags[ht] = _new_counts(with_total=True)
        hashtags[ht]['total'] += 1
        hashtags[ht][label] += 1

    # Per-mention tally; 'UserMentions' holds a JSON-encoded list.
    for um in json.loads(row['UserMentions']):
        if um not in usermention:
            usermention[um] = _new_counts(with_total=True)
        usermention[um]['total'] += 1
        usermention[um][label] += 1

In [None]:
# Drop hashtags that occur fewer than 20 times.
rare_tags = [tag for tag, counts in hashtags.items() if counts['total'] < 20]
for tag in rare_tags:
    del hashtags[tag]

In [None]:
# Drop user mentions that occur fewer than 10 times.
rare_users = [user for user, counts in usermention.items() if counts['total'] < 10]
for user in rare_users:
    del usermention[user]

In [None]:
# Store the per-date tallies as the document with _id 'date'.
json_doc = dict(_id='date', date=dates)
db.create_document(json_doc)

In [None]:
# Store the per-location tallies as the document with _id 'location'.
json_doc = dict(_id='location', location=location)
db.create_document(json_doc)

In [None]:
# Store the per-hashtag tallies as the document with _id 'hashtag'.
json_doc = dict(_id='hashtag', hashtag=hashtags)
db.create_document(json_doc)

In [None]:
# Store the per-mention tallies as the document with _id 'usermention'.
json_doc = dict(_id='usermention', usermention=usermention)
db.create_document(json_doc)