# Initialize database files

In [None]:
%cd twitteranalysis
from DataTools.SqliteTools import initialize_master_db, delete_master_db

delete_master_db()
initialize_master_db()

# Run the user parser

In [None]:
%cd twitteranalysis
%run -i Executables/process_user_descriptions_into_words2.py

# Clear queue

In [None]:
%cd twitteranalysis

from Servers.ClientSide import Client
c = Client()

# Each of the listening request handlers needs its queue flushed
c.send_flush_command()

#j = c.send_shutdown_command()

In [None]:
j

# Merge the various sqlite files

In [None]:

%cd twitteranalysis
import environment
import sqlite3

mainDB = environment.MASTER_DB
columnNames = "word_index, sentence_index, word, user_id, tweet_id"
    
otherDbNames = [ 'wordmapping%s.db' % (i) 
            for i in range( 0, environment.MAX_DB_FILES +1)]

otherDBs = [ '%s/%s' % (environment.DB_FOLDER, name) 
            for name in otherDbNames]
    
for db in otherDbNames:
    conn = sqlite3.connect( mainDB )  # Connect to the main database
    curs = conn.cursor()  # Connect a cursor
    
    dbPath = '%s/%s' % (environment.DB_FOLDER, db)
    curs.execute( "ATTACH DATABASE ? as ? ;", (dbPath, 'db') )

    dbNameTableName = "db.word_map_deux"
    tableName = 'word_map_deux'
    curs.execute( "INSERT INTO %s (%s) SELECT %s FROM %s;" % (tableName, columnNames, columnNames, dbNameTableName) )
    conn.commit()
    curs.close()
    conn.close()


# Check data

In [None]:
%cd twitteranalysis
import environment
import sqlite3

In [None]:
actualUsers = 1328927
numberProcessed = 4352
numberEmpty = 332
expectedUsers = numberProcessed - numberEmpty
print('should have %s' % expectedUsers)

In [None]:
conn = sqlite3.connect(environment.MASTER_DB)
r1 = conn.execute("select count( distinct user_id) from word_map_deux")
num_users = r1.fetchone()[0]
print("%s Unique user ids" % num_users)
r2 = conn.execute("select count( word) from word_map_deux")
print("%s rows in master.db" % r2.fetchone())
conn.close()

In [None]:
missing = expectedUsers - num_users
pct_problem = missing / numberProcessed
expected_missing = round(actualUsers * pct_problem)

print("%s users were not saved; this is %s pct of the total processed" %( missing, pct_problem))
print("Projecting %s problem cases" % expected_missing)

NB: 
    - numberUsers processed = 4352
    - users w empty descriptions = 332
    - users non-english = 129
    - expected users: 4020 (less empty)

5/15 6.06
    - master only 
    - 3984 unique ids based on master
    - 67500 rows in master
    - 36 missing
    - 10993 projected problems
    
5/15 1.36
    - lock added to sqlite writer
    - 4085 unique ids based on files
    - 3950 unique ids based on master
    - 67002 rows in master and based on files


5/15 11.12
    - 4059 unique ids based on files
    - 3923 unique ids based on master
    - 66507 rows in master and based on files

5/15 10.39
    - 3653 unique ids based on files
    - 3529 unique ids based on master
    - 60060 rows in master and based on files
    

5/14 9.00
    - 4014 unique ids based on files
    - 2510 unique ids based on master
    - 41973 rows in master and based on files


Before changed to class method w separate call to flush
    - 3591 Unique user ids
    - 47952 rows in master.db

After
    - 4014 Unique user ids
    - 67940 rows in master.db
    



# Figure out what users are missing

In [1]:
%cd twitteranalysis
import sqlite3
import environment
import DataTools.Cursors
cursor = DataTools.Cursors.WindowedUserCursor( language='en' )


(bookmark:twitteranalysis) -> /Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis
/Users/adam/Dropbox/PainNarrativesLab/TwitterDataAnalysis
connection ready


In [2]:
conn = sqlite3.connect(environment.MASTER_DB)
curs = conn.cursor()  # Connect a cursor
    
missing = []

while True:
    user = cursor.next()
    q = "select * from word_map_deux where user_id = %s" % user.userID
    r1 = curs.execute(q)
    r = r1.fetchone()
    if r is None:
        missing.append(user.userID)


curs.close()
conn.close()
print(len(missing))

StopIteration: 

In [3]:
import pandas as pd
from DataTools.DataConnections import MySqlConnection, DAO
conn = MySqlConnection(environment.CREDENTIAL_FILE)
conn._make_engine()

In [4]:
descripts = []
for userId in missing:
    q = 'select description from users where userID = %s' % userId
    v = pd.read_sql_query(q, conn.engine).iloc[0].values[0]
    descripts.append((userId, v))
len(descripts)

425

In [5]:
substantive = [x for x in descripts if x[1] != '']
substantive_ids = [x[0] for x in descripts if x[1] != '']
len(substantive)

93

In [6]:
substantive

[(168391959, '**SKYHIGH**'),
 (401742354, '✌'),
 (452749712, ':)'),
 (818392459,
  "Author of Through Whose Eyes: Rise, Child of God and Toni's Blues. https://t.co/fhT0Jsawce #SURVIVOR #CHILDOFGOD Arizona"),
 (819644227,
  'Always evolving, always changing. Liberal Democrat. Not politically correct. Live in free association. Goal: To experience reality.'),
 (820011216,
  "...Some see us as fish out of the water, trouble makers, and oddballs. But really we are gifted, profound, and uniquely brilliant ... MMOT.. We aren't criminals"),
 (820290150,
  'Registered Dietitian,  Lactation Consultant, Crohns disease.  Passions include: my daughter, the Ocean/ Marine Biology,  Animal rights, Music....'),
 (821616540,
  "The Council of Academic Hospitals of Ontario (CAHO) is the association of Ontario's 24 research hospitals - a focal point for their strategic initiatives."),
 (821962573,
  'Official Twitter Page for California Pain Management. We treat many different Chronic & Acute Pain Conditi

These seem to be due to some problem with the filters. 
they may be behaving as intended...

In [None]:
pd.read_csv?

In [7]:
pe = pd.read_csv("%s/processing-enque.csv" % environment.PROFILING_LOG_FOLDER_PATH, header=None, names=['timestamp', 'userid', 'note'])
len(pe)

4352

In [11]:
p = pe[pe.userid.isin(missing)]

In [14]:
p

Unnamed: 0,timestamp,userid,note
49,2018-05-15T18:35:48.052669,13688882,userid
56,2018-05-15T18:35:48.062851,14094929,userid
177,2018-05-15T18:35:48.318133,16816694,userid
181,2018-05-15T18:35:48.324236,16895968,userid
197,2018-05-15T18:35:48.344990,17294750,userid
227,2018-05-15T18:35:48.393872,17977483,userid
243,2018-05-15T18:35:48.427504,18681791,userid
291,2018-05-15T18:35:48.515352,19888843,userid
300,2018-05-15T18:35:48.528650,19967178,userid
352,2018-05-15T18:35:48.652898,21157018,userid


uids = []
rows = []
print("Unique user ids; rows")
for db in otherDbNames:
    dbPath = '%s/%s' % (environment.DB_FOLDER, db)
    conn = sqlite3.connect( dbPath )  # Connect to the main database
    curs = conn.cursor()  # Connect a cursor
    r1 = conn.execute("select count( distinct user_id) from word_map_deux")
    v =  r1.fetchone()
    uids.append(v[0])
    r2 = conn.execute("select count( word) from word_map_deux")
    v2 =  r2.fetchone()
    rows.append(v2[0])

    print("%s : %s; %s " % (db, v[0], v2[0]))
    conn.close()


In [None]:
conn = sqlite3.connect(environment.MASTER_DB)
r1 = conn.execute("select user_id, count(word) from word_map_deux group by user_id ")
print( r1.fetchone())
conn.close()

In [None]:
conn = sqlite3.connect(environment.MASTER_DB)
curs = conn.cursor()  # Connect a cursor
    
r1 = curs.execute("select * from word_map_deux where user_id = 2")
# for i in range(0, 100):
print( r1.fetchone())

curs.close()

In [None]:
"""The above tells me that the problem is the whole store getting sent to the server"""

# Explore data

In [None]:
def master_row_generator():
    conn = sqlite3.connect(environment.MASTER_DB)
    r = conn.execute("select * from word_map_deux")
    while True:
        yield r.fetchone()

def db_row_generator(filepath):
    conn = sqlite3.connect(filepath)
    r = conn.execute("select * from word_map_deux")
    while True:
        yield r.fetchone()


In [None]:
gen = master_row_generator()

In [None]:
for i in range(0, 220):
    print(next(gen))

In [None]:
f = '/Users/adam/Desktop/TwitterDataAnalysisLogs/dbs/wordmapping1.db'
dgen = db_row_generator(f)
for i in range(0, 220):
    print(next(dgen))

In [None]:
f = '/Users/adam/Desktop/TwitterDataAnalysisLogs/dbs/wordmapping1.db'
conn = sqlite3.connect(f)
r = conn.execute("select * from word_map_deux")
for i in range(0, 5):
    print(r.fetchone())


In [None]:
f = '/Users/adam/Desktop/TwitterDataAnalysisLogs/dbs/wordmapping9.db'
conn = sqlite3.connect(f)
r = conn.execute("select * from word_map_deux")
for i in range(0, 5):
    print(r.fetchone())


In [None]:

f = '/Users/adam/Desktop/TwitterDataAnalysisLogs/master.db'
conn = sqlite3.connect(f)
r = conn.execute("select count(word) from word_map_deux")
print(r.fetchone())
conn.close()

# Timing

In [None]:
%cd twitteranalysis

from functools import wraps
import time

from Loggers.CsvLoggers import log_query, log_query_timestamp
import environment


def time_and_log(fn):
    """
    Decorator to time operation of method
    From High Performance Python, p.27
    """
    @wraps(fn)
    def measure_time(*args, **kwargs):
        t1 = time.time()
        result = fn(*args, **kwargs)
        t2 = time.time()
        t2 = time.time()
        elapsed = t2 - t1
        log_query( elapsed )
        log_query_timestamp()
#         print(("@timefn:%s took %s seconds" % (fn.__name__, elapsed)))
        return result
    return measure_time


In [None]:

@time_and_log
def test():
    time.sleep(2)
    print('done')
    

In [None]:
test()