In [1]:
import pandas as pd
import sqlite3

## 1. Create a connection to the database using the library sqlite3.

In [2]:
# Location of the SQLite file, kept in one named place so it is easy to change.
DB_PATH = 'data/checking-logs.sqlite'
con = sqlite3.connect(DB_PATH)

## 2. Create a new table datamart in the database by joining the tables pageviews and checker using only one query.

In [3]:
# Inspect the column layout of `checker` before writing the join.
pd.read_sql(sql='PRAGMA table_info(checker);', con=con)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,status,TEXT,0,,0
2,2,success,INTEGER,0,,0
3,3,timestamp,TIMESTAMP,0,,0
4,4,numTrials,INTEGER,0,,0
5,5,labname,TEXT,0,,0
6,6,uid,TEXT,0,,0


In [4]:
# Inspect the column layout of `pageviews` before writing the join.
pd.read_sql(sql='PRAGMA table_info(pageviews);', con=con)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,uid,TEXT,0,,0
2,2,datetime,TIMESTAMP,0,,0


In [5]:
# Single statement that builds the `datamart` table:
#   * first_views - earliest pageviews.datetime per uid;
#   * main SELECT - checker rows with status 'ready', numTrials = 1, one of the
#                   listed labs and a uid starting with "user_", LEFT JOINed so
#                   that uids without any page view keep a NULL first_view_ts.
# In SQL LIKE, "_" is a single-character wildcard, so the pattern 'user_%'
# would also accept uids such as 'userX1'. The underscore is therefore escaped
# (ESCAPE '!') to require the literal prefix "user_".
query = """
CREATE TABLE datamart AS
WITH first_views AS (
    SELECT uid, MIN(datetime) AS first_view_ts
    FROM pageviews
    GROUP BY uid
)
SELECT
    ch.uid,
    ch.labname,
    ch.timestamp AS first_commit_ts,
    fv.first_view_ts
FROM checker ch
LEFT JOIN first_views fv
    ON ch.uid = fv.uid
WHERE
    ch.status = 'ready'
    AND ch.numTrials = 1
    AND ch.labname IN ('laba04', 'laba04s', 'laba05', 'laba06', 'laba06s', 'project1')
    AND ch.uid LIKE 'user!_%' ESCAPE '!'
"""

In [6]:
# Rebuild the table from scratch so the cell can be re-run safely:
# drop any previous copy, create the new one, then commit.
for statement in ("DROP TABLE IF EXISTS datamart", query):
    con.execute(statement)
con.commit()

In [7]:
# Load the freshly built table, parsing both timestamp columns as datetimes.
timestamp_columns = ['first_commit_ts', 'first_view_ts']
datamart = pd.read_sql(
    sql='SELECT * FROM datamart;',
    con=con,
    parse_dates=timestamp_columns,
)

In [8]:
# Non-null values per column; a lower figure for first_view_ts means missing views.
datamart.notna().sum()

uid                140
labname            140
first_commit_ts    140
first_view_ts       59
dtype: int64

## 3. Using Pandas methods, create two dataframes: test and control.

In [9]:
# Test group: rows whose first_view_ts is present.
viewed_mask = datamart['first_view_ts'].notna()
test = datamart.loc[viewed_mask].copy()
test.count()

uid                59
labname            59
first_commit_ts    59
first_view_ts      59
dtype: int64

In [10]:
# Sanity check: the test group must contain no missing first_view_ts (expect 0).
int(test['first_view_ts'].isna().sum())

0

In [11]:
# Control group: rows whose first_view_ts is missing.
never_viewed_mask = datamart['first_view_ts'].isna()
control = datamart.loc[never_viewed_mask].copy()
control.count()

uid                81
labname            81
first_commit_ts    81
first_view_ts       0
dtype: int64

In [12]:
# Sanity check: the control group must contain no present first_view_ts (expect 0).
int(control['first_view_ts'].notna().sum())

0

In [13]:
# Mean first-view timestamp of the test group, used to fill the control group.
# NOTE(review): the mean is taken over rows, and a uid can appear once per lab,
# so uids with several labs weigh more - confirm a per-row mean is intended.
test_first_views = test['first_view_ts']
mean_time = test_first_views.mean()
mean_time

Timestamp('2020-04-27 00:40:05.761783552')

In [14]:
# Control rows have no first_view_ts; substitute the test group's mean timestamp.
imputed_first_view = control['first_view_ts'].fillna(mean_time)
control = control.assign(first_view_ts=imputed_first_view)
control.head()

Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
0,user_4,project1,2020-04-17 05:19:02.744528,2020-04-27 00:40:05.761783552
1,user_4,laba04,2020-04-17 11:33:17.366400,2020-04-27 00:40:05.761783552
2,user_4,laba04s,2020-04-17 11:48:41.992466,2020-04-27 00:40:05.761783552
5,user_2,laba04,2020-04-18 13:42:35.482008,2020-04-27 00:40:05.761783552
6,user_2,laba04s,2020-04-18 13:51:22.291271,2020-04-27 00:40:05.761783552


In [15]:
# Persist both groups as tables, overwriting earlier copies and skipping the index.
write_options = dict(con=con, if_exists='replace', index=False)
test.to_sql('test', **write_options)
control.to_sql('control', **write_options)

81

In [16]:
# Release the SQLite connection; nothing below this point uses the database.
con.close()