## ⅍ Aggregations

#### 1. 🔗 Подключаемся к базе данных

In [50]:
import pandas as pd
import sqlite3

db_path = "checking-logs.sqlite"
conn = sqlite3.connect(db_path)

#### 2. Получение схемы таблиц test  и deadlines

In [51]:
schema_query = "PRAGMA table_info(test);"
schema = pd.io.sql.read_sql(schema_query, conn)
print("Схема таблицы test:")
print(schema)
schema_query = "PRAGMA table_info(deadlines);"
schema = pd.io.sql.read_sql(schema_query, conn)
print("\nСхема таблицы deadlines:")
print(schema)

Схема таблицы test:
   cid             name       type  notnull dflt_value  pk
0    0              uid       TEXT        0       None   0
1    1          labname       TEXT        0       None   0
2    2  first_commit_ts  TIMESTAMP        0       None   0
3    3    first_view_ts  TIMESTAMP        0       None   0

Схема таблицы deadlines:
   cid       name     type  notnull dflt_value  pk
0    0      index  INTEGER        0       None   0
1    1       labs     TEXT        0       None   0
2    2  deadlines  INTEGER        0       None   0


#### 3. Получение первых 10 строк таблиц test и deadlines

In [52]:
preview_query = "SELECT * FROM test LIMIT 10;"
preview = pd.io.sql.read_sql(preview_query, conn)
print("\nПервые 10 строк таблицы test:")
print(preview)
preview_query = "SELECT * FROM deadlines LIMIT 10;"
preview = pd.io.sql.read_sql(preview_query, conn)
print("\nПервые 10 строк таблицы deadlines:")
print(preview)


Первые 10 строк таблицы test:
       uid   labname             first_commit_ts               first_view_ts
0   user_1    laba04  2020-04-26 17:06:18.462708  2020-04-26 21:53:59.624136
1   user_1   laba04s  2020-04-26 17:12:11.843671  2020-04-26 21:53:59.624136
2   user_1    laba05  2020-05-02 19:15:18.540185  2020-04-26 21:53:59.624136
3   user_1    laba06  2020-05-17 16:26:35.268534  2020-04-26 21:53:59.624136
4   user_1   laba06s  2020-05-20 12:23:37.289724  2020-04-26 21:53:59.624136
5   user_1  project1  2020-05-14 20:56:08.898880  2020-04-26 21:53:59.624136
6  user_10    laba04  2020-04-25 08:24:52.696624  2020-04-18 12:19:50.182714
7  user_10   laba04s  2020-04-25 08:37:54.604222  2020-04-18 12:19:50.182714
8  user_10    laba05  2020-05-01 19:27:26.063245  2020-04-18 12:19:50.182714
9  user_10    laba06  2020-05-19 11:39:28.885637  2020-04-18 12:19:50.182714

Первые 10 строк таблицы deadlines:
   index      labs   deadlines
0      0    laba04  1587945599
1      1   laba04s  1587

#### 4. SQL-запрос для минимального delta между первым коммитом и дедлайном

In [53]:
query_min = """
SELECT t.uid, 
       MIN((julianday(datetime(d.deadlines, 'unixepoch')) - julianday(t.first_commit_ts)) * 24) AS delta_hours
FROM test AS t
JOIN deadlines AS d ON LOWER(t.labname) = LOWER(d.labs)
WHERE t.labname != 'project1'
GROUP BY t.uid
ORDER BY delta_hours ASC
LIMIT 1;
"""

df_min = pd.read_sql(query_min, conn)
print(df_min.head())

       uid  delta_hours
0  user_25     2.867236


#### 5. SQL-запрос для максимального delta

In [54]:
query_max = """
SELECT t.uid, 
       MAX((julianday(datetime(d.deadlines, 'unixepoch')) - julianday(t.first_commit_ts)) * 24) AS delta_hours
FROM test AS t
JOIN deadlines AS d ON LOWER(t.labname) = LOWER(d.labs)
WHERE t.labname != 'project1'
GROUP BY t.uid
ORDER BY delta_hours DESC
LIMIT 1;
"""
df_max = pd.read_sql(query_max, conn)
print(df_max.head())

       uid  delta_hours
0  user_30    202.38473


#### 6. SQL-запрос для среднего delta

In [55]:
query_avg = """
SELECT AVG((julianday(datetime(d.deadlines, 'unixepoch')) - julianday(t.first_commit_ts)) * 24) AS avg_delta_hours
FROM test AS t
JOIN deadlines AS d ON LOWER(t.labname) = LOWER(d.labs)
WHERE t.labname != 'project1';
"""
df_avg = pd.read_sql(query_avg, conn)
print(df_avg.head())

   avg_delta_hours
0        89.687686


#### 7. SQL-запрос для связи количества просмотров и delta

In [56]:
query_views_diff = """
SELECT t.uid, 
       AVG((julianday(datetime(d.deadlines, 'unixepoch')) - julianday(t.first_commit_ts)) * 24) AS avg_diff,
       COUNT(p.datetime) AS pageviews
FROM test AS t
JOIN deadlines AS d ON LOWER(t.labname) = LOWER(d.labs)
LEFT JOIN pageviews AS p ON t.uid = p.uid
WHERE t.labname != 'project1'
GROUP BY t.uid;
"""
views_diff = pd.read_sql(query_views_diff, conn)
print(views_diff.head())

       uid    avg_diff  pageviews
0   user_1   65.119644        140
1  user_10   75.242310        445
2  user_14  159.568696        429
3  user_17   62.207513        235
4  user_18    6.367907          9


#### 8. Корреляция

In [57]:
correlation = views_diff[['avg_diff', 'pageviews']].corr().iloc[0, 1]

conn.close()

print("\nКоэффициент корреляции между средним delta и количеством просмотров:","\n" "\n                   ",correlation)


Коэффициент корреляции между средним delta и количеством просмотров: 

                    0.18504199436324814
