In [1]:
import os
import sqlite3
import warnings

import pandas as pd
import numpy as np

# Silence every warning raised after this point.
# NOTE(review): this blanket filter also hides pandas warnings (for example
# chained-assignment warnings); consider narrowing it to specific categories.
warnings.simplefilter("ignore")

## Loading data

In [2]:
# Directory that holds the CSV files read by load_df.
ROOT_DIR = "travel_stackexchange_com"
# Path of the SQLite database file used by the SQL side of every exercise.
BASE = "solution.db"

# Remove a database file left over from a previous run, so the connection
# opened below does not see tables written earlier.
if os.path.exists(BASE):
    os.remove(BASE)

# Connection used as the default `conn` by load_to_db and query_db.
CONN = sqlite3.connect(BASE)


def load_df(df_name, extension="csv", dir=ROOT_DIR, **kwargs):
    """Read ``<dir>/<df_name>.<extension>`` with ``pd.read_csv``.

    Args:
        df_name: File name without its extension.
        extension: File extension, without the leading dot.
        dir: Directory containing the file.
        **kwargs: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    file_name = ".".join((df_name, extension))
    file_path = os.path.join(dir, file_name)
    return pd.read_csv(file_path, **kwargs)


def load_to_db(df, name, conn=CONN, if_exists="replace"):
    """Write ``df`` to the table ``name`` through ``conn``.

    Args:
        df: DataFrame to store.
        name: Name of the destination table.
        conn: Database connection handed to ``DataFrame.to_sql``.
        if_exists: Forwarded to ``DataFrame.to_sql``. The default
            ``"replace"`` overwrites an existing table, so a cell that calls
            this function can be re-run without failing because the table
            already exists; pass ``"fail"`` to restore the strict behaviour.
    """
    df.to_sql(name, conn, if_exists=if_exists)


def query_db(query, conn=CONN):
    """Run the SQL text ``query`` on ``conn`` and return the rows as a DataFrame."""
    result = pd.read_sql_query(sql=query, con=conn)
    return result


def assert_equal(df1, df2):
    """Raise ``AssertionError`` unless ``df1.equals(df2)`` is true.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Raises:
        AssertionError: If ``df1.equals(df2)`` returns a falsy value. The
            message reports both shapes to give a first hint about the
            difference.
    """
    # Raise explicitly instead of using `assert`, so the check is not removed
    # when Python runs with optimisations enabled (-O).
    if not df1.equals(df2):
        raise AssertionError(
            "DataFrames do not match (shapes {} vs {})".format(df1.shape, df2.shape)
        )

In [3]:
# Read the three CSV files from the default data directory, one DataFrame each.
Posts = load_df(df_name="Posts")
Users = load_df(df_name="Users")
Comments = load_df(df_name="Comments")

In [4]:
# Store each DataFrame in the database under the name the SQL queries use.
for table_name, frame in (("Posts", Posts), ("Users", Users), ("Comments", Comments)):
    load_to_db(frame, table_name)

## Ex 1

In [5]:
# Reference SQL for Ex 1: total UpVotes per non-empty Location, keeping the
# 10 locations with the largest totals.
ex1_query = """
SELECT Location, SUM(UpVotes) as TotalUpVotes 
FROM Users
WHERE Location != ''
GROUP BY Location
ORDER BY TotalUpVotes DESC LIMIT 10
"""

In [6]:
# Run the reference SQL; the result is the expected value for the pandas version.
ex1_sql = query_db(ex1_query)

In [7]:
# Pandas counterpart of ex1_query: sum UpVotes per Location, order the totals
# from largest to smallest and keep the first 10 rows.
users_with_location = Users.loc[Users["Location"] != ""]
upvotes_by_location = users_with_location.groupby("Location")["UpVotes"].sum()
ranked_locations = upvotes_by_location.rename("TotalUpVotes").sort_values(
    ascending=False
)
ex1_pd = ranked_locations.reset_index().head(10)

In [8]:
# The pandas result must be identical to the SQL result.
assert_equal(ex1_sql, ex1_pd)

## Ex 2

In [9]:
# Reference SQL for Ex 2: for posts with PostTypeId 1 or 2, count the posts and
# take the maximum Score per (Year, Month) of CreationDate, keeping only the
# months with more than 1000 posts.
ex2_query = """
SELECT STRFTIME('%Y', CreationDate) AS Year, STRFTIME('%m', CreationDate) AS Month, COUNT(*) AS PostsNumber, MAX(Score) AS MaxScore
FROM Posts
WHERE PostTypeId IN (1, 2) 
GROUP BY Year, Month
HAVING PostsNumber > 1000
"""

In [10]:
# Run the reference SQL; the result is the expected value for the pandas version.
ex2_sql = query_db(ex2_query)

In [11]:
# Pandas counterpart of ex2_query.
# Select the rows and columns in a single .loc call and add Year/Month with
# assign(), so no column is ever written onto a filtered slice of Posts.
ex2_posts = Posts.loc[Posts["PostTypeId"].isin([1, 2]), ["CreationDate", "Score"]]
ex2_dates = pd.to_datetime(ex2_posts["CreationDate"])

# Named aggregation computes both statistics in one pass, so the two columns
# are aligned by group key rather than by row position.
# "size" counts rows per group, which is what COUNT(*) does in the SQL query.
ex2_monthly = (
    ex2_posts.assign(
        Year=ex2_dates.dt.strftime("%Y"),
        Month=ex2_dates.dt.strftime("%m"),
    )
    .groupby(["Year", "Month"])
    .agg(PostsNumber=("Score", "size"), MaxScore=("Score", "max"))
    .reset_index()
)

# Equivalent of HAVING PostsNumber > 1000; renumber the rows from zero so the
# index matches the one returned by the SQL query.
ex2_pd = ex2_monthly[ex2_monthly["PostsNumber"] > 1000].reset_index(drop=True)

In [12]:
# The pandas result must be identical to the SQL result.
assert_equal(ex2_sql, ex2_pd)

## Ex 3

In [13]:
# Helper query for Ex 3: total ViewCount per OwnerUserId over posts with
# PostTypeId = 1. Its result is stored as the "Questions" table.
Questions_query = """
SELECT OwnerUserId, SUM(ViewCount) as TotalViews
FROM Posts
WHERE PostTypeId = 1
GROUP BY OwnerUserId
"""
# Reference SQL for Ex 3: join the "Questions" table with Users and keep the
# 10 rows with the largest TotalViews.
ex3_query = """
SELECT Id, DisplayName, TotalViews
FROM Questions
JOIN Users
ON Users.Id = Questions.OwnerUserId
ORDER BY TotalViews DESC
LIMIT 10
"""

In [14]:
# ex3_query reads from a table named "Questions", so the intermediate result
# is written to the database before the final query runs.
Questions = query_db(Questions_query)
load_to_db(Questions, "Questions")
ex3_sql = query_db(ex3_query)

In [15]:
# Pandas counterpart of Questions_query: total ViewCount per OwnerUserId over
# posts with PostTypeId == 1.
question_views = Posts.loc[Posts["PostTypeId"] == 1, ["OwnerUserId", "ViewCount"]]
views_per_owner = question_views.groupby("OwnerUserId")["ViewCount"].sum()
Questions = views_per_owner.rename("TotalViews").reset_index()

# Pandas counterpart of ex3_query: attach the user rows, keep the three output
# columns, then take the 10 largest totals with a fresh 0..9 index.
owners_with_views = Questions.merge(Users, left_on="OwnerUserId", right_on="Id")
ranked_owners = owners_with_views[["Id", "DisplayName", "TotalViews"]].sort_values(
    "TotalViews", ascending=False
)
ex3_pd = ranked_owners.head(10).reset_index(drop=True)

In [16]:
# The pandas result must be identical to the SQL result.
assert_equal(ex3_sql, ex3_pd)

## Ex 4

In [17]:
# Reference SQL for Ex 4: count posts per OwnerUserId separately for
# PostTypeId = 2 (AnswersNumber) and PostTypeId = 1 (QuestionsNumber), keep
# owners whose AnswersNumber exceeds QuestionsNumber, take the 5 with the most
# answers, and join them with Users for the profile columns.
ex4_query = """
SELECT DisplayName, QuestionsNumber, AnswersNumber, Location, Reputation, UpVotes, DownVotes
FROM (
        SELECT * FROM 
            (
                SELECT COUNT(*) as AnswersNumber, OwnerUserId FROM Posts
                WHERE PostTypeId = 2
                GROUP BY OwnerUserId
            ) AS Answers 
        JOIN
            (
                SELECT COUNT(*) as QuestionsNumber, OwnerUserId FROM Posts
                WHERE PostTypeId = 1
                GROUP BY OwnerUserId
            ) AS Questions
        ON Answers.OwnerUserId = Questions.OwnerUserId 
        WHERE AnswersNumber > QuestionsNumber
        ORDER BY AnswersNumber DESC
        LIMIT 5
    ) AS PostsCounts 
JOIN Users
ON PostsCounts.OwnerUserId = Users.Id
"""

In [18]:
# Run the reference SQL; the result is the expected value for the pandas version.
ex4_sql = query_db(ex4_query)

In [19]:
# Pandas counterpart of ex4_query.
# Posts per owner, counted separately for PostTypeId 2 and PostTypeId 1.
answer_posts = Posts.loc[Posts["PostTypeId"] == 2]
answers_per_owner = answer_posts.groupby("OwnerUserId")["OwnerUserId"].count()
Answers = answers_per_owner.rename("AnswersNumber").reset_index()

question_posts = Posts.loc[Posts["PostTypeId"] == 1]
questions_per_owner = question_posts.groupby("OwnerUserId")["OwnerUserId"].count()
Questions = questions_per_owner.rename("QuestionsNumber").reset_index()

# Owners present in both tables whose answer count exceeds their question
# count; keep the 5 with the most answers.
owners_with_both = Answers.merge(Questions, on="OwnerUserId")
more_answers = owners_with_both["AnswersNumber"] > owners_with_both["QuestionsNumber"]
PostsCounts = (
    owners_with_both.loc[more_answers]
    .sort_values("AnswersNumber", ascending=False)
    .head(5)
    .reset_index(drop=True)
)

# Output columns, in the order used by the SELECT list of ex4_query.
ex4_columns = [
    "DisplayName",
    "QuestionsNumber",
    "AnswersNumber",
    "Location",
    "Reputation",
    "UpVotes",
    "DownVotes",
]
ex4_pd = PostsCounts.merge(Users, left_on="OwnerUserId", right_on="Id")[ex4_columns]

In [20]:
# The pandas result must be identical to the SQL result.
assert_equal(ex4_sql, ex4_pd)

## Ex 5

In [21]:
# Helper query 1 for Ex 5: total comment Score per PostId. Its result is
# stored as the "CmtTotScr" table.
CmtTotScr_query = """
SELECT PostId, SUM(Score) AS CommentsTotalScore 
FROM Comments
GROUP BY PostId
"""
# Helper query 2 for Ex 5: attach the comment totals to posts with
# PostTypeId = 1. Its result is stored as the "PostsBestComments" table.
PostsBestComments_query = """
 SELECT Posts.OwnerUserId, Posts.Title, Posts.CommentCount, Posts.ViewCount, CmtTotScr.CommentsTotalScore
FROM CmtTotScr
JOIN Posts ON Posts.Id = CmtTotScr.PostId 
WHERE Posts.PostTypeId=1
"""
# Reference SQL for Ex 5: join "PostsBestComments" with Users and keep the
# 10 rows with the largest CommentsTotalScore.
ex5_query = """
SELECT Title, CommentCount, ViewCount, CommentsTotalScore, DisplayName, Reputation, Location 
FROM PostsBestComments
JOIN Users ON PostsBestComments.OwnerUserId = Users.Id 
ORDER BY CommentsTotalScore DESC
LIMIT 10
"""

In [22]:
# Each later query reads the table produced by the previous step, so the
# intermediate results are written to the database in order.
CmtTotScr = query_db(CmtTotScr_query)
load_to_db(CmtTotScr, "CmtTotScr")
PostsBestComments = query_db(PostsBestComments_query)
load_to_db(PostsBestComments, "PostsBestComments")
ex5_sql = query_db(ex5_query)

In [23]:
# Pandas counterpart of CmtTotScr_query: total comment Score per PostId.
score_per_post = Comments[["PostId", "Score"]].groupby("PostId")["Score"].sum()
CmtTotScr = score_per_post.rename("CommentsTotalScore").reset_index()

# Pandas counterpart of PostsBestComments_query: posts with PostTypeId == 1
# joined with their comment totals.
question_posts = Posts.loc[Posts["PostTypeId"] == 1]
questions_with_scores = question_posts.merge(
    CmtTotScr, left_on="Id", right_on="PostId"
)
PostsBestComments = questions_with_scores[
    ["OwnerUserId", "Title", "CommentCount", "ViewCount", "CommentsTotalScore"]
]

# Pandas counterpart of ex5_query: attach the owner rows, keep the output
# columns, then take the 10 largest comment totals with a fresh 0..9 index.
ex5_columns = [
    "Title",
    "CommentCount",
    "ViewCount",
    "CommentsTotalScore",
    "DisplayName",
    "Reputation",
    "Location",
]
owners_with_posts = Users.merge(
    PostsBestComments, left_on="Id", right_on="OwnerUserId"
)
ranked_posts = owners_with_posts[ex5_columns].sort_values(
    "CommentsTotalScore", ascending=False
)
ex5_pd = ranked_posts.head(10).reset_index(drop=True)

In [24]:
# The pandas result must be identical to the SQL result.
assert_equal(ex5_sql, ex5_pd)

## Exit actions

In [25]:
# Close the shared SQLite connection; query_db and load_to_db use CONN as
# their default connection, so they should not be called after this cell.
CONN.close()