### Download the Data
http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [1]:
!pip install wget



In [2]:
import wget
fn = wget.download('http://files.grouplens.org/datasets/movielens/ml-latest-small.zip')
fn

'ml-latest-small (1).zip'

In [3]:
!unzip ml-latest*

Archive:  ml-latest-small.zip
caution: filename not matched:  ml-latest-small (1).zip
caution: filename not matched:  ml-latest-small.zip


### Basic Info About Data

In [0]:
PATH = 'ml-latest-small'

In [5]:
!find $PATH -name '*.csv' | xargs wc -l | sort -nr

 124007 total
 100837 ml-latest-small/ratings.csv
   9743 ml-latest-small/movies.csv
   9743 ml-latest-small/links.csv
   3684 ml-latest-small/tags.csv


In [6]:
# find which seperator is used to seperate the columns of each csv file
!head -1 $PATH/*.csv


==> ml-latest-small/links.csv <==
movieId,imdbId,tmdbId

==> ml-latest-small/movies.csv <==
movieId,title,genres

==> ml-latest-small/ratings.csv <==
userId,movieId,rating,timestamp

==> ml-latest-small/tags.csv <==
userId,movieId,tag,timestamp


> * We can see that all files are comma-seperated.

### Read the Dataset

In [0]:
import pandas as pd
import numpy as np
import re
import os

from collections import defaultdict

In [0]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 1000)

In [0]:
db = defaultdict(pd.DataFrame)

for filename in os.listdir(PATH):
    if filename.endswith('.csv'):
        db[filename.split('.')[0]] = pd.read_csv(f"{PATH}/{filename}")
        

In [10]:
pd.DataFrame(db.items(), columns=['table_name', 'table_data'])

Unnamed: 0,table_name,table_data
0,ratings,userId movieId rating timestamp 0 1 1 4.0 964982703 1 1 3 4.0 964981247 2 1 6 4.0 964982224 3 1 47 5.0 964983815 4 1 50 5.0 964982931 ... ... ... ... ... 100831 610 166534 4.0 1493848402 100832 610 168248 5.0 1493850091 100833 610 168250 5.0 1494273047 100834 610 168252 5.0 1493846352 100835 610 170875 3.0 1493846415 [100836 rows x 4 columns]
1,links,movieId imdbId tmdbId 0 1 114709 862.0 1 2 113497 8844.0 2 3 113228 15602.0 3 4 114885 31357.0 4 5 113041 11862.0 ... ... ... ... 9737 193581 5476944 432131.0 9738 193583 5914996 445030.0 9739 193585 6397426 479308.0 9740 193587 8391976 483455.0 9741 193609 101726 37891.0 [9742 rows x 3 columns]
2,tags,userId movieId tag timestamp 0 2 60756 funny 1445714994 1 2 60756 Highly quotable 1445714996 2 2 60756 will ferrell 1445714992 3 2 89774 Boxing story 1445715207 4 2 89774 MMA 1445715200 ... ... ... ... ... 3678 606 7382 for katie 1171234019 3679 606 7936 austere 1173392334 3680 610 3265 gun fu 1493843984 3681 610 3265 heroic bloodshed 1493843978 3682 610 168248 Heroic Bloodshed 1493844270 [3683 rows x 4 columns]
3,movies,movieId title genres 0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1 2 Jumanji (1995) Adventure|Children|Fantasy 2 3 Grumpier Old Men (1995) Comedy|Romance 3 4 Waiting to Exhale (1995) Comedy|Drama|Romance 4 5 Father of the Bride Part II (1995) Comedy ... ... ... ... 9737 193581 Black Butler: Book of the Atlantic (2017) Action|Animation|Comedy|Fantasy 9738 193583 No Game No Life: Zero (2017) Animation|Comedy|Fantasy 9739 193585 Flint (2017) ...


### Connect To Database Engine - 
> * We use sqlite database engine to perform sql query
> * There are many packages to connect to sqlite engine such as - sqlite3, sqlalchemy etc.
> * We use sqlalchemy to connect to database engine. It works with many Relational Database Management Systems.
> * We will use pandas DataFrame class to represent the result of a sql query into a table form.

In [0]:
from sqlalchemy import create_engine

In [0]:
engine = create_engine("sqlite:///movie_review.sqlite")

In [13]:
# print the names of table in the database movie_review.sqlite
table_names = engine.table_names()
pd.DataFrame([table_names])

Unnamed: 0,0,1,2,3
0,links,movies,ratings,tags


> * Since there is no table in movie_review.sqlite database, we will create some tables and insert data of movie review into them.

> * https://sqlite.org/cli.html

In [0]:
# connect to database
con = engine.connect()

# create table one by one 
for table_name, table_data in db.items():    
    table_data.to_sql(table_name, con=engine, index=False,
                      if_exists='replace')        

In [15]:
# print the names of table in the database movie_review.sqlite
table_names = engine.table_names()
pd.DataFrame([table_names])

Unnamed: 0,0,1,2,3
0,links,movies,ratings,tags


## `GROUP BY` with `WHERE`

In [16]:
#@title **`HAVING` Condition - Syntax Comparison** { vertical-output: true, output-height: 250, form-width: "50%", display-mode: "form" }
dd = {'SQL' :['COUNT(movieId) >= 10', 'MAX(rating) = 10', 'MIN(rating) = 5', 'SUM(profit) > 1000',
              'AVG(profit) <= 500',  'COUNT(DISTINCT tag) = 5'],
      'Pandas' : ["lambda x: x['movieId'].count() >= 10","lambda x: x['rating'].max() = 10", "lambda x: x['rating'].min() = 5", 
                  "lambda x: x['profit'].sum() > 1000",  "lambda x: x['profit'].mean() <= 500",
 "lambda x: x['tag'].nunique() == 5"]}
pd.DataFrame(dd)

Unnamed: 0,SQL,Pandas
0,COUNT(movieId) >= 10,lambda x: x['movieId'].count() >= 10
1,MAX(rating) = 10,lambda x: x['rating'].max() = 10
2,MIN(rating) = 5,lambda x: x['rating'].min() = 5
3,SUM(profit) > 1000,lambda x: x['profit'].sum() > 1000
4,AVG(profit) <= 500,lambda x: x['profit'].mean() <= 500
5,COUNT(DISTINCT tag) = 5,lambda x: x['tag'].nunique() == 5


### Query 10: Find number of movies rated as at least 3.5 for each user.

> * **SQL** 

In [0]:
table_name = 'ratings'

In [18]:
rs = con.execute(f'''SELECT userId, COUNT(movieId) AS movieId_count
                     FROM {table_name}
                     WHERE rating >= 3.5
                     GROUP BY userId
                     ''')
pd.DataFrame(rs.fetchall(), columns=['userId', 'movieId_count']).set_index('userId').T

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
movieId_count,200,23,17,128,23,142,88,22,22,80,41,29,18,21,78,83,103,403,105,164,280,52,85,87,26,7,82,212,74,33,36,60,99,51,17,15,16,26,71,63,118,259,102,23,275,27,65,29,21,93,263,121,20,3,12,29,239,70,88,13,35,331,183,408,30,302,31,630,41,60,20,43,156,168,41,61,19,30,54,164,6,151,73,170,23,65,18,49,309,40,329,19,90,17,143,53,29,69,35,135,39,20,327,171,674,30,23,51,27,40,393,47,92,22,73,54,59,16,204,8,21,288,56,43,288,18,11,29,130,14,48,133,7,17,175,45,116,13,23,345,101,25,37,86,8,14,11,35,18,12,33,56,51,30,26,257,10,14,49,156,29,29,5,26,32,171,103,89,249,19,77,22,10,32,19,26,481,67,43,17,27,658,39,96,26,199,199,42,20,50,58,16,26,15,109,24,21,122,204,267,87,285,33,74,22,17,12,13,31,122,78,175,72,1,77,95,98,15,271,166,296,123,45,46,47,319,92,20,39,48,18,426,84,105,38,21,30,30,249,78,67,15,26,62,3,178,107,42,802,23,22,35,45,111,11,157,13,20,16,111,42,13,153,38,75,103,48,58,15,16,20,25,41,789,300,36,14,14,118,161,11,209,15,55,28,76,62,425,18,233,27,260,3,109,30,23,20,172,15,29,76,21,39,159,507,56,299,39,88,45,6,140,178,31,13,18,110,732,35,17,26,72,46,10,157,131,43,125,10,181,104,206,6,117,18,53,59,17,310,24,40,15,47,46,48,125,23,55,21,11,113,204,42,198,17,248,323,27,44,13,58,105,15,16,99,28,130,81,76,74,40,107,40,17,24,124,108,44,15,694,333,217,21,31,82,11,571,18,26,61,243,10,87,6,14,16,19,40,28,41,55,27,26,24,116,11,17,130,84,132,46,70,48,1459,84,28,61,72,125,123,32,48,16,107,210,61,39,90,42,37,6,198,15,175,41,29,74,373,19,27,44,32,27,58,27,52,593,23,43,20,194,229,35,23,27,25,44,23,78,18,292,26,91,98,103,13,14,266,33,23,26,25,1367,153,38,480,9,85,463,7,86,509,219,16,36,28,79,340,60,58,74,40,16,213,17,28,22,22,46,13,22,57,74,25,20,6,4,249,49,49,27,24,223,26,14,107,15,24,154,22,160,74,59,368,53,139,52,9,17,12,42,32,439,9,18,38,35,29,36,36,81,66,17,12,34,15,21,13,26,101,106,79,67,274,31,21,38,45,347,281,203,128,113,16,42,82,18,15,129,23,121,258,17,14,10,81,22,50,294,39,49,28,61,58,196,136,19,29,403,24,46,53,196,17,287,321,15,480,324,101,54,563,44,96,854,111,406,10,929


> * **Pandas** 

In [19]:
table = db[table_name]

FROM_CLAUSE = table

GROUPBY_COLUMNS = ['userId']
GROUPBY_CLAUSE = pd.DataFrame.groupby
WHERE_CLAUSE = (table['rating'] >= 3.5)
SELECT_WITH_AGG = {'movieId': 'count'}

rs = GROUPBY_CLAUSE(FROM_CLAUSE[WHERE_CLAUSE], GROUPBY_COLUMNS).agg(SELECT_WITH_AGG)                                                                                    
pd.DataFrame(rs).T

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
movieId,200,23,17,128,23,142,88,22,22,80,41,29,18,21,78,83,103,403,105,164,280,52,85,87,26,7,82,212,74,33,36,60,99,51,17,15,16,26,71,63,118,259,102,23,275,27,65,29,21,93,263,121,20,3,12,29,239,70,88,13,35,331,183,408,30,302,31,630,41,60,20,43,156,168,41,61,19,30,54,164,6,151,73,170,23,65,18,49,309,40,329,19,90,17,143,53,29,69,35,135,39,20,327,171,674,30,23,51,27,40,393,47,92,22,73,54,59,16,204,8,21,288,56,43,288,18,11,29,130,14,48,133,7,17,175,45,116,13,23,345,101,25,37,86,8,14,11,35,18,12,33,56,51,30,26,257,10,14,49,156,29,29,5,26,32,171,103,89,249,19,77,22,10,32,19,26,481,67,43,17,27,658,39,96,26,199,199,42,20,50,58,16,26,15,109,24,21,122,204,267,87,285,33,74,22,17,12,13,31,122,78,175,72,1,77,95,98,15,271,166,296,123,45,46,47,319,92,20,39,48,18,426,84,105,38,21,30,30,249,78,67,15,26,62,3,178,107,42,802,23,22,35,45,111,11,157,13,20,16,111,42,13,153,38,75,103,48,58,15,16,20,25,41,789,300,36,14,14,118,161,11,209,15,55,28,76,62,425,18,233,27,260,3,109,30,23,20,172,15,29,76,21,39,159,507,56,299,39,88,45,6,140,178,31,13,18,110,732,35,17,26,72,46,10,157,131,43,125,10,181,104,206,6,117,18,53,59,17,310,24,40,15,47,46,48,125,23,55,21,11,113,204,42,198,17,248,323,27,44,13,58,105,15,16,99,28,130,81,76,74,40,107,40,17,24,124,108,44,15,694,333,217,21,31,82,11,571,18,26,61,243,10,87,6,14,16,19,40,28,41,55,27,26,24,116,11,17,130,84,132,46,70,48,1459,84,28,61,72,125,123,32,48,16,107,210,61,39,90,42,37,6,198,15,175,41,29,74,373,19,27,44,32,27,58,27,52,593,23,43,20,194,229,35,23,27,25,44,23,78,18,292,26,91,98,103,13,14,266,33,23,26,25,1367,153,38,480,9,85,463,7,86,509,219,16,36,28,79,340,60,58,74,40,16,213,17,28,22,22,46,13,22,57,74,25,20,6,4,249,49,49,27,24,223,26,14,107,15,24,154,22,160,74,59,368,53,139,52,9,17,12,42,32,439,9,18,38,35,29,36,36,81,66,17,12,34,15,21,13,26,101,106,79,67,274,31,21,38,45,347,281,203,128,113,16,42,82,18,15,129,23,121,258,17,14,10,81,22,50,294,39,49,28,61,58,196,136,19,29,403,24,46,53,196,17,287,321,15,480,324,101,54,563,44,96,854,111,406,10,929


## `GROUP BY` with `HAVING`


### Query 11: Find all the movieIds tagged with more than 10 tags by at least one user.

> * **SQL** 

In [0]:
table_name = 'tags'

In [21]:
rs = con.execute(f'''SELECT movieId
                     FROM {table_name}
                     GROUP BY movieId, userId
                     HAVING COUNT(tag) > 10
                     ''')
pd.DataFrame(rs.fetchall(), columns=['movieId']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
movieId,293,296,924,1732,1921,2959,3676,4144,4878,5673,7361,68791,71899,79132,99114,135536


> * **Pandas** 

In [22]:
table = db[table_name]


SELECT_CLAUSE_with_DISTINCT = {'movieId': 'unique'}

FROM_CLAUSE = table

GROUPBY_COLUMNS = ['movieId', 'userId']
GROUPBY_CLAUSE = pd.DataFrame.groupby
HAVING_CLAUSE = lambda x: x['tag'].count() > 10 

rs = GROUPBY_CLAUSE(FROM_CLAUSE, GROUPBY_COLUMNS) \
                        .filter(HAVING_CLAUSE)['movieId'].unique()
                        

pd.DataFrame(rs)

Unnamed: 0,0
0,99114
1,135536
2,68791
3,79132
4,1921
5,3676
6,4144
7,4878
8,5673
9,7361


## `WHERE`, `GROUP BY`, `HAVING` - All Together


### Query 12: Find all the users who has tagged **at least two** movies with tag 'romance' or 'emotional'. Display userId and tags.

> * **SQL** 

In [0]:
table_name = 'tags'

In [24]:
rs = con.execute(f'''SELECT userId, tag
                     FROM {table_name}
                     WHERE tag = 'romance' OR tag = 'emotional' 
                     GROUP BY userId, tag
                     HAVING COUNT(movieId) >= 2
                     ''')
pd.DataFrame(rs.fetchall(), columns=['userId', 'tag'])

Unnamed: 0,userId,tag
0,62,romance
1,537,emotional
2,537,romance
3,567,emotional


> * **Pandas** 

In [25]:
table = db[table_name]
FROM_CLAUSE = table
DISTINCT = pd.DataFrame.drop_duplicates
SELECT_CLAUSE = ['userId', 'tag']
WHERE_CONDITION1 = (table['tag'] == 'romance')
WHERE_CONDITION2 = (table['tag'] == 'emotional')
WHERE_CLAUSE = (WHERE_CONDITION1 | WHERE_CONDITION2)
GROUPBY_COLUMNS = ['userId', 'tag']
GROUPBY_CLAUSE = pd.DataFrame.groupby
HAVING_CLAUSE = lambda x: x['movieId'].count() >= 2

rs = DISTINCT(GROUPBY_CLAUSE(FROM_CLAUSE[WHERE_CLAUSE], GROUPBY_COLUMNS) \
                     .filter(HAVING_CLAUSE)[SELECT_CLAUSE])

pd.DataFrame(rs)

Unnamed: 0,userId,tag
178,62,romance
2790,537,romance
2792,537,emotional
2891,567,emotional


### Query 13: Find all the movies which have been tagged by **at least two** users with tag 'suspense'.

> * **SQL** 

In [0]:
table_name = 'tags'

In [27]:
rs = con.execute(f'''SELECT  DISTINCT movieId, timestamp
                     FROM {table_name}
                     WHERE tag = 'suspense'
                     GROUP BY movieId
                     HAVING COUNT(DISTINCT userId) >= 2
                     ''')
pd.DataFrame(rs.fetchall(), columns=['unique_movieId', 'timestamp'])

Unnamed: 0,unique_movieId,timestamp
0,48516,1457843184
1,105504,1525286930


> * **Pandas** 

In [28]:
table = db[table_name]
FROM_CLAUSE = table
 
SELECT_CLAUSE = ['movieId', 'timestamp']
WHERE_CONDITION = (table['tag'] == 'suspense')
WHERE_CLAUSE = WHERE_CONDITION
GROUPBY_COLUMNS = ['movieId']
GROUPBY_CLAUSE = pd.DataFrame.groupby
HAVING_CLAUSE = lambda x: x['userId'].nunique() >= 2

rs = GROUPBY_CLAUSE(FROM_CLAUSE[WHERE_CLAUSE], GROUPBY_COLUMNS) \
                        .filter(HAVING_CLAUSE)[SELECT_CLAUSE]
                        
pd.DataFrame(rs)

Unnamed: 0,movieId,timestamp
675,48516,1348627152
902,48516,1457843184
2882,105504,1424141149
3192,105504,1525286930


### Query 14: Find number of all the movies which have been tagged by **at least two** users with tag 'suspense'.

> * **SQL** 

In [0]:
table_name = 'tags'

In [30]:
rs = con.execute(f'''SELECT SUM(movie_count) AS num_movies
                     FROM (SELECT COUNT (DISTINCT movieId) AS movie_count
                           FROM {table_name}
                           WHERE tag = 'suspense'
                           GROUP BY movieId
                           HAVING COUNT(DISTINCT userId) >= 2)''')
pd.DataFrame(rs.fetchall(), columns=['num_movies'])

Unnamed: 0,num_movies
0,2


> * **Pandas** 

In [31]:
table = db[table_name]
FROM_CLAUSE = table
 
SELECT_CLAUSE = {'movieId': 'nunique'}
WHERE_CONDITION = (table['tag'] == 'suspense')
WHERE_CLAUSE = WHERE_CONDITION
GROUPBY_COLUMNS = ['movieId']
GROUPBY_CLAUSE = pd.DataFrame.groupby
HAVING_CLAUSE = lambda x: x['userId'].nunique() >= 2

rs = GROUPBY_CLAUSE(FROM_CLAUSE[WHERE_CLAUSE], GROUPBY_COLUMNS) \
                        .filter(HAVING_CLAUSE).agg(SELECT_CLAUSE)
                        
pd.DataFrame(rs.values, columns=['num_movies'])

Unnamed: 0,num_movies
0,2


## **`count` & `nunique`**

In [32]:
#@title **Sample `tags` Data** { vertical-output: true }
df1 = pd.DataFrame({'userId': [1, 1, 2, 3, 4, 5], 'movieId': ['m1',  'm1', 'm1', 'm2', 'm2', 'm2'], 'tag': ['t1', 't2', 't3', 't4', 't4', 't3']})
df1

Unnamed: 0,userId,movieId,tag
0,1,m1,t1
1,1,m1,t2
2,2,m1,t3
3,3,m2,t4
4,4,m2,t4
5,5,m2,t3


In [33]:
#@title **unique_movie_count - `count` & `nunique`** { vertical-output: true }
rs1 = df1.groupby(['movieId']).filter(lambda x: x['userId'].count() >= 3) \
                              .agg({'movieId': 'count'}) 
rs2 = df1.groupby(['movieId']).filter(lambda x: x['userId'].nunique() >= 3) \
                              .agg({'movieId': 'count'}) 
rs3 = df1.groupby(['movieId']).filter(lambda x: x['userId'].count() >= 3) \
                              .agg({'movieId': 'nunique'}) 
rs4 = df1.groupby(['movieId']).filter(lambda x: x['userId'].nunique() >= 3) \
                              .agg({'movieId': 'nunique'}) 

pd.DataFrame({'SELECT': ['count', 'nunique', 'count', 'nunique'], 
              'HAVING': ['count', 'count', 'nunique', 'nunique'],
              'unique_movie_count': 
              [rs1.values[0], rs2.values[0], rs3.values[0], rs4.values[0]]}) \
              .set_index(['SELECT', 'HAVING'])

Unnamed: 0_level_0,Unnamed: 1_level_0,unique_movie_count
SELECT,HAVING,Unnamed: 2_level_1
count,count,6
nunique,count,3
count,nunique,2
nunique,nunique,1


In [34]:
for i, j in db.items():
    print(i)
    print(j.shape, j.drop_duplicates().shape)

ratings
(100836, 4) (100836, 4)
links
(9742, 3) (9742, 3)
tags
(3683, 4) (3683, 4)
movies
(9742, 3) (9742, 3)


In [35]:
#@title **`COUNTING` - Syntax Comparison** { vertical-output: true, output-height: 250, form-width: "50%", display-mode: "form" }
dd = {'SQL' :['COUNT (column_name)', 'COUNT (DISTINCT column_name)', 'DISTINCT column_name'],
      'Pandas' : ["df[column_name].count()","df[column_name].nunique()", "df[column_name].unique()"]}
pd.DataFrame(dd)






Unnamed: 0,SQL,Pandas
0,COUNT (column_name),df[column_name].count()
1,COUNT (DISTINCT column_name),df[column_name].nunique()
2,DISTINCT column_name,df[column_name].unique()
