### Download the Data
http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [1]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=6f3849127931ed0b728573e7a9fc0b45371d7cd5c2c89877243055f6655da8da
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
import wget
fn = wget.download('http://files.grouplens.org/datasets/movielens/ml-latest-small.zip')
fn

'ml-latest-small.zip'

In [3]:
!unzip ml-latest*

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


### Basic Info About Data

In [0]:
PATH = 'ml-latest-small'

In [5]:
!find $PATH -name '*.csv' | xargs wc -l | sort -nr

 124007 total
 100837 ml-latest-small/ratings.csv
   9743 ml-latest-small/movies.csv
   9743 ml-latest-small/links.csv
   3684 ml-latest-small/tags.csv


In [6]:
# find which seperator is used to seperate the columns of each csv file
!head -1 $PATH/*.csv


==> ml-latest-small/links.csv <==
movieId,imdbId,tmdbId

==> ml-latest-small/movies.csv <==
movieId,title,genres

==> ml-latest-small/ratings.csv <==
userId,movieId,rating,timestamp

==> ml-latest-small/tags.csv <==
userId,movieId,tag,timestamp


> * We can see that all files are comma-seperated.

### Read the Dataset

In [0]:
import pandas as pd
import numpy as np
import re
import os

from collections import defaultdict

In [0]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 1000)

In [0]:
db = defaultdict(pd.DataFrame)

for filename in os.listdir(PATH):
    if filename.endswith('.csv'):
        db[filename.split('.')[0]] = pd.read_csv(f"{PATH}/{filename}")
        

In [10]:
pd.DataFrame(db.items(), columns=['table_name', 'table_data'])

Unnamed: 0,table_name,table_data
0,tags,userId movieId tag timestamp 0 2 60756 funny 1445714994 1 2 60756 Highly quotable 1445714996 2 2 60756 will ferrell 1445714992 3 2 89774 Boxing story 1445715207 4 2 89774 MMA 1445715200 ... ... ... ... ... 3678 606 7382 for katie 1171234019 3679 606 7936 austere 1173392334 3680 610 3265 gun fu 1493843984 3681 610 3265 heroic bloodshed 1493843978 3682 610 168248 Heroic Bloodshed 1493844270 [3683 rows x 4 columns]
1,movies,movieId title genres 0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1 2 Jumanji (1995) Adventure|Children|Fantasy 2 3 Grumpier Old Men (1995) Comedy|Romance 3 4 Waiting to Exhale (1995) Comedy|Drama|Romance 4 5 Father of the Bride Part II (1995) Comedy ... ... ... ... 9737 193581 Black Butler: Book of the Atlantic (2017) Action|Animation|Comedy|Fantasy 9738 193583 No Game No Life: Zero (2017) Animation|Comedy|Fantasy 9739 193585 Flint (2017) ...
2,links,movieId imdbId tmdbId 0 1 114709 862.0 1 2 113497 8844.0 2 3 113228 15602.0 3 4 114885 31357.0 4 5 113041 11862.0 ... ... ... ... 9737 193581 5476944 432131.0 9738 193583 5914996 445030.0 9739 193585 6397426 479308.0 9740 193587 8391976 483455.0 9741 193609 101726 37891.0 [9742 rows x 3 columns]
3,ratings,userId movieId rating timestamp 0 1 1 4.0 964982703 1 1 3 4.0 964981247 2 1 6 4.0 964982224 3 1 47 5.0 964983815 4 1 50 5.0 964982931 ... ... ... ... ... 100831 610 166534 4.0 1493848402 100832 610 168248 5.0 1493850091 100833 610 168250 5.0 1494273047 100834 610 168252 5.0 1493846352 100835 610 170875 3.0 1493846415 [100836 rows x 4 columns]


### Connect To Database Engine - 
> * We use sqlite database engine to perform sql query
> * There are many packages to connect to sqlite engine such as - sqlite3, sqlalchemy etc.
> * We use sqlalchemy to connect to database engine. It works with many Relational Database Management Systems.
> * We will use pandas DataFrame class to represent the result of a sql query into a table form.

In [0]:
from sqlalchemy import create_engine

In [0]:
engine = create_engine("sqlite:///movie_review.sqlite")

In [13]:
# print the names of table in the database movie_review.sqlite
table_names = engine.table_names()
pd.DataFrame([table_names])

0


> * Since there is no table in movie_review.sqlite database, we will create some tables and insert data of movie review into them.

> * https://sqlite.org/cli.html

In [0]:
# connect to database
con = engine.connect()

# create table one by one 
for table_name, table_data in db.items():    
    table_data.to_sql(table_name, con=engine, index=False,
                      if_exists='replace')        

In [15]:
# print the names of table in the database movie_review.sqlite
table_names = engine.table_names()
pd.DataFrame([table_names])

Unnamed: 0,0,1,2,3
0,links,movies,ratings,tags


In [16]:
db['movies'].tail(10)

Unnamed: 0,movieId,title,genres
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
9733,193567,anohana: The Flower We Saw That Day - The Movie (2013),Animation|Drama
9734,193571,Silver Spoon (2014),Comedy|Drama
9735,193573,Love Live! The School Idol Movie (2015),Animation
9736,193579,Jon Stewart Has Left the Building (2015),Documentary
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


### Query 15: Find all the movie titles released in year 2018

> * **SQL** 

In [0]:
table_name = 'movies'

In [18]:
rs = con.execute(f'''SELECT title
                     FROM {table_name}
                     WHERE title LIKE "%2018_"
                     ''')

pd.DataFrame(rs.fetchall(), columns=['title']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
title,Avengers: Infinity War - Part I (2018),Annihilation (2018),The Commuter (2018),Insidious: The Last Key (2018),Game Night (2018),Maze Runner: The Death Cure (2018),Isle of Dogs (2018),The Clapper (2018),Tom Segura: Disgraceful (2018),When We First Met (2018),The Cloverfield Paradox (2018),Tomb Raider (2018),Fred Armisen: Standup for Drummers (2018),Death Wish (2018),A Wrinkle in Time (2018),"Love, Simon (2018)",A Quiet Place (2018),Alpha (2018),I Kill Giants (2018),"Game Over, Man! (2018)",Blockers (2018),Pacific Rim: Uprising (2018),Rampage (2018),Jurassic World: Fallen Kingdom (2018),Incredibles 2 (2018),Deadpool 2 (2018),Solo: A Star Wars Story (2018),Won't You Be My Neighbor? (2018),Sorry to Bother You (2018),Ant-Man and the Wasp (2018),Dogman (2018),Mamma Mia: Here We Go Again! (2018),Tag (2018),The Man Who Killed Don Quixote (2018),Boundaries (2018),Spiral (2018),Mission: Impossible - Fallout (2018),SuperFly (2018),BlacKkKlansman (2018),The Darkest Minds (2018),Bungo Stray Dogs: Dead Apple (2018)


> * **Pandas** 

In [19]:
table = db[table_name]
FROM_CLAUSE = table

COLUMN = table['title']
COLUMN_LIKE = COLUMN.str.contains
PATTERN = r'(2018.)$'
WHERE_CLAUSE = COLUMN_LIKE(PATTERN)

SELECT_CLAUSE = ['title']

# db[table_name][(db[table_name]['title'].str.contains(r'(2018.)$'))][['title']]

rs = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
pd.DataFrame(rs,  columns=['title']).T

  return func(self, *args, **kwargs)


Unnamed: 0,8693,9668,9674,9678,9681,9682,9683,9684,9685,9686,9689,9692,9695,9696,9697,9698,9699,9700,9701,9703,9704,9705,9706,9707,9708,9709,9710,9711,9712,9713,9714,9715,9716,9717,9718,9719,9720,9721,9723,9724,9740
title,Avengers: Infinity War - Part I (2018),Annihilation (2018),The Commuter (2018),Insidious: The Last Key (2018),Game Night (2018),Maze Runner: The Death Cure (2018),Isle of Dogs (2018),The Clapper (2018),Tom Segura: Disgraceful (2018),When We First Met (2018),The Cloverfield Paradox (2018),Tomb Raider (2018),Fred Armisen: Standup for Drummers (2018),Death Wish (2018),A Wrinkle in Time (2018),"Love, Simon (2018)",A Quiet Place (2018),Alpha (2018),I Kill Giants (2018),"Game Over, Man! (2018)",Blockers (2018),Pacific Rim: Uprising (2018),Rampage (2018),Jurassic World: Fallen Kingdom (2018),Incredibles 2 (2018),Deadpool 2 (2018),Solo: A Star Wars Story (2018),Won't You Be My Neighbor? (2018),Sorry to Bother You (2018),Ant-Man and the Wasp (2018),Dogman (2018),Mamma Mia: Here We Go Again! (2018),Tag (2018),The Man Who Killed Don Quixote (2018),Boundaries (2018),Spiral (2018),Mission: Impossible - Fallout (2018),SuperFly (2018),BlacKkKlansman (2018),The Darkest Minds (2018),Bungo Stray Dogs: Dead Apple (2018)


In [20]:
#@title **`LIKE` Operator Analogy** { vertical-output: true, display-mode: "both" }
res = {'SQL': ['title', 'LIKE', 'title LIKE', '"%2018_"'],
                                  'Pandas': ["table['title']", 
                                             'pd.Series.str.contains',
                                             "table['title'].str.contains",
                                             "r'(2018.)$'"
                                             ]}
pd.DataFrame(res)

Unnamed: 0,SQL,Pandas
0,title,table['title']
1,LIKE,pd.Series.str.contains
2,title LIKE,table['title'].str.contains
3,"""%2018_""",r'(2018.)$'


### Query 16: Find all the movies titles released in year 2018 and from at least 'Action' genre.

> * **SQL** 

In [0]:
table_name = 'movies'

In [22]:
rs = con.execute(f'''SELECT title
                     FROM {table_name}
                     WHERE title LIKE "%2018_" AND genres LIKE "%ACTION%"
                     ''')

pd.DataFrame(rs.fetchall(), columns=['title']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
title,Avengers: Infinity War - Part I (2018),Game Night (2018),Maze Runner: The Death Cure (2018),Tomb Raider (2018),Death Wish (2018),"Game Over, Man! (2018)",Pacific Rim: Uprising (2018),Rampage (2018),Jurassic World: Fallen Kingdom (2018),Incredibles 2 (2018),Deadpool 2 (2018),Solo: A Star Wars Story (2018),Ant-Man and the Wasp (2018),Mission: Impossible - Fallout (2018),SuperFly (2018),Bungo Stray Dogs: Dead Apple (2018)


> * **Pandas** 

In [23]:
table = db[table_name]
FROM_CLAUSE = table

COLUMN1 = table['title']
COLUMN2 = table['genres']
COLUMN_LIKE1 = COLUMN1.str.contains
COLUMN_LIKE2 = COLUMN2.str.contains

PATTERN1 = r'(2018.)$'
PATTERN2 = r'Action'
WHERE_CLAUSE = (COLUMN_LIKE1(PATTERN1) & COLUMN_LIKE2(PATTERN2))

SELECT_CLAUSE = ['title']

# db[table_name][(db[table_name]['title'].str.contains(r'(2018.)$') & \
#                 db[table_name]['title'].str.contains( r'Action'))] \
#               [['title']]

rs = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
pd.DataFrame(rs,  columns=['title']).T

  return func(self, *args, **kwargs)


Unnamed: 0,8693,9681,9682,9692,9696,9703,9705,9706,9707,9708,9709,9710,9713,9720,9721,9740
title,Avengers: Infinity War - Part I (2018),Game Night (2018),Maze Runner: The Death Cure (2018),Tomb Raider (2018),Death Wish (2018),"Game Over, Man! (2018)",Pacific Rim: Uprising (2018),Rampage (2018),Jurassic World: Fallen Kingdom (2018),Incredibles 2 (2018),Deadpool 2 (2018),Solo: A Star Wars Story (2018),Ant-Man and the Wasp (2018),Mission: Impossible - Fallout (2018),SuperFly (2018),Bungo Stray Dogs: Dead Apple (2018)


> * **Points to be remembered**
1. In *SQLite*,  the LIKE operator is case sensitive by default for Unicode characters that are beyond the ASCII range. For example, the expression 'a' LIKE 'A' is TRUE but 'æ' LIKE 'Æ' is FALSE.)
2. The GLOB operator is similar to LIKE but uses the Unix file globbing syntax for its wildcards. Also, GLOB is case sensitive, unlike LIKE. Both GLOB and LIKE may be preceded by the NOT keyword to invert the sense of the test.

> https://www.w3resource.com/sqlite/like-operator.php 

### Query 17: Find all the movies titles without year and their genres released in year 2018 and NOT from 'Action' genre.

> * **SQL** 

In [0]:
table_name = 'movies'

In [25]:
rs = con.execute(f'''SELECT TRIM(REPLACE(title, "(2018)", "")), genres
                     FROM {table_name}
                     WHERE title LIKE "%2018_" AND genres NOT LIKE "%Action%"
                     ''')

pd.DataFrame(rs.fetchall(), columns=['title', 'genres']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
title,Annihilation,The Commuter,Insidious: The Last Key,Isle of Dogs,The Clapper,Tom Segura: Disgraceful,When We First Met,The Cloverfield Paradox,Fred Armisen: Standup for Drummers,A Wrinkle in Time,"Love, Simon",A Quiet Place,Alpha,I Kill Giants,Blockers,Won't You Be My Neighbor?,Sorry to Bother You,Dogman,Mamma Mia: Here We Go Again!,Tag,The Man Who Killed Don Quixote,Boundaries,Spiral,BlacKkKlansman,The Darkest Minds
genres,Adventure|Mystery|Sci-Fi|Thriller,Crime|Drama|Mystery|Thriller,Horror|Mystery|Thriller,Animation|Comedy,Comedy,Comedy,Comedy,Horror|Mystery|Sci-Fi|Thriller,Comedy,Adventure|Children|Fantasy|Sci-Fi,Comedy|Drama,Drama|Horror|Thriller,Adventure|Thriller,Drama|Fantasy|Thriller,Comedy,Documentary,Comedy|Fantasy|Sci-Fi,Crime|Drama,Comedy|Romance,Comedy,Adventure|Comedy|Fantasy,Comedy|Drama,Documentary,Comedy|Crime|Drama,Sci-Fi|Thriller


In [26]:
db['movies'].head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


> * **Pandas** 

In [27]:
table = db[table_name]
FROM_CLAUSE = table

COLUMN1, COLUMN2 = table['title'], table['genres'] 
COLUMN_LIKE1, COLUMN_LIKE2 = COLUMN1.str.contains, COLUMN2.str.contains

PATTERN1, PATTERN2 = r'(2018.)$',  r'Action'

WHERE_CONDITION1, WHERE_CONDITION2 = COLUMN_LIKE1(PATTERN1), ~ COLUMN_LIKE2(PATTERN2)
WHERE_CLAUSE = (WHERE_CONDITION1 & WHERE_CONDITION2)
SELECT_CLAUSE = ['title', 'genres']

rs = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
title_without_year = rs['title'].str.replace(r'(.2018.)$', '').str.strip(' ')
genres = rs['genres']

"""rs = db['movies'][(db['movies']['title'].str.contains(PATTERN1)) & \
                    (~ db['movies']['genres'].str.contains(PATTERN2))]
df = pd.DataFrame([rs['title'].str.replace(r'(.2018.)$', '').str.strip(' '),
                rs['genres']])"""

pd.DataFrame([title_without_year, genres])

  return func(self, *args, **kwargs)


Unnamed: 0,9668,9674,9678,9683,9684,9685,9686,9689,9695,9697,9698,9699,9700,9701,9704,9711,9712,9714,9715,9716,9717,9718,9719,9723,9724
title,Annihilation,The Commuter,Insidious: The Last Key,Isle of Dogs,The Clapper,Tom Segura: Disgraceful,When We First Met,The Cloverfield Paradox,Fred Armisen: Standup for Drummers,A Wrinkle in Time,"Love, Simon",A Quiet Place,Alpha,I Kill Giants,Blockers,Won't You Be My Neighbor?,Sorry to Bother You,Dogman,Mamma Mia: Here We Go Again!,Tag,The Man Who Killed Don Quixote,Boundaries,Spiral,BlacKkKlansman,The Darkest Minds
genres,Adventure|Mystery|Sci-Fi|Thriller,Crime|Drama|Mystery|Thriller,Horror|Mystery|Thriller,Animation|Comedy,Comedy,Comedy,Comedy,Horror|Mystery|Sci-Fi|Thriller,Comedy,Adventure|Children|Fantasy|Sci-Fi,Comedy|Drama,Drama|Horror|Thriller,Adventure|Thriller,Drama|Fantasy|Thriller,Comedy,Documentary,Comedy|Fantasy|Sci-Fi,Crime|Drama,Comedy|Romance,Comedy,Adventure|Comedy|Fantasy,Comedy|Drama,Documentary,Comedy|Crime|Drama,Sci-Fi|Thriller


In [28]:
res = {'SQL': ['REPLACE()', 'TRIM()', 'NOT LIKE'],
       'Pandas': ["pd.Series.str.replace()", 
                  'pd.Series.str.strip()', "~ pd.Series.str.contains()" ]}
                                           
pd.DataFrame(res)

Unnamed: 0,SQL,Pandas
0,REPLACE(),pd.Series.str.replace()
1,TRIM(),pd.Series.str.strip()
2,NOT LIKE,~ pd.Series.str.contains()


### Query 18: Find all the releases of 'Pirates of the Caribbean' using `INSTR()`

> * **SQL** 

In [0]:
table_name = 'movies'

In [30]:
rs = con.execute(f'''SELECT title
                     FROM {table_name}         
                     WHERE INSTR(LOWER(title), 
                           LOWER("Pirates of the Caribbean")) > 0                 
                     ''')

pd.DataFrame(rs.fetchall(), columns=['title'])

Unnamed: 0,title
0,Pirates of the Caribbean: The Curse of the Black Pearl (2003)
1,Pirates of the Caribbean: Dead Man's Chest (2006)
2,Pirates of the Caribbean: At World's End (2007)
3,Pirates of the Caribbean: On Stranger Tides (2011)
4,Pirates of the Caribbean: Dead Men Tell No Tales (2017)


In [31]:
rs = con.execute(f'''SELECT INSTR('SQLite Tutorial','lutorial') position''').fetchall()
rs

[(0,)]

> * **Pandas** 

In [32]:
table = db[table_name]
FROM_CLAUSE = table

COLUMN = table['title']
COLUMN_LOWER = COLUMN.str.lower()
COLUMN_LOWER_INSTR = COLUMN_LOWER.str.find
LOWER_STRING = 'Pirates of the Caribbean'.lower()

WHERE_CLAUSE = (COLUMN_LOWER_INSTR(LOWER_STRING) >= 0)

SELECT_CLAUSE = ['title']

# db['movies'][db['movies']['title'].str.lower().str.find( \
#                    'Pirates of the Caribbean'.lower()) >= 0][['title']]
rs = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]

pd.DataFrame(rs)

Unnamed: 0,title
4427,Pirates of the Caribbean: The Curse of the Black Pearl (2003)
6221,Pirates of the Caribbean: Dead Man's Chest (2006)
6488,Pirates of the Caribbean: At World's End (2007)
7608,Pirates of the Caribbean: On Stranger Tides (2011)
8687,Pirates of the Caribbean: Dead Men Tell No Tales (2017)


* **Certain Points to be Considered**
    1. We can get index of a substring using both pd.Series.str.index() and pd.Series.str.find(). The difference is that the former will return **ValueError: substring not found** and later will return **-1** if substring is not found.
    2. SQL index will start from 1 and Pandas index will start from 0.
    > **SQL** - `INSTR('ABC', 'A')` -> 1

    > **Python** - `pd.Series(['ABC']).str.index('A')` -> 0

### Query 19: Find all the tag of length greater than 40 using `LENGTH`

> * **SQL** 

In [0]:
table_name = 'tags'

In [34]:
rs = con.execute(f'''SELECT DISTINCT tag, LENGTH(tag) AS length
                     FROM {table_name} 
                     WHERE LENGTH(tag) > 40 
                     ORDER BY length DESC
                     ''')

pd.DataFrame(rs.fetchall(), columns=['tag', 'length'])

Unnamed: 0,tag,length
0,Something for everyone in this one... saw it without and plan on seeing it with kids!,85
1,the catholic church is the most corrupt organization in history,63
2,villain nonexistent or not needed for good story,48
3,r:disturbing violent content including rape,43
4,06 Oscar Nominated Best Movie - Animation,41


> * **Pandas** 

In [35]:
table = db[table_name]
FROM_CLAUSE = table

WHERE_CONDITION = (table['tag'].str.len() > 40)
WHERE_CLAUSE = WHERE_CONDITION

DISTINCT = pd.Series.unique

tag = pd.Series(DISTINCT(FROM_CLAUSE[WHERE_CLAUSE]['tag']))

# find the length of each tag
tag_len = tag.str.len()
rs = pd.DataFrame({'tag': tag, 'length': tag_len})

# sorting
ORDER_BY = pd.DataFrame.sort_values
rs = ORDER_BY(rs, 'length', ascending=False)
rs

Unnamed: 0,tag,length
0,Something for everyone in this one... saw it without and plan on seeing it with kids!,85
1,the catholic church is the most corrupt organization in history,63
3,villain nonexistent or not needed for good story,48
4,r:disturbing violent content including rape,43
2,06 Oscar Nominated Best Movie - Animation,41


In [36]:
tag = pd.Series.unique(db['tags'][db['tags']['tag'].str.len() > 40]['tag'])
tag_len = list(map(len, tag))
pd.DataFrame({'tag': tag, 'length': tag_len}).sort_values('length', 
                                                          ascending=False)

Unnamed: 0,tag,length
0,Something for everyone in this one... saw it without and plan on seeing it with kids!,85
1,the catholic church is the most corrupt organization in history,63
3,villain nonexistent or not needed for good story,48
4,r:disturbing violent content including rape,43
2,06 Oscar Nominated Best Movie - Animation,41


### Query 20: Display all parts of 'Mission Impossible' movie series. - `SUBSTR`

> * **SQL** 

In [0]:
table_name = 'movies'

In [38]:
# -(SELECT MAX(LENGTH(title)))+6
rs = con.execute(f'''SELECT SUBSTR(title, -6, (SELECT 6-MAX(LENGTH(title)) FROM {table_name}))
                    FROM {table_name} ''')
pd.DataFrame(rs.fetchall())

Unnamed: 0,0
0,Toy Story
1,Jumanji
2,Grumpier Old Men
3,Waiting to Exhale
4,Father of the Bride Part II
...,...
9737,Black Butler: Book of the Atlantic
9738,No Game No Life: Zero
9739,Flint
9740,Bungo Stray Dogs: Dead Apple


In [39]:
rs = con.execute(f'''SELECT SUBSTR(title, LENGTH("Mission: Impossible")+1)
                     FROM {table_name}    
                     WHERE title LIKE "Mission: Impossible%"   
                     ''')

pd.DataFrame(rs.fetchall(), columns=['part'])

Unnamed: 0,part
0,(1996)
1,II (2000)
2,III (2006)
3,- Ghost Protocol (2011)
4,- Rogue Nation (2015)
5,- Fallout (2018)


> * **Pandas** 

In [40]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = 'title'

WHERE_CONDITION = (table['title'].str.contains("Mission: Impossible"))
WHERE_CLAUSE = WHERE_CONDITION

title = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]

# find the part of the movie
parts = title.str.slice(len("Mission: Impossible"))
pd.DataFrame({'part': parts})

Unnamed: 0,part
546,(1996)
2700,II (2000)
6199,III (2006)
7774,- Ghost Protocol (2011)
8439,- Rogue Nation (2015)
9720,- Fallout (2018)



---
---

In [41]:
#@title **SQL and Pandas - Syntax Analogy** { vertical-output: true, display-mode: "both" }
res = {'SQL': ['INSTR', 'LENGTH', 'SUBSTR'],
                                  'Pandas': ["pd.Series.find()", 
                                             'pd.Series.str.len()',
                                             "pd.Series.str.slice()"                                             
                                             ]}
pd.DataFrame(res)

Unnamed: 0,SQL,Pandas
0,INSTR,pd.Series.find()
1,LENGTH,pd.Series.str.len()
2,SUBSTR,pd.Series.str.slice()


### Query 21:  Find all the movies title that `starts with` 'No '.

> * **SQL** 

In [0]:
table_name = 'movies'

In [43]:
rs = con.execute(f'''SELECT title
                     FROM {table_name}    
                     WHERE title LIKE "No %"   
                     ''')

pd.DataFrame(rs.fetchall(), columns=['title']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
title,No Escape (1994),No Small Affair (1984),No Mercy (1986),No Way Out (1987),No Man's Land (1987),No Holds Barred (1989),No Man's Land (2001),No Such Thing (2001),No Direction Home: Bob Dylan (2005),No Reservations (2007),No End in Sight (2007),No Country for Old Men (2007),No Strings Attached (2011),No Way Jose (2015),No Game No Life: Zero (2017)


> * **Pandas** 

In [44]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = ['title']

WHERE_CONDITION = (table['title'].str.startswith("No "))
WHERE_CLAUSE = WHERE_CONDITION

# db['movies'][db['movies']['title'].str.startswith('No ')][['title']].T

res = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
res.T

Unnamed: 0,439,1677,2062,2620,3083,3404,3618,3772,6022,6532,6565,6613,7522,8897,9738
title,No Escape (1994),No Small Affair (1984),No Mercy (1986),No Way Out (1987),No Man's Land (1987),No Holds Barred (1989),No Man's Land (2001),No Such Thing (2001),No Direction Home: Bob Dylan (2005),No Reservations (2007),No End in Sight (2007),No Country for Old Men (2007),No Strings Attached (2011),No Way Jose (2015),No Game No Life: Zero (2017)


In [45]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = ['title']

WHERE_CONDITION = (table['title'].str.contains("^(No )"))
WHERE_CLAUSE = WHERE_CONDITION
res = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
res.T

  return func(self, *args, **kwargs)


Unnamed: 0,439,1677,2062,2620,3083,3404,3618,3772,6022,6532,6565,6613,7522,8897,9738
title,No Escape (1994),No Small Affair (1984),No Mercy (1986),No Way Out (1987),No Man's Land (1987),No Holds Barred (1989),No Man's Land (2001),No Such Thing (2001),No Direction Home: Bob Dylan (2005),No Reservations (2007),No End in Sight (2007),No Country for Old Men (2007),No Strings Attached (2011),No Way Jose (2015),No Game No Life: Zero (2017)


### Query 22: Find all the tags that `ends with` 'ood'.

> * **SQL** 

In [0]:
table_name = 'tags'

In [47]:
rs = con.execute(f'''SELECT tag
                     FROM {table_name}    
                     WHERE tag LIKE "%ood"   
                     ''')
pd.DataFrame(rs.fetchall(), columns=['tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
tag,feel-good,Hollywood,Hollywood,food,food,motherhood,fatherhood,Hollywood,sisterhood,motherhood,flood,fatherhood,Food,food,parenthood,fatherhood,motherhood,fatherhood,feel-good,feel-good,feel-good,good,blood


> * **Pandas** 

In [48]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = ['tag']

WHERE_CONDITION = (table['tag'].str.endswith("ood"))    
WHERE_CLAUSE = WHERE_CONDITION
res = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
res.T

Unnamed: 0,493,992,1216,1254,1634,1653,1747,1935,1999,2064,2152,2172,2190,2352,2384,2418,2431,2441,3074,3188,3202,3325,3402
tag,feel-good,Hollywood,Hollywood,food,food,motherhood,fatherhood,Hollywood,sisterhood,motherhood,flood,fatherhood,Food,food,parenthood,fatherhood,motherhood,fatherhood,feel-good,feel-good,feel-good,good,blood


In [49]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = ['tag']

WHERE_CONDITION = (table['tag'].str.contains("(ood)$"))
WHERE_CLAUSE = WHERE_CONDITION
res = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
res.T

  return func(self, *args, **kwargs)


Unnamed: 0,493,992,1216,1254,1634,1653,1747,1935,1999,2064,2152,2172,2190,2352,2384,2418,2431,2441,3074,3188,3202,3325,3402
tag,feel-good,Hollywood,Hollywood,food,food,motherhood,fatherhood,Hollywood,sisterhood,motherhood,flood,fatherhood,Food,food,parenthood,fatherhood,motherhood,fatherhood,feel-good,feel-good,feel-good,good,blood


### Query 23:  Find all the movies title that `ends with` 'wood'.

> * **SQL** 
> * In SQLite, we don't have any builtin function like REGEX_REPLACE in Postgresql. Hence we cannot perform this query in sqlite.

> * **Pandas** 

In [0]:
table_name = 'movies'

In [51]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = ['title']

WHERE_CONDITION = (table['title'].str.slice(start=0, stop=-7).str.endswith("wood"))
WHERE_CLAUSE = WHERE_CONDITION
# db['movies'][db['movies']['title'].str.slice(start=0, stop=-7).str.endswith('wood')][['title']].T
res = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
res.T

Unnamed: 0,416,1123,1998,3518,3995,4520,4770,8762,8936
title,Jimmy Hollywood (1994),Rosewood (1997),It Came from Hollywood (1982),Silkwood (1983),Welcome to Collinwood (2002),Bollywood/Hollywood (2002),Doc Hollywood (1991),Wyrmwood (2015),Scooby-Doo Goes Hollywood (1979)


In [52]:
db['movies'][db['movies']['title'].str.slice(start=0, stop=-7).str.endswith('wood')][['title']].T

Unnamed: 0,416,1123,1998,3518,3995,4520,4770,8762,8936
title,Jimmy Hollywood (1994),Rosewood (1997),It Came from Hollywood (1982),Silkwood (1983),Welcome to Collinwood (2002),Bollywood/Hollywood (2002),Doc Hollywood (1991),Wyrmwood (2015),Scooby-Doo Goes Hollywood (1979)


### Query 24: Find all the tags that `starts with` 'g' and `ends with` 'ing'.

> * **SQL** 

In [0]:
table_name = 'tags'

In [54]:
rs = con.execute(f'''SELECT tag
                     FROM {table_name}    
                     WHERE tag LIKE "g%ing"   
                     ''')
pd.DataFrame(rs.fetchall(), columns=['tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
tag,good writing,great ending,great ending,gambling,golfing,gambling,gambling,gambling,Gambling,gambling,great acting,great acting,great acting


> * **Pandas** 

In [55]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = ['tag']

STARTSWITH_CONDITION = (db['tags']['tag'].str.startswith('g'))
ENDSWITH_CONDITION = (db['tags']['tag'].str.endswith('ing'))

WHERE_CLAUSE = (STARTSWITH_CONDITION & ENDSWITH_CONDITION)
# db['tags'][(db['tags']['tag'].str.startswith('g')) & (db['tags']['tag'].str.endswith('ing'))][['tag']].T
res = FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE]
res.T

Unnamed: 0,373,457,687,1098,1651,1688,1703,1709,2122,3361,3455,3642
tag,good writing,great ending,great ending,gambling,golfing,gambling,gambling,gambling,gambling,great acting,great acting,great acting


In [56]:
db['tags'][db['tags']['tag'].str.contains(r'^(g)([\w\d\s])*(ing)$')][['tag']].T

  return func(self, *args, **kwargs)


Unnamed: 0,373,457,687,1098,1651,1688,1703,1709,2122,3361,3455,3642
tag,good writing,great ending,great ending,gambling,golfing,gambling,gambling,gambling,gambling,great acting,great acting,great acting


In [57]:
#@title **SQL and Pandas - Syntax Analogy** { vertical-output: true, display-mode: "both" }
res = {'SQL': ['INSTR', 'LENGTH', 'SUBSTR', 'LIKE', 'title LIKE', '"%2018_"'],
                                  'Pandas': ["pd.Series.find()", 
                                             "pd.Series.str.len()",
                                             "pd.Series.str.slice()",
                                             "pd.Series.str.contains()",
                                             "table['title'].str.contains()",
                                             "r'(2018.)$'"                                             
                                             ]}
pd.DataFrame(res)

Unnamed: 0,SQL,Pandas
0,INSTR,pd.Series.find()
1,LENGTH,pd.Series.str.len()
2,SUBSTR,pd.Series.str.slice()
3,LIKE,pd.Series.str.contains()
4,title LIKE,table['title'].str.contains()
5,"""%2018_""",r'(2018.)$'


In [58]:
#@title **SQL and Pandas - LIKE Operator - Syntax Analogy** { vertical-output: true, display-mode: "both" }
res = {'SQL': ["LIKE 'No %'", "LIKE '%ood'", "LIKE 'g%ing'", "LIKE 'g%ing'", "LIKE 'r____s'"],
                                  'Pandas': ["pd.Series.startswith('No ')", 
                                             "pd.Series.endswith('ood')",
                                             "(pd.Series.startswith('g') & (pd.Series.endswith('ing'))",
                                             "pd.Series.str.contains(r'^(g)([\w\d\s])*(ing)$')",
                                             "pd.Series.str.contains(r'^r....s$')"                                                                                       
                                             ]}
pd.DataFrame(res)

Unnamed: 0,SQL,Pandas
0,LIKE 'No %',pd.Series.startswith('No ')
1,LIKE '%ood',pd.Series.endswith('ood')
2,LIKE 'g%ing',(pd.Series.startswith('g') & (pd.Series.endswith('ing'))
3,LIKE 'g%ing',pd.Series.str.contains(r'^(g)([\w\d\s])*(ing)$')
4,LIKE 'r____s',pd.Series.str.contains(r'^r....s$')


In [59]:
#@title **SQL and Pandas - `Wildcard Characters`** { vertical-output: true, display-mode: "both" }
res = {'SQL': ["%", "_",  "A__", "__Z", "ESCAPE", "[ ]", "! or ^", "-"],
                                  'Pandas': ["([\w\d\s]*)", 
                                             ".",
                                             "^A__",
                                             "__Z$",
                                             "\\",
                                             "[ ]",
                                             "~ or ?! [lookahead negation]" ,    
                                             "-"                                                                                                                          
                                             ],
       'Description': ['zero or more occurrence of any characters',
                       'exactly one occurrence of any characters',
                       'Any strings which starts with letter A, followed by two characteres',
                       'Any strings which ends with letter Z, preceded by two characteres',
                       'Escapes the meaning of the letter written after this operator',
                       'Represents a set of characters and matches if any of the characters present in the input  string',
                       'Negation Operator',
                       'Hyphen represents a range of character'
                       ]}
pd.DataFrame(res)

Unnamed: 0,SQL,Pandas,Description
0,%,([\w\d\s]*),zero or more occurrence of any characters
1,_,.,exactly one occurrence of any characters
2,A__,^A__,"Any strings which starts with letter A, followed by two characteres"
3,__Z,__Z$,"Any strings which ends with letter Z, preceded by two characteres"
4,ESCAPE,\,Escapes the meaning of the letter written after this operator
5,[ ],[ ],Represents a set of characters and matches if any of the characters present in the input string
6,! or ^,~ or ?! [lookahead negation],Negation Operator
7,-,-,Hyphen represents a range of character


### Query 25: Find all the tags that starts with 'r' and ends with 's', and having 11 characters in between. Group by tag and get the count of each group. Search should be case insensitive.

> * **SQL** 

In [60]:
table_name = 'tags'
rs = con.execute(f'''SELECT tag, COUNT(tag)
                     FROM {table_name}    
                     WHERE tag LIKE "r___________s"   
                     GROUP BY LOWER(tag)
                     ''')
pd.DataFrame(rs.fetchall(), columns=['tag', 'count'])

Unnamed: 0,tag,count
0,Ralph Fiennes,1
1,Ryan Reynolds,5


In [61]:
table_name = 'tags'
rs = con.execute(f'''SELECT tag, COUNT(tag)
                     FROM {table_name}    
                     WHERE tag LIKE "r%"  AND tag LIKE "%s" AND LENGTH(tag) = 11 + 2  
                     GROUP BY tag
                     ''')
pd.DataFrame(rs.fetchall(), columns=['tag', 'count'])

Unnamed: 0,tag,count
0,Ralph Fiennes,1
1,Ryan Reynolds,4
2,ryan reynolds,1


In [62]:
table_name = 'tags'
rs = con.execute(f'''SELECT tag, COUNT(tag)
                     FROM {table_name}    
                     WHERE tag LIKE "r%"  AND tag LIKE "%s" AND LENGTH(tag) = 11 + 2  
                     GROUP BY LOWER (tag)
                     ''')
pd.DataFrame(rs.fetchall(), columns=['tag', 'count'])

Unnamed: 0,tag,count
0,Ralph Fiennes,1
1,Ryan Reynolds,5


> * **Pandas** 

In [63]:
db['tags'][(db['tags']['tag'].str.lower().str.startswith('r')) & 
           (db['tags']['tag'].str.lower().str.endswith('s')) &
           (db['tags']['tag'].str.len() == 13)][['tag']].apply(lambda x: x.str.lower()).groupby(by=['tag']).agg({'tag': 'count'})

Unnamed: 0_level_0,tag
tag,Unnamed: 1_level_1
ralph fiennes,1
ryan reynolds,5


In [64]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = ['tag']

COLUMN = db['tags']['tag']
COLUMN_LOWER = COLUMN.str.lower()

STARTSWITH_CONDITION = (COLUMN_LOWER.str.startswith('r'))
ENDSWITH_CONDITION = (COLUMN_LOWER.str.endswith('s'))
LENGTH_CONDITION = (COLUMN_LOWER.str.len() == 13)

GROUPBY = pd.DataFrame.groupby
GROUPED_COLUMNS = ['tag']
AGG_OP = {'tag': 'count'}

WHERE_CLAUSE = (STARTSWITH_CONDITION & ENDSWITH_CONDITION & LENGTH_CONDITION)
res = GROUPBY(FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE].apply(lambda x: x.str.lower()), by=GROUPED_COLUMNS).agg(AGG_OP)
res

Unnamed: 0_level_0,tag
tag,Unnamed: 1_level_1
ralph fiennes,1
ryan reynolds,5


In [65]:
table = db[table_name]
FROM_CLAUSE = table
SELECT_CLAUSE = ['tag']

COLUMN = db['tags']['tag']
COLUMN_LIKE = COLUMN.str.contains
PATTERN = r"^[rR]([\w\d\s]){11}[s|S]$"

WHERE_CLAUSE = COLUMN_LIKE(PATTERN)

GROUPBY = pd.DataFrame.groupby
GROUPED_COLUMNS = ['tag']
AGG_OP = {'tag': 'count'}

res = GROUPBY(FROM_CLAUSE[WHERE_CLAUSE][SELECT_CLAUSE].apply(lambda x: x.str.lower()), by=GROUPED_COLUMNS).agg(AGG_OP)
res

  return func(self, *args, **kwargs)


Unnamed: 0_level_0,tag
tag,Unnamed: 1_level_1
ralph fiennes,1
ryan reynolds,5
