In [1]:
from pymongo import MongoClient
client = MongoClient()
client


MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [2]:
client.list_database_names()

['admin', 'config', 'formula_1', 'local']

In [3]:
import json as J
import os

def _load_nosql(database_dir):
    dbname = os.path.basename(database_dir.rstrip("/"))
    db = client[dbname]
    
    for path in os.listdir(database_dir):
        if path.endswith('.json'):
            collection_name = path.split(".")[0]
            if collection_name not in db.list_collection_names():
                collection = db[collection_name]

                collection_objs = J.load(open(os.path.join(database_dir, path)))
                collection.insert_many(collection_objs) 
    return db


def _documents_print(cursor):
    cursor_list = list(cursor)
    for d in cursor_list:
        for k, v in d.items():
            print(f"{k} : {v}", end=" | ")
        print("\n" + ("*" + "-"*25 + "*").center(125, " "))

def _classify_integer_field(values):
    """
    Classify whether an integer field is ordinal/nominal or continuous.
    """
    unique_values = set(values)
    if (len(unique_values) == len(values)) or (len(unique_values) <= 0.25*len(values)):  # Threshold can be adjusted
        return f"{_ordinal_nominal(values)}-cat"
    else:
        return "num"

def _uniqueness(values):
    unique_values = set(values)
    if len(unique_values) == len(values):
        return "unique"
    else:
        return "non-unique"

def _ordinal_nominal(values):
    if len(values) == 1:
        return 'ordinal'
    
    vsorted =  sorted(values)   
    vmin, vmax, d, n = vsorted[0], vsorted[-1], vsorted[1]-vsorted[0], len(values)-1

    return ('ordinal' if (n*d == vmax-vmin) else 'nominal')


from collections import defaultdict 
def _get_schema(db, collection_name):

    if os.path.exists(f'./NoSQL/{db.name}/schemas/{collection_name}.json'):
        return dict(J.load(open(f'./NoSQL/{db.name}/schemas/{collection_name}.json')))
    else:
        os.makedirs(f'./NoSQL/{db.name}/schemas', exist_ok=True)
    
    schema = defaultdict(lambda: [0, '', '', 'unique', 'non-null', -1, -1])  #count data-type, conpt-type, uniqueness, nullity, minvalue, maxvalue
    collection = db[collection_name]
    
    int_values = defaultdict(list)
    key_values = defaultdict(list)
   
    # Sample documents to infer schema
    sample_docs = collection.find()
    
    for doc in sample_docs:
        for key, value in doc.items():
            field_type = type(value).__name__
            schema[key][0] += 1
            if schema[key][1] in ['', 'NoneType']:
                schema[key][1] = field_type
            
            if field_type == 'NoneType':
                schema[key][4] = 'null'
            
            if field_type == 'int':
                int_values[key].append(value)
            
            key_values[key].append(value)
    
    for key, values in int_values.items():
        classification = _classify_integer_field(values)
        schema[key][2] = classification # = schema[key].pop('int')
        
    
    for key, values in key_values.items():
        uniqueness_value = _uniqueness(values)
        
        schema[key][3] = uniqueness_value  # = schema[key].pop('int')

        if schema[key][1] in ['str', 'dict', 'ObjectId']:
                schema[key][2] = 'cat'    
        elif schema[key][1] in ['datetime', 'float']:
                schema[key][2] = 'num'
                schema[key][5], schema[key][6] = min(values), max(values)
        elif schema[key][1] in ['NoneType']:
                schema[key][2] = 'null'
        else:
            schema[key][5], schema[key][6] = min(values), max(values)
            pass 

    
    J.dump(schema, open(f'./NoSQL/{db.name}/schemas/{collection_name}.json', 'w'))
    return schema

def _describe_database_schema(db):
    database_schema = {}
    print(f"Database : {db.name}")
    for collection_name in db.list_collection_names():
        #print(f"Describing schema for collection: {collection_name}")
        schema = _get_schema(db, collection_name)
        database_schema[collection_name] = schema
    
    return database_schema

def _schema_print(schema_description):
    # Print the schema
    for collection, schema in schema_description.items():
        print(f"Collection: {collection}")
        for field, data_list in schema.items():
            print(f"  Field: {field}")
            count, typee, zone, uniqueness, nullity, minval, maxval = data_list
            print(f"    Data-Type: {typee}, Count: {count}, Concpt-Type: {zone}, Unique: {True if uniqueness == 'unique' else False}, Null: {True if nullity == 'null' else False} " + (f"Min: {minval}, Max: {maxval}" if zone=='num' else ""))

def data_ingest(file_path='NoSQL/formula_1'):
    _load_nosql(file_path)
    return 

def data_explore(file_path='NoSQL/formula_1'):
    db = _load_nosql(file_path)
    schema_description = _describe_database_schema(db)
    _schema_print(schema_description)
    return schema_description

import random

def generate_sql(schema_description):
    order_queries = defaultdict(lambda : defaultdict(set))
    for collection in schema_description:
        query_pattern = {'cmd' : ['SELECT'],
                        'cat' : [""], 'num':[""], 'nom-cat':[""], 'ord-cat':[""], 
                        'agg': ['SUM', 'COUNT', 'AVG', 'MIN', 'MAX', 'STDDEV', 'VARIANCE', 'MEDIAN'], 
                        'clause':['WHERE', 'GROUP BY', 'ORDER BY'],
                        'numeric_op' : ['>', '<', '>=', '<=', '='] }
        
        for col in schema_description[collection]:
            _type = schema_description[collection][col][2]
            
            if _type  == 'cat':
                query_pattern['cat'] += [col]
            elif _type  == 'nominal-cat':
                query_pattern['nom-cat'] += [col]
            elif _type  == 'ordinal-cat':
                query_pattern['ord-cat'] += [col]
            else:
                query_pattern['num'] += [col]
        
       
        n = random.randint(10, 15)
        _ = 0

        

        while _ < n:
            query = ""
            cmd = random.choice(query_pattern['cmd'])
            query += cmd + " "

            agg = random.choice(query_pattern['agg'] + [""])
            
            if agg == "":
                _A_ = random.choice(query_pattern['cat'] + query_pattern['num']+ query_pattern['nom-cat'] + query_pattern['ord-cat'])
                if _A_ == "":
                    continue
                query += _A_ + f" FROM {collection} "
            elif agg == "COUNT":
                _A_ = random.choice(query_pattern['cat'] + query_pattern['num']+ query_pattern['nom-cat'] + query_pattern['ord-cat'] + ["*"])
                if _A_ == "":
                    continue
                query += f"{agg}({_A_}) FROM {collection} "
            elif agg == "MIN":
                _A_ = random.choice(query_pattern['num']+ query_pattern['ord-cat'])
                if _A_ == "":
                    continue
                query += f"{agg}({_A_}) FROM {collection} "
            elif agg == "MAX":
                _A_ = random.choice(query_pattern['num']+ query_pattern['ord-cat'])
                if _A_ == "":
                    continue
                query += f"{agg}({_A_}) FROM {collection} "
            elif agg == "MEDIAN":
                _A_ = random.choice(query_pattern['num']+ query_pattern['ord-cat'])
                if _A_ == "":
                    continue
                query += f"{agg}({_A_}) FROM {collection} "
            else:
                num = random.choice(query_pattern['num'])
                query += f"{agg}({num}) FROM {collection} "
                if num == "":
                    continue
            
            clause = random.choice(query_pattern['clause']+[""])

            if clause == "":
                pass
            elif clause == "WHERE":
                num = random.choice(query_pattern['num'])
                if num == "":
                    continue
                nop = random.choice(query_pattern['numeric_op'])
                numv = random.randint(schema_description[collection][col][5], schema_description[collection][col][6]+1)
                query += f"WHERE {num} {nop} {numv} "
            elif clause == "ORDER BY":
                num = random.choice(query_pattern['num'] + query_pattern['ord-cat'])
                if num == "":
                    continue
                query += f"ORDER BY {num}"
            elif clause == "GROUP BY":
                cat = random.choice(query_pattern['cat'] + query_pattern['nom-cat'])
                if cat == "":
                    continue
                query += f"GROUP BY {cat}"

            
            order_queries[agg][clause].add(query)
               
        
            _ += 1
    
    order_queries = dict(order_queries)
    
    
    for agg, clauses in order_queries.items():
        clauses = dict(clauses)
        for clause, queries in clauses.items():
            print(f"{agg} | {clause}", end="\n\t")
            for query in queries:
                print(query, end="\n\t")
            print("")
            print("-"*100)








In [4]:
schema_description=data_explore()


Database : formula_1
Collection: constructors
  Field: _id
    Data-Type: ObjectId, Count: 208, Concpt-Type: cat, Unique: True, Null: False 
  Field: constructorId
    Data-Type: int, Count: 208, Concpt-Type: nominal-cat, Unique: True, Null: False 
  Field: constructorRef
    Data-Type: str, Count: 208, Concpt-Type: cat, Unique: True, Null: False 
  Field: name
    Data-Type: str, Count: 208, Concpt-Type: cat, Unique: True, Null: False 
  Field: nationality
    Data-Type: str, Count: 208, Concpt-Type: cat, Unique: False, Null: False 
  Field: url
    Data-Type: str, Count: 208, Concpt-Type: cat, Unique: False, Null: False 
Collection: circuits
  Field: _id
    Data-Type: ObjectId, Count: 72, Concpt-Type: cat, Unique: True, Null: False 
  Field: circuitId
    Data-Type: int, Count: 72, Concpt-Type: ordinal-cat, Unique: True, Null: False 
  Field: circuitRef
    Data-Type: str, Count: 72, Concpt-Type: cat, Unique: True, Null: False 
  Field: name
    Data-Type: str, Count: 72, Concpt-Typ

In [5]:
generate_sql(schema_description)

COUNT | GROUP BY
	SELECT COUNT(*) FROM races GROUP BY url
	SELECT COUNT(date) FROM races GROUP BY circuitId
	SELECT COUNT(constructorId) FROM constructorStandings GROUP BY wins
	SELECT COUNT(*) FROM constructors GROUP BY constructorRef
	SELECT COUNT(_id) FROM constructors GROUP BY name
	SELECT COUNT(name) FROM races GROUP BY date
	SELECT COUNT(name) FROM constructors GROUP BY constructorRef
	SELECT COUNT(name) FROM constructors GROUP BY url
	
----------------------------------------------------------------------------------------------------
COUNT | 
	SELECT COUNT(url) FROM constructors 
	SELECT COUNT(nationality) FROM constructors 
	SELECT COUNT(date) FROM races 
	SELECT COUNT(constructorRef) FROM constructors 
	SELECT COUNT(url) FROM races 
	SELECT COUNT(*) FROM races 
	SELECT COUNT(constructorId) FROM constructors 
	SELECT COUNT(raceId) FROM races 
	
----------------------------------------------------------------------------------------------------
COUNT | WHERE
	SELECT COUNT(raceI

In [283]:
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

inp = "find all names of races happened in year 2009"

lemmatized_inp = " ".join([lemmatizer.lemmatize(word) for word in inp.split()])

import re 

actions = get_datamuse_synonyms('select')
aggs = []
for word in ['summation', 'minimum', 'maximum', 'average', 'median', 'count', 'variance', 'standard deviation']:
    aggs += [lemmatizer.lemmatize(synword) for synword in get_datamuse_synonyms(word)]


cols = ['names', 'date', 'time', 'round']
cols = [lemmatizer.lemmatize(word) for word in cols] #

lemmatized_cols = " ".join([lemmatizer.lemmatize(word) for word in cols])

pattern = f'(?P<action>{"|".join(actions)}).*(?P<agg>{"|".join(aggs)})?.*(?P<col>{"|".join(cols)}).*(?P<table>races).*((?P<clause>\w+)(?P<cond>\w+))?'

matches = re.search(f'{pattern}', inp)



In [4]:
from pyparsing import Word, alphas, nums, oneOf, Combine, Keyword, Group, Optional

# Define basic SQL keywords
select_keyword = Keyword("select", caseless=True)
from_keyword = Keyword("from", caseless=True)
where_keyword = Keyword("where", caseless=True)
and_keyword = Keyword("and", caseless=True)
or_keyword = Keyword("or", caseless=True)

# Define basic components
column_name = Word(alphas + "_")
table_name = Word(alphas + "_")
integer = Word(nums)
comparison_operator = oneOf("= != < > <= >=")

# Grammar for SELECT statement
select_stmt = (
    select_keyword 
    + column_name.setResultsName("column") 
    + from_keyword 
    + table_name.setResultsName("table")
    + Optional(where_keyword 
               + column_name.setResultsName("where_column") 
               + comparison_operator.setResultsName("operator") 
               + integer.setResultsName("value"))
)

# Example parsing
query = "find all names of races happened in year 2009"
parsed_result = select_stmt.parseString(query)

print(parsed_result.dump())


import nltk
from nltk import CFG

# Define a simple grammar
grammar = CFG.fromstring("""
    S -> SELECT CLAUSE FROM TABLE WHERE_COND
    SELECT -> "select"
    CLAUSE -> COLUMN
    COLUMN -> "age" | "name" | "salary" 
    FROM -> "from"
    TABLE -> "users" | "employees"
    WHERE_COND -> "where" CONDITION
    CONDITION -> COLUMN OPERATOR VALUE
    OPERATOR -> "=" | ">" | "<"
    VALUE -> "21" | "1000"
""")

# Define a parser
parser = nltk.ChartParser(grammar)

# Parse the input
query = "select age from users where age > 21"
tokens = query.split()

# Parse and print results
for tree in parser.parse(tokens):
    print(tree)


ParseException: Expected Keyword 'select', found 'find'  (at char 0), (line:1, col:1)

In [None]:
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

inp = "find all names of races happened in year 2009"

lemmatized_inp = " ".join([lemmatizer.lemmatize(word) for word in inp.split()])

import re 

actions = get_datamuse_synonyms('select')
aggs = []
for word in ['summation', 'minimum', 'maximum', 'average', 'median', 'count', 'variance', 'standard deviation']:
    aggs += [lemmatizer.lemmatize(synword) for synword in get_datamuse_synonyms(word)]


cols = ['names', 'date', 'time', 'round']
cols = [lemmatizer.lemmatize(word) for word in cols] #

lemmatized_cols = " ".join([lemmatizer.lemmatize(word) for word in cols])

pattern = f'(?P<action>{"|".join(actions)}).*(?P<agg>{"|".join(aggs)})?.*(?P<col>{"|".join(cols)}).*(?P<table>races).*((?P<clause>\w+)(?P<cond>\w+))?'

matches = re.search(f'{pattern}', inp)



In [10]:
hello_or_hi = W
hello_or_hi.parse_string("hi")

ParseResults(['h'], {})

In [2]:
%pip install pyparsing

Collecting pyparsing
  Downloading pyparsing-3.2.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pyparsing-3.2.0-py3-none-any.whl (106 kB)
Installing collected packages: pyparsing
Successfully installed pyparsing-3.2.0
Note: you may need to restart the kernel to use updated packages.


In [259]:
import requests

def get_datamuse_synonyms(word):
    url = f"https://api.datamuse.com/words?ml={word}&v=enwiki"
    response = requests.get(url).json()
    synonyms = [item['word'] for item in response]
    return synonyms

# Example usage:


def find_common_related_words(word1, word2):
    # Get related words for each input word
    url1 = f"https://api.datamuse.com/words?ml={word1}"
    url2 = f"https://api.datamuse.com/words?ml={word2}"
    
    response1 = requests.get(url1).json()
    response2 = requests.get(url2).json()
    
    # Extract related words
    related1 = {item['word'] for item in response1}
    related2 = {item['word'] for item in response2}
    
    # Find common related words
    common_related = related1.intersection(related2)
    total = related1.union(related2)
    
    return len(common_related)/len(total)

# Example usage:




0.02040816326530612


In [38]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("samruddhim/imdb-movies-analysis")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/samruddhim/imdb-movies-analysis?dataset_version_number=1...


100%|██████████| 15.2M/15.2M [00:00<00:00, 16.9MB/s]

Extracting files...





Path to dataset files: /Users/pranu/.cache/kagglehub/datasets/samruddhim/imdb-movies-analysis/versions/1


In [39]:
!mv /Users/pranu/.cache/kagglehub/datasets/samruddhim/imdb-movies-analysis/versions/1 NoSQL/

In [50]:
import json


def clean_mongo_json(data):
    """Recursively clean MongoDB JSON by removing '$'-prefixed keys."""
    if isinstance(data, dict):
        for key, value in data.items():
            if key.startswith('$') or key.startswith("_id"):
                del data[key]
                continue
            else:
                while isinstance(value, dict):
                    value = list(value.values())[0]
                    pass
            
                data[key] = value
    

    return data
    

# Load your MongoDB sample data
input_file = 'NoSQL/movie_info/comments.json'
output_file = 'NoSQL/movie_info/cleaned_comments.json'

with open(input_file, 'r') as file:
    # Load JSON data from the file
    mongo_data = file.readlines()
    for i in range(len(mongo_data)):
        mongo_data[i] = json.loads(mongo_data[i])

# Clean the MongoDB JSON
cleaned_data = clean_mongo_json(mongo_data)

# Save the cleaned data to a new file
with open(output_file, 'w') as file:
    json.dump(cleaned_data, file, indent=4)

print(f"Cleaned data saved to {output_file}")


Cleaned data saved to NoSQL/movie_info/cleaned_comments.json


In [52]:
!mongoimport --db movies_info --collection comments --file NoSQL/movie_info/comments.json 

2024-10-15T20:02:04.941-0700	connected to: mongodb://localhost/
2024-10-15T20:02:05.810-0700	50304 document(s) imported successfully. 0 document(s) failed to import.
