In [12]:
from gensim.models import KeyedVectors
import sqlite3
from tqdm import tqdm
from stop_words import get_stop_words
import re
import numpy as np
from unidecode import unidecode 

In [13]:
# Path of the binary word2vec-format file that is loaded into `model` below.
WORD_MODEL = "frWiki_no_phrase_no_postag_1000_skip_cut200.bin"
# SQLite database file that receives the `all_words_fr` table and the
# per-day `day<N>` tables.
OUTPUT_DB = "word2vec2.db"

In [14]:
# Secret words to add. The loop at the end of the notebook creates one new
# `day<N>` table per entry, in list order.
secret_words = ["machin", 
                "chose",
               ] 

In [15]:
def connect_to_db(db_file, WAL = False):
    """Open the SQLite database ``db_file``.

    Args:
        db_file: Path handed to ``sqlite3.connect``.
        WAL: When truthy, issue ``PRAGMA journal_mode=WAL`` on the new
            connection before the cursor is created.

    Returns:
        A ``(cursor, connection)`` tuple, in that order. Closing the
        connection is left to the caller.
    """
    connection = sqlite3.connect(db_file)
    if WAL:
        connection.execute("PRAGMA journal_mode=WAL")
    return connection.cursor(), connection

In [16]:
def table_exist(table_name, db_file=None):
    """Tell whether a table named ``table_name`` exists in a SQLite database.

    Args:
        table_name: Name looked up in ``sqlite_master``. It is passed as a
            bound parameter, so names containing quotes are handled safely.
        db_file: Database file to inspect. Defaults to the notebook-level
            ``OUTPUT_DB`` when omitted.

    Returns:
        ``True`` if ``sqlite_master`` lists a table with that name,
        ``False`` otherwise.
    """
    if db_file is None:
        db_file = OUTPUT_DB
    con = sqlite3.connect(db_file)
    try:
        row = con.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        ).fetchone()
    finally:
        # Always release the connection; the lookup opens its own each call.
        con.close()
    return row is not None

In [17]:
def get_next_day(conn=None):
    """Return the number to use for the next ``day<N>`` table.

    Lists the tables whose name starts with ``day`` and keeps only those
    whose name is exactly ``day`` followed by digits; any other table (for
    example ``days_archive``) is ignored rather than crashing the integer
    parsing.

    Args:
        conn: Open ``sqlite3`` connection to inspect. When omitted, the
            notebook-level ``con`` is used, so that connection must already
            be open.

    Returns:
        ``1`` when no ``day<N>`` table exists, otherwise the highest existing
        ``N`` plus one, as a plain Python ``int``.
    """
    if conn is None:
        conn = con  # notebook-level connection created by connect_to_db()

    # SQLite treats table names case-insensitively (and LIKE matches ASCII
    # case-insensitively), so "Day3" must count as day 3 as well.
    day_table = re.compile(r'day([0-9]+)', re.IGNORECASE)

    cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'day%';")
    try:
        names = [row[0] for row in cursor.fetchall()]
    finally:
        cursor.close()

    matches = (day_table.fullmatch(name) for name in names)
    list_days = [int(m.group(1)) for m in matches if m]

    return max(list_days, default=0) + 1

# Load things

## Load model

In [18]:
# Load the pre-trained vectors from WORD_MODEL (binary word2vec format).
# NOTE(review): unicode_errors="ignore" presumably makes the loader skip
# undecodable bytes in vocabulary entries instead of raising — confirm
# against the gensim documentation.
model = KeyedVectors.load_word2vec_format(
    WORD_MODEL,
    binary=True, 
    unicode_errors="ignore"
)
# The model's keys, later written to the `all_words_fr` table.
all_words = model.index_to_key

## Load stop words

In [8]:
# French stop words; used further down to drop such words from each secret
# word's list of related words.
french_stop_words = get_stop_words('french')

# First, add the full list of known words if it does not already exist

In [21]:
# Write the model's full vocabulary to `all_words_fr`, but only when that
# table is missing from the database. (The test was previously inverted: it
# rewrote the table when it already existed and did nothing on a fresh DB.)
if not table_exist("all_words_fr"):
    print('List of all words not present, writing it.')
    cur, con = connect_to_db(OUTPUT_DB)
    try:
        cur.execute("create table if not exists all_words_fr (word text PRIMARY KEY)")
        con.commit()

        # `with con` commits the DELETE and the inserts together, or rolls
        # both back if an insert fails.
        with con:
            con.execute("DELETE FROM all_words_fr")
            con.executemany(
                "insert into all_words_fr values(?)",
                [[w] for w in all_words],
            )
    finally:
        # Release the connection even when the write fails.
        con.close()
else:
    print('List of all words already there, nothing to do.')

Word list already there, nothing to do.


# Find the first day without a word
This tells us which day table the new words should be added to.

In [22]:
print('Adding words to secret list...')
cur, con = connect_to_db(OUTPUT_DB)

# Ranks handed out to the retained words, best first: 1000, 999, ..., 1.
score = list(range(1000,0,-1))

for secret_word in secret_words:
    # The secret word heads its own list with a similarity of 1.0, followed
    # by its 1500 nearest neighbours according to the model.
    candidates = [(secret_word, 1.)]
    candidates.extend(model.most_similar(secret_word, topn=1500,))

    # Discard stop words before ranking.
    kept = [pair for pair in candidates if pair[0] not in french_stop_words]

    # Build [word, similarity, rank] rows; zip() stops at the shorter input,
    # so at most 1000 rows survive.
    list_of_related_words = [
        [word, similarity, rank]
        for (word, similarity), rank in zip(kept, score)
    ]

    # Each secret word gets the next free day<N> table.
    new_day = get_next_day()
    cur.execute(f"create table if not exists day{new_day} (word text PRIMARY KEY, value NUMERIC, score int)")
    con.commit()

    # Empty the table, then insert the rows; leaving the `with` block commits.
    con.execute(f"DELETE FROM day{new_day}")
    with con:
        con.executemany(
            f"insert into day{new_day} values(?,?,?)",
            list_of_related_words,
        )

Adding words to secret list...
