In [7]:
import pandas as pd
import numpy as np
import gensim.downloader

from sklearn.model_selection import train_test_split

DATA_FOLDER = 'data/'
CLEAN = 'clean_data.csv'
TRAIN = 'train_data.csv'
TEST = 'test_data.csv'

In [8]:
from string import punctuation
EXPRESSIONS_TO_REMOVE = ["\\"+x for x in list(punctuation)]


def prepare_data(data:pd.DataFrame) -> pd.DataFrame:
    data = data[['answer','clue']]   #Keep only relevant columns
    prepare_noun_required(data)
    prepare_fill_blank(data)
    clean_strings(data)
    data.drop_duplicates(['answer','clue'],keep='first', inplace=True)  #Drop repetitions of same answer/clue pairs
    data.dropna(inplace=True)
    return data

def prepare_noun_required(data:pd.DataFrame) -> pd.DataFrame:
    data['noun_involved'] = data['clue'].str.contains('[A-Z].*[A-Z]',regex=True)
    return data

def prepare_fill_blank(data:pd.DataFrame) -> pd.DataFrame:
    data['fill_blank'] = data['clue'].str.contains('_', regex=False)
    return data

def clean_strings(data:pd.DataFrame) -> pd.DataFrame:
    data['answer'] = data['answer'].str.lower().str.strip()  
    data['clue'] = data['clue'].str.lower().str.strip()
    data.replace('$', ' money ',            regex=False,    inplace=True)
    data.replace('``', '"',                 regex=False,    inplace=True)
    data.replace(r'\b\w{1,1}\b','',         regex=True,     inplace=True) 
    data.replace(EXPRESSIONS_TO_REMOVE, ' ',regex=True,     inplace=True)
    data.replace('\d+', '',                 regex=True,     inplace=True)
    data.replace('  ', ' ',                 regex=False,    inplace=True)
    data.replace(['nan',''], np.nan,        regex=False,    inplace=True)
        
    return data
    

### Clean data and save

In [9]:
df = pd.read_table('data/clues.tsv')
df = prepare_data(df)
df.to_csv(DATA_FOLDER+CLEAN, index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['noun_involved'] = data['clue'].str.contains('[A-Z].*[A-Z]',regex=True)


### Assign model vocabulary booleans and save

In [21]:
def is_in_pretrained_word2vec_model(data:pd.DataFrame, model_name:str)->pd.DataFrame:
    trained_model = gensim.downloader.load(model_name)
    vocab = [x.split('/')[-1] for x in trained_model.index_to_key]
    data[model_name] = data['answer'].isin(vocab)
    return data


In [22]:
#glove-twitter-200
df = pd.read_csv(DATA_FOLDER+CLEAN)
test = df.iloc[0]
test = is_in_pretrained_word2vec_model(test,'glove-twitter-200')
df.to_csv(DATA_FOLDER+CLEAN, index=False)

### Split data into train and test sets

In [None]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

df.to_csv(DATA_FOLDER+CLEAN, index=False)
train.to_csv(DATA_FOLDER+TRAIN,index=False)
test.to_csv(DATA_FOLDER+TEST,index=False)