In [10]:
import spacy
import pandas as pd
from typing import List, Tuple
from spacy.language import Language
from collections import namedtuple
from data  import *


ModuleNotFoundError: No module named 'data'

## 1. Data Collection

We begin by scrapping Wiktionary.org for *feminine*, *masculine*, and *neuter* nouns in *Polish*, *German*, *Spanish*, and *French*.


In [None]:
# run webscrapper

## 2. Data Cleaning

The raw data in json format must be cleaned: removing nouns with *spaces*, *hyphens*, *numbers*, *abbreviations*, *initials* and finally those that are *proper nouns*.

In [4]:
# read json file and load it as a DataFrame
path = '../data/raw_scraped_data.json'
df = pd.read_json(path)

In [11]:
# intial filtering of hypen, blank space, all digits and all caps words
df = 

In [12]:
# split dataframe into smaller ones by language
def split_df(df: pd.DataFrame)-> List[Tuple[str, pd.DataFrame]]:
    """
    Splits main df by 'lang' column, and creates new sub
    dataFrames

    returns:
        list: namedtuple (lang, df)
    """
    Sub_df = namedtuple('Sub_df', ['lang', 'df'])
    languages = df['lang'].unique().tolist()
    dataframes = [df[df['lang'] == lang] for lang in languages]
    return [Sub_df(lang, sub_df) for lang, sub_df in zip(languages, dataframes)]

In [13]:
# match sub dataFrames with its Spacy language model respectively
def sub_df_and_model(df: pd.DataFrame)-> List[Tuple[pd.DataFrame, Language]]:
    """
    
    """
    Model = namedtuple('Model', ['lang', 'nlp'])
    Df_nlp = namedtuple('Df_and_Model', ['df', 'nlp'])
    sub_dfs = split_df(df) # list of all sub DataFrames based on language
    d = {'Spanish': 'es', 'French': 'fr', 'German': 'de', 'Polish': 'pl'}
    models = [Model(lang, spacy.load(lang + '_core_news_sm')) for lang in d.values()]
    return [Df_nlp(sub_df.df, model.nlp) for sub_df, model in zip(sub_dfs, models)]

In [14]:
def good_pos_list(tup: Tuple[List[str], Language])-> List[str]:
    """
    takes in a namedtuple, uses the list of nouns stored in tup.words
    and passes each word into SpaCy POS tagger, appending only the nouns
    NOT labeled as Proper Nouns

    returns:
        list: nouns (str)
    """
    Noun = namedtuple('Noun', ['text', 'lemma'])
    nlp = tup.nlp 
    text = " ".join(tup.words) # all nouns from list into a str
    nlp.max_length = len(text) # increase the length the parser can handle
    doc = nlp(text) 
    print(f"len of doc = {len(doc)}")
    return [Noun(token.text, token.lemma_) for token in doc if token.pos_ != 'PROPN']

In [15]:
def clean_df(df: pd.DataFrame)-> pd.DataFrame:
    """
    takes in a DataFrame, creates sub dataframes based on each unique language,
    then takes each word found in each sub dataframe and passes it into SpaCy
    POS tagger and filters out nouns NOT labeled as Proper Nouns, utlimately
    return a list of sub dataframes complelety populated by nouns in each
    given language.

    returns:
        res(list): list of sub dataframes per language
    """
    Data = namedtuple('Data', ['words', 'nlp']) # namedtuple to hold our data as words and nlp
    df_and_nlp = sub_df_and_model(df) # sub dataFrames and spacy nlp models
    # res = pd.DataFrame() # empty dataFrame

    tup = df_and_nlp[0] # for every tuple (sub DataFrame, spacy model)
    print(f"tup df len = {tup.df.shape}")
    data = Data(pd.Series(tup.df['noun']).tolist(), tup.nlp) # create a Data namedtuple (list of nouns, specific language model)
    words ,lemmas = zip(*good_pos_list(data))

    print(f"len of words = {len(words)} and len of lemmas = {len(lemmas)}")
    data_tuples = list(zip(words, lemmas))
    cool = pd.DataFrame(data_tuples, columns=['noun', 'lemma'])
    small = tup.df[tup.df['noun'].isin(words)]
    df2 = pd.concat([cool, small[['gender', 'lang']]], axis=1)
    print(cool.shape)
    print(small.shape)
    # res = pd.DataFrame() # empty dataFrame

    # tup.df['lemma'] = lemmas
    # temp = tup.df[tup.df['noun'].isin(words)]
    # return temp, tup.df
        # lemma = pd.DataFrame(lemmas)
        # temp = tup.df[tup.df['noun'].isin(words)]
        # new = temp['lemma'] = lemmas
        # print(new)
        # res.append(tup.df[tup.df['noun'].isin(words)]) # append a sub DataFrame per language with nouns verified as non-proper nouns via SpaCy
        
    # return res # return the new list of sub dataFrames

In [16]:
temp, res = clean_df(filtered)

tup df len = (15147, 3)
len of doc = 15148
len of words = 5201 and len of lemmas = 5201
(5201, 2)
(5244, 3)


TypeError: cannot unpack non-iterable NoneType object

In [10]:
temp

NameError: name 'temp' is not defined

In [42]:
clean_df(filtered)

5201
5201
3791
3791
5058
5058
210678
210678


[]

In [20]:
Data = namedtuple('Data', ['words', 'nlp'])

for tup in df_and_nlp: # for every tuple (sub DataFrame, spacy model)
        data = Data(pd.Series(tup.df['noun']).tolist(), tup.nlp) # create a Data namedtuple (list of nouns, specific language model)
        words ,lemmas = zip(*good_pos_list(data))

In [35]:
lemma = pd.DataFrame(lemmas)
lemma

Unnamed: 0,0
0,aaronita
1,ababábity
2,ababol
3,a
4,aaronit
...,...
210673,zurra
210674,zurribanda
210675,zurryć
210676,zutać


In [37]:
c = tup.df[tup.df['noun'].isin(words)]
c
d = c['lemma'] = lemmas

ValueError: Length of values (210678) does not match length of index (215889)

In [102]:
def sub_to_json(sub_dfs: List[pd.DataFrame])-> None:
    """
    takes a list of sub dataFrames and creates a json for each
    the name of the file is modified by the particular language the sub DataFrame
    represents
    """
    base = '../data/'
    filename = '_cleaned_data.csv'
    for df in sub_dfs:
        lang = df['lang'].unique().tolist()[0] # get only value in column 'lang'
        df.to_csv(base + lang + filename, index=False)

In [13]:
res = clean_df(filtered)

               0
0              a
1          aalir
2       abacysta
3              a
4        abpstwo
...          ...
5196     zalążek
5197       zalew
5198       załom
5199  założyciel
5200      zamiar

[5201 rows x 1 columns]
                 0
0          aalhaut
1        aalleiter
2         aalsuppe
3         aarmühle
4        aasfliege
...            ...
3786   ziehtochter
3787       Ziehung
3788  zielfunktion
3789    Zielgerade
3790     ziellinie

[3791 rows x 1 columns]
                  0
0             abaca
1           Abacost
2       Abaissement
3         Abaisseur
4     Abandonnateur
...             ...
5053       zézayeur
5054           zgeg
5055         zgégos
5056          zgueg
5057        zguègue

[5058 rows x 1 columns]
                 0
0         aaronita
1        ababábity
2           ababol
3                a
4          aaronit
...            ...
210673       zurra
210674  zurribanda
210675      zurryć
210676       zutać
210677   zwingliać

[210678 rows x 1 columns

In [11]:
res

[         0 gender lang noun
 0      NaN    NaN  NaN  NaN
 1      NaN    NaN  NaN  NaN
 2      NaN    NaN  NaN  NaN
 3      NaN    NaN  NaN  NaN
 4      NaN    NaN  NaN  NaN
 ...    ...    ...  ...  ...
 56388  NaN    NaN  NaN  NaN
 56463  NaN    NaN  NaN  NaN
 56474  NaN    NaN  NaN  NaN
 56475  NaN    NaN  NaN  NaN
 56487  NaN    NaN  NaN  NaN
 
 [9791 rows x 4 columns],
          0 gender lang noun
 0      NaN    NaN  NaN  NaN
 1      NaN    NaN  NaN  NaN
 2      NaN    NaN  NaN  NaN
 3      NaN    NaN  NaN  NaN
 4      NaN    NaN  NaN  NaN
 ...    ...    ...  ...  ...
 56195  NaN    NaN  NaN  NaN
 56196  NaN    NaN  NaN  NaN
 56198  NaN    NaN  NaN  NaN
 57201  NaN    NaN  NaN  NaN
 58402  NaN    NaN  NaN  NaN
 
 [7458 rows x 4 columns],
          0 gender lang noun
 0      NaN    NaN  NaN  NaN
 1      NaN    NaN  NaN  NaN
 2      NaN    NaN  NaN  NaN
 3      NaN    NaN  NaN  NaN
 4      NaN    NaN  NaN  NaN
 ...    ...    ...  ...  ...
 58797  NaN    NaN  NaN  NaN
 58798  NaN    N

In [104]:
# read cleaned data
import os
path = '../data/'


def grab_data(path):
    return [file for file in os.listdir(path) if 'csv' in file]



In [108]:
def load_data(path):
    Example = namedtuple('Example', ['lang', 'df'])
    filenames = grab_data(path)
    res = []
    for filename in filenames:
        df = pd.read_csv(path + filename)
        lang, *_ = filename.split('_')
        res.append(df)
    return res

In [17]:
j = load_data(path)

NameError: name 'load_data' is not defined

In [110]:
j[0]

Unnamed: 0,noun,gender,lang
0,abaca,masculine,French
1,abacost,masculine,French
2,abaissement,masculine,French
3,abaisseur,masculine,French
4,abandonnataire,masculine,French
...,...,...,...
5164,zézayeur,masculine,French
5165,zgeg,masculine,French
5166,zgégos,masculine,French
5167,zgueg,masculine,French


In [165]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


fr = j[0]
ohe = OneHotEncoder(sparse=False, categories='auto')
transformed = ohe.fit_transform(fr['noun'].to_numpy().reshape(-1, 1))




In [177]:
new = pd.DataFrame(transformed)
new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4310,4311,4312,4313,4314,4315,4316,4317,4318,4319
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
w = pd.DataFrame(y)
w.columns = ['encoding']

In [171]:
data = pd.concat([fr, new], axis=1)
data

Unnamed: 0,noun,gender,lang,0,1,2,3,4,5,6,...,4310,4311,4312,4313,4314,4315,4316,4317,4318,4319
0,abaca,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,abacost,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,abaissement,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,abaisseur,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,abandonnataire,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,zézayeur,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5165,zgeg,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5166,zgégos,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5167,zgueg,masculine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
