In [1]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [58]:
import os
import sys
sys.path.insert(0, '../src/')
import pandas as pd
import spacy
from collections import defaultdict
from data_cleaning import raw_json_to_clean_df, split_df, df_lang
from data_transformation import transform, get_X_y, distribution
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import OneHotEncoder


## 1. Data Collection

We begin by scrapping Wiktionary.org for *feminine*, *masculine*, and *neuter* nouns in *Polish*, *German*, *Spanish*, and *French*.


In [4]:
# run webscrapper

## 2. Data Cleaning

The raw data in json format must be cleaned: removing nouns with *spaces*, *hyphens*, *numbers*, *abbreviations*, *initials* and finally those that are *proper nouns*.

In [62]:
# read json file and load it as a DataFrame
path = '../data/raw_scraped_data.json'
raw_df = pd.read_json(path)
raw_df

Unnamed: 0,noun,gender,lang
0,a,masculine,Polish
1,Aalborg,masculine,Polish
2,aalen,masculine,Polish
3,Aarhus,masculine,Polish
4,abacysta,masculine,Polish
...,...,...,...
328184,zurrona,feminine,Spanish
328185,zutana,feminine,Spanish
328186,zwingliana,feminine,Spanish
328187,zwingliana,feminine,Spanish


In [10]:
# remove numbers, hypens, spaces, and periods
no_nums = raw_df[(~raw_df['noun'].str.contains('-| |\.|1|2|3|4|5|6|7|8|9|0'))]
print(no_nums.shape)
# remove full uppercase
no_caps = no_nums[(~no_nums['noun'].str.isupper())]
print(no_caps.shape)


(301257, 3)
(299192, 3)


Break df into each language to: 
- remove duplicates in each language
- remove nouns that begin with a capital letter (except for German) 

In [18]:
spacy.load("de_core_news_sm")

# separate df for each language
dfs = df_lang(no_caps)

# remove nouns that begin with captial letters (except German)
temp = []
for sub_df in dfs:
    if sub_df.lang != 'German':
        temp.append(sub_df.df[~sub_df.df['noun'].str.istitle()])
    else: # otherwise, for German, pass into SpaCy and filter out Proper nouns
        words = pd.Series(sub_df.df['noun']).tolist()
        nlp = spacy.load("de_core_news_sm")
        text = " ".join(words) 
        nlp.max_length = len(text) 
        doc = nlp(text) 
        tokens = [token.text for token in doc if token.pos_ != 'PROPN']
        temp.append(sub_df.df[sub_df.df['noun'].isin(tokens)])
clean = pd.concat(temp)

# remove duplicates
# no_dups = no_title.drop_duplicates(subset=['noun', 'lang'], keep=False)
# print(no_dups.shape)

We know want to pass in our dataframe into SpaCy to filter out proper nouns 
(this should've been acheived on all but German already, but better to be safe than sorry)

In [19]:
clean

Unnamed: 0,noun,gender,lang
0,a,masculine,Polish
2,aalen,masculine,Polish
4,abacysta,masculine,Polish
5,abak,masculine,Polish
7,abakawir,masculine,Polish
...,...,...,...
328184,zurrona,feminine,Spanish
328185,zutana,feminine,Spanish
328186,zwingliana,feminine,Spanish
328187,zwingliana,feminine,Spanish


## 3 Transform and Encode Data
reduce data down to an even amount of examples per language and per gender
encode last 3 letters of each noun

In [32]:
# get length of longest noun
max_length = clean['noun'].str.len().max()

def add_filler(word):
    if len(word) < max_length:
        diff = max_length - len(word)
        return '#' * diff + word
    return word

# apply function to every value in column 'noun'

clean['noun'] = clean['noun'].apply(add_filler)
print(clean.shape)

(261970, 3)


In [59]:
grouped = clean.groupby(['gender','lang']).size().unstack()
lowest_value = int(grouped.min().min())
print('lowest_value = ', lowest_value)
print(distribution(clean))

lowest_value =  2392
lang       French  German  Polish   Spanish
gender                                     
feminine   3801.0  3624.0  2555.0  116375.0
masculine  3911.0  2485.0  2681.0  119263.0
neuter        NaN  2392.0  4883.0       NaN


In [60]:
def trans(dframe, n=0):
    reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
    to_be_encoded = reduced_df['noun'].str[-n:] # grab n amount of letters start from the end to encode only
    ohe = OneHotEncoder(sparse=False) # initialize the encoder
    transformed = ohe.fit_transform(to_be_encoded.to_numpy().reshape(-1, 1)) # encode
    transformed_df = pd.DataFrame(transformed) # convert to a dataframe
    reduced_df.reset_index(inplace=True, drop=True) # reset indexes
    return pd.concat([reduced_df, transformed_df], axis=1) # create new dataframe of reduced df and transformed df

In [61]:
trans_df = trans(clean, 3) # taking only the last three letters
trans_df.head(5)

  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value


Unnamed: 0,noun,gender,lang,0,1,2,3,4,5,6,...,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485
0,##########################uricotélisme,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,############################navigation,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,#################################zikri,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,###############################gaduine,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,################################uranie,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4 Train Data

## 4.1 Define X and y

In [16]:
X, y = get_X_y(trans_df)

## 4.2 Split data into training and testing

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### 4.3 K-NN

In [18]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.8213746223564955

### 4.4 Perceptron

In [19]:
p = Perceptron(random_state=42)
p.fit(X_train, y_train)
p.score(X_test, y_test)

0.8421450151057401

Running K-NN and Percetron one by one from last letter until the whole word is encoded

In [43]:
# from collections import defaultdict
# def multi_train(df):
#     results = defaultdict(list)
#     max_length = df['noun'].str.len().max()
#     for n in range(1, max_length + 1):
#         trans_df = trans(df, n) # taking only n amount of letters
#         X, y = get_X_y(trans_df)
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


        
#         knn = KNeighborsClassifier(n_neighbors=3)
#         knn.fit(X_train, y_train)
#         results['KNN'].append(knn.score(X_test, y_test))


#         p = Perceptron(random_state=42)
#         p.fit(X_train, y_train)
#         p.score(X_test, y_test)
#         results['Perceptron'].append(p.score(X_test, y_test))

#     return results

In [44]:
# res = multi_train(clean)
# res

  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduc

defaultdict(list,
            {'KNN': [0.6321070234113713,
              0.7648411371237458,
              0.8018394648829431,
              0.7974498327759197,
              0.6494565217391305,
              0.5865384615384616,
              0.5854933110367893,
              0.5468227424749164,
              0.3181438127090301,
              0.47575250836120403,
              0.47679765886287623,
              0.4306020066889632,
              0.4834866220735786,
              0.47491638795986624,
              0.4676003344481605,
              0.4387541806020067,
              0.43164715719063546,
              0.46488294314381273,
              0.4627926421404682,
              0.4667642140468227,
              0.46801839464882944,
              0.2244983277591973,
              0.4698996655518395,
              0.5137959866220736,
              0.4343645484949833,
              0.4306020066889632,
              0.5054347826086957,
              0.4761705685618729,
              0.4

Just Spanish, being trained and tested on Spanish

In [56]:
def multi_train_per_lang(df):
    results = defaultdict(lambda :defaultdict(list))
    max_length = df['noun'].str.len().max()
    dfs = df_lang(df)
    for sub_df in dfs:
        for n in range(1, max_length + 1):
            trans_df = trans(sub_df.df, n) # taking only n amount of letters
            X, y = get_X_y(trans_df)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


            
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train, y_train)
            results['KNN'][n].append((knn.score(X_test, y_test), sub_df.lang))


            p = Perceptron(random_state=42)
            p.fit(X_train, y_train)
            p.score(X_test, y_test)
            results['Perceptron'][n].append((p.score(X_test, y_test), sub_df.lang))

    return results

In [57]:
all_langs = multi_train_per_lang(clean)
all_langs

  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
  reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduc

defaultdict(<function __main__.multi_train_per_lang.<locals>.<lambda>()>,
            {'KNN': defaultdict(list,
                         {1: [(0.942200557103064, 'Polish'),
                           (0.6525069637883009, 'German'),
                           (0.7063740856844305, 'French'),
                           (0.8244514106583072, 'Spanish')],
                          2: [(0.958217270194986, 'Polish'),
                           (0.7604456824512534, 'German'),
                           (0.8035527690700105, 'French'),
                           (0.8474399164054337, 'Spanish')],
                          3: [(0.9115598885793872, 'Polish'),
                           (0.8175487465181058, 'German'),
                           (0.7210031347962382, 'French'),
                           (0.7857889237199582, 'Spanish')],
                          4: [(0.7388579387186629, 'Polish'),
                           (0.8022284122562674, 'German'),
                           (0.8056426332288401