In [1]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import sys
sys.path.insert(0, '../src/')
import pandas as pd
import spacy
from collections import defaultdict
# from data_cleaning import split_df, df_lang
from data_transformation import get_X_y, distribution
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import OneHotEncoder


## 1. Data Collection

We begin by scrapping Wiktionary.org for *feminine*, *masculine*, and *neuter* nouns in *Polish*, *German*, *Spanish*, and *French*.


In [3]:
# run webscrapper

## 2. Data Cleaning

The raw data in json format must be cleaned: removing nouns with *spaces*, *hyphens*, *numbers*, *abbreviations*, *initials* and finally those that are *proper nouns*.

### 2.1 Read data

In [4]:
# read json file and load it as a DataFrame
path = '../data/raw_scraped_data.json'
raw_df = pd.read_json(path)
raw_df

Unnamed: 0,noun,gender,lang
0,a,masculine,Polish
1,Aalborg,masculine,Polish
2,aalen,masculine,Polish
3,Aarhus,masculine,Polish
4,abacysta,masculine,Polish
...,...,...,...
328184,zurrona,feminine,Spanish
328185,zutana,feminine,Spanish
328186,zwingliana,feminine,Spanish
328187,zwingliana,feminine,Spanish


### 2.2 Initial filter

In [5]:
# remove numbers, hypens, spaces, and periods
no_nums = raw_df[(~raw_df['noun'].str.contains('-| |\.|1|2|3|4|5|6|7|8|9|0'))]
print(no_nums.shape)
# remove full uppercase
no_caps = no_nums[(~no_nums['noun'].str.isupper())]
print(no_caps.shape)


(301257, 3)
(299192, 3)


### 2.3 Further cleaning

Break df into each language to: 
- remove duplicates in each language
- remove nouns that begin with a capital letter (except for German) 

In [6]:
from typing import List, Tuple
from spacy.language import Language
from collections import namedtuple


def df_lang(df: pd.DataFrame)-> List[Tuple[str, pd.DataFrame]]:
    """
    Splits main df by 'lang' column, and creates new sub
    dataFrames

    returns:
        list: namedtuple (lang, df)
    """
    Sub_df = namedtuple('Sub_df', ['lang', 'df'])
    languages = df['lang'].unique()
    dataframes = [df[df['lang'] == lang] for lang in languages]
    return [Sub_df(lang, sub_df) for lang, sub_df in zip(languages, dataframes)]

In [7]:
spacy.load("de_core_news_sm")

# separate df for each language
dfs = df_lang(no_caps)

# remove nouns that begin with captial letters (except German)
temp = []
for sub_df in dfs:
    if sub_df.lang != 'German':
        temp.append(sub_df.df[~sub_df.df['noun'].str.istitle()])
    else: # otherwise, for German, pass into SpaCy and filter out Proper nouns
        words = pd.Series(sub_df.df['noun']).tolist()
        nlp = spacy.load("de_core_news_sm")
        text = " ".join(words) 
        nlp.max_length = len(text) 
        doc = nlp(text) 
        tokens = [token.text for token in doc if token.pos_ != 'PROPN']
        temp.append(sub_df.df[sub_df.df['noun'].isin(tokens)])
clean = pd.concat(temp)

In [8]:
clean

Unnamed: 0,noun,gender,lang
0,a,masculine,Polish
2,aalen,masculine,Polish
4,abacysta,masculine,Polish
5,abak,masculine,Polish
7,abakawir,masculine,Polish
...,...,...,...
328184,zurrona,feminine,Spanish
328185,zutana,feminine,Spanish
328186,zwingliana,feminine,Spanish
328187,zwingliana,feminine,Spanish


## 3 Transform and Encode data
#### 3.1 Create function to add # to all nouns
Make all nouns the same length, this is done by preprending '#' to nouns until they are a uniform length


In [9]:
# get length of longest noun
max_length = clean['noun'].str.len().max() # 38

def add_filler(word):
    """ preprends n amount #'s to a word 
    based on a max_length"""
    if len(word) < max_length: # if word len() is less than the max len()
        diff = max_length - len(word) # we subtract the current word len() by the max len(): diff
        return '#' * diff + word # we prepend n(diff) amount of '#'s to the word and then return it
    return word # if len() of word is NOT less then the max len, then just return it

# apply function to every value in column 'noun'
clean['noun'] = clean['noun'].apply(add_filler)
print(clean.head(5))

                                     noun     gender    lang
0  #####################################a  masculine  Polish
2  #################################aalen  masculine  Polish
4  ##############################abacysta  masculine  Polish
5  ##################################abak  masculine  Polish
7  ##############################abakawir  masculine  Polish


#### 3.2 
get a distribution of all languages and genders and get the lowest count

In [10]:
grouped = clean.groupby(['gender','lang']).size().unstack()
lowest_value = int(grouped.min().min())
print(distribution(clean))
print('lowest_value = ', lowest_value)


lang       French  German  Polish   Spanish
gender                                     
feminine   3801.0  3624.0  2555.0  116375.0
masculine  3911.0  2485.0  2681.0  119263.0
neuter        NaN  2392.0  4883.0       NaN
lowest_value =  2392


### 3.3 Reduce all language sub dfs and genders to a uniform amount (lowest_value)

In [11]:
reduced_df = clean.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
print(distribution(reduced_df))

lang       French  German  Polish  Spanish
gender                                    
feminine   2392.0  2392.0  2392.0   2392.0
masculine  2392.0  2392.0  2392.0   2392.0
neuter        NaN  2392.0  2392.0      NaN


  reduced_df = clean.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value


#### 3.3 Define an encoding function
aim is to grab n amount of letters in a noun (starting from the end), and passing it into a one-hot encoder that will represent said letter(s) as a vector.
1. grab n amount of letters from each noun (this will be encoded)
2. intialize an encoder
3. encode n amount of letters
4. convert results into a dataframe (transformed df)
5. re index reduced df
6. return new dataframe which is made of : reduced df and transformed df

In [12]:
def encode(reduced_df, n=0):
    to_be_encoded = reduced_df['noun'].str[-n:] # grab n amount of letters start from the end to encode only
    ohe = OneHotEncoder(sparse_output=False) # initialize the encoder
    transformed = ohe.fit_transform(to_be_encoded.to_numpy().reshape(-1, 1)) # encode
    transformed_df = pd.DataFrame(transformed) # convert to a dataframe
    reduced_df.reset_index(inplace=True, drop=True) # reset indexes
    return pd.concat([reduced_df, transformed_df], axis=1) # create new dataframe of reduced df and transformed df

## 4 Train, test, fit, score

#### 4.1 create function to train,fit, test all languages with n amount of encoding



In [13]:
def multi_train_per_lang(df):
    results = defaultdict(lambda :defaultdict(list)) # to store scores from ML Models. dict->dict->list
    max_length = df['noun'].str.len().max() # get longest noun in whole dataset
    dfs = df_lang(df) # break df into smalled dfs based on language: spanish_df, french_df etc
    for sub_df in dfs: # for each language df
        for n in range(1, max_length + 1): # 
            encoded_df = encode(sub_df.df, n) # encode n amount of letters for the ith langauge df
            X, y = get_X_y(encoded_df) # X is the vector representationf for n amount of letters, y are the labels
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

            # now that we have a train test split, we can plug them into our ML models
            # KNN
            knn = KNeighborsClassifier(n_neighbors=3) # initialize a KNN class, 3 neighbors
            knn.fit(X_train, y_train) # train it 
            results['KNN'][n].append((knn.score(X_test, y_test), sub_df.lang)) # append score into results dict, along with name of sub df (French, German, etc)

            # Perceptron
            p = Perceptron(random_state=42) # initialize a Perceptron class, random state 42
            p.fit(X_train, y_train) # train it
            results['Perceptron'][n].append((p.score(X_test, y_test), sub_df.lang)) # append score to results dict, along with name of sub df (French, German, etc)

    return results # return s scores of l sub dfs in df, of n amount of letters encode, for c amount of ML models. Ex: results[c][n]: (s,l)

## 4.2 Experiment 1: train and test each language individually on n amount of letters encoded 
## takes ~5 mins to run!!

In [14]:
all_langs = multi_train_per_lang(reduced_df)
all_langs

defaultdict(<function __main__.multi_train_per_lang.<locals>.<lambda>()>,
            {'KNN': defaultdict(list,
                         {1: [(0.5015673981191222, 'French'),
                           (0.6629526462395543, 'German'),
                           (0.9408077994428969, 'Polish'),
                           (0.8045977011494253, 'Spanish')],
                          2: [(0.7628004179728317, 'French'),
                           (0.7305013927576601, 'German'),
                           (0.9554317548746518, 'Polish'),
                           (0.8505747126436781, 'Spanish')],
                          3: [(0.7795193312434692, 'French'),
                           (0.7834261838440112, 'German'),
                           (0.9540389972144847, 'Polish'),
                           (0.7931034482758621, 'Spanish')],
                          4: [(0.8254963427377221, 'French'),
                           (0.782033426183844, 'German'),
                           (0.828690807799442

### 4.3 Show results for KNN

In [15]:
knn_results = all_langs['KNN']
knn_df = pd.DataFrame.from_dict(knn_results, orient="index", columns=['Polish', 'German', 'French', 'Spanish'])
knn_iterative_df = knn_df.applymap(lambda x: x[0]) # grab first element in the tuple (score, langauage)
knn_iterative_df.to_csv('../data/knn_per_language_results.csv') # save to csv
knn_iterative_df # display dataframe

Unnamed: 0,Polish,German,French,Spanish
1,0.501567,0.662953,0.940808,0.804598
2,0.7628,0.730501,0.955432,0.850575
3,0.779519,0.783426,0.954039,0.793103
4,0.825496,0.782033,0.828691,0.677116
5,0.761755,0.669916,0.755571,0.77116
6,0.629049,0.688022,0.45195,0.525601
7,0.532915,0.518106,0.395543,0.648903
8,0.559039,0.412953,0.478412,0.643678
9,0.503657,0.421309,0.537604,0.576803
10,0.508882,0.499304,0.474234,0.543365


### 4.4 Show results for Perceptron

In [16]:
perceptron_results = all_langs['Perceptron']
per_df = pd.DataFrame.from_dict(perceptron_results, orient="index", columns=['Polish', 'German', 'French', 'Spanish'])
per_iterative_df = per_df.applymap(lambda x: x[0]) # grab first element in the tuple (score, langauage)
per_iterative_df.to_csv('../data/perceptron_per_language_results.csv') # save as csv
per_iterative_df # display df


Unnamed: 0,Polish,German,French,Spanish
1,0.706374,0.513928,0.938719,0.811912
2,0.803553,0.738162,0.956128,0.84117
3,0.838036,0.817549,0.956128,0.888192
4,0.683386,0.791086,0.89624,0.835946
5,0.756531,0.727716,0.745822,0.669801
6,0.573668,0.665042,0.662953,0.601881
7,0.634274,0.584262,0.62883,0.61442
8,0.555904,0.526462,0.572423,0.585162
9,0.527691,0.498607,0.537604,0.577847
10,0.516196,0.45961,0.506267,0.596656


## 4.5 Expermiment 2: train on one lang, and test on a different langauage (shuffle)

In [17]:
def multi_train_per_lang_shuffle(df, max_length):
    results = defaultdict(lambda :defaultdict(lambda: defaultdict()))
    training_data = defaultdict()
    testing_data = defaultdict()
    trans_df = encode(df, max_length) # encode
    dfs = df_lang(trans_df)
    for sub_df in dfs: # for each language
        X, y = get_X_y(sub_df.df) # get X and y
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) # split
        training_data[sub_df.lang] = (X_train, y_train) # add the x and y train for each language into a dict
        testing_data[sub_df.lang] = (X_test, y_test) # add the x and y testing for each language into a dict
        
    for i in training_data: # for every language training
        for j in testing_data: # for every language testing
       
            X_train__, y_train__ = training_data[i]
            X_test__, y_test__ = testing_data[j]


            knn = KNeighborsClassifier(n_neighbors=3) # initialize a KNN
            knn.fit(X_train__, y_train__) # fit it with training
            results['KNN'][i][j] = knn.score(X_test__, y_test__) # test it
 
            p = Perceptron(random_state=42) # initialize a Perceptron
            p.fit(X_train__, y_train__) # fit it with training
            results['Perceptron'][i][j] = p.score(X_test__, y_test__) # test it


    return results

### 4.6 Run experiment on only 4 letters encoded

In [18]:
shuffled = multi_train_per_lang_shuffle(reduced_df, 4)
shuffled


defaultdict(<function __main__.multi_train_per_lang_shuffle.<locals>.<lambda>()>,
            {'KNN': defaultdict(<function __main__.multi_train_per_lang_shuffle.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'French': defaultdict(None,
                                      {'French': 0.778010033444816,
                                       'German': 0.358974358974359,
                                       'Polish': 0.3333333333333333,
                                       'Spanish': 0.5062709030100334}),
                          'German': defaultdict(None,
                                      {'French': 0.20986622073578595,
                                       'German': 0.7787068004459309,
                                       'Polish': 0.241917502787068,
                                       'Spanish': 0.016304347826086956}),
                          'Polish': defaultdict(None,
                                      {'French': 0.5083612040133779,
         

### 4.7 show results of KNN

In [19]:
knn_shuffle_res = shuffled['KNN']
knn_shuffle_df = pd.DataFrame(knn_shuffle_res)
knn_shuffle_df.to_csv('../data/knn_res_lang_shuffle_4_encoding.csv') # save as csv
knn_shuffle_df

Unnamed: 0,French,German,Polish,Spanish
French,0.77801,0.209866,0.508361,0.510033
German,0.358974,0.778707,0.36427,0.345039
Polish,0.333333,0.241918,0.718785,0.32971
Spanish,0.506271,0.016304,0.48286,0.706522


### 4.8 Show results of Perceptron

In [20]:
percp_shuffle_res = shuffled['Perceptron']
percep_shuffle_df = pd.DataFrame(percp_shuffle_res)
percep_shuffle_df.to_csv('../data/percp_res_lang_shuffle_4_encoding.csv') # save as csv
percep_shuffle_df

Unnamed: 0,French,German,Polish,Spanish
French,0.80393,0.219482,0.499582,0.511288
German,0.361761,0.784002,0.327202,0.35563
Polish,0.334169,0.240524,0.86204,0.332776
Spanish,0.501254,0.015886,0.552258,0.714883


## 4.9 Experiment 3: Train and test on WHOLE dataset as one

In [21]:
encoded_df = encode(reduced_df, 4) #  4 letters encoded
X, y = get_X_y(encoded_df) # X is the vector representationf for n amount of letters, y are the labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# KNN
knn = KNeighborsClassifier(n_neighbors=3) # initialize a KNN class, 3 neighbors
knn.fit(X_train, y_train) # train it 
knn_score = knn.score(X_test, y_test) # get score

# Perceptron
p = Perceptron(random_state=42) # initialize a Perceptron class, random state 42
p.fit(X_train, y_train) # train it
p_score = p.score(X_test, y_test) # get score
