In [1]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import sys
sys.path.insert(0, '../src/')
import pandas as pd
import spacy
from collections import defaultdict
# from data_cleaning import split_df, df_lang
from data_transformation import get_X_y, distribution
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import OneHotEncoder


## 1. Data Collection

We begin by scrapping Wiktionary.org for *feminine*, *masculine*, and *neuter* nouns in *Polish*, *German*, *Spanish*, and *French*.


In [3]:
# run webscrapper

## 2. Data Cleaning

The raw data in json format must be cleaned: removing nouns with *spaces*, *hyphens*, *numbers*, *abbreviations*, *initials* and finally those that are *proper nouns*.

### 2.1 Read data

In [4]:
# read json file and load it as a DataFrame
path = '../data/raw_scraped_data.json'
raw_df = pd.read_json(path)
raw_df

Unnamed: 0,noun,gender,lang
0,a,masculine,Polish
1,Aalborg,masculine,Polish
2,aalen,masculine,Polish
3,Aarhus,masculine,Polish
4,abacysta,masculine,Polish
...,...,...,...
328184,zurrona,feminine,Spanish
328185,zutana,feminine,Spanish
328186,zwingliana,feminine,Spanish
328187,zwingliana,feminine,Spanish


### 2.2 Initial filter

In [5]:
# remove numbers, hypens, spaces, and periods
no_nums = raw_df[(~raw_df['noun'].str.contains('-| |\.|1|2|3|4|5|6|7|8|9|0'))]
print(no_nums.shape)
# remove full uppercase
no_caps = no_nums[(~no_nums['noun'].str.isupper())]
print(no_caps.shape)


(301257, 3)
(299192, 3)


### 2.3 Further cleaning

Break df into each language to: 
- remove duplicates in each language
- remove nouns that begin with a capital letter (except for German) 

In [6]:
from typing import List, Tuple
from spacy.language import Language
from collections import namedtuple


def df_lang(df: pd.DataFrame)-> List[Tuple[str, pd.DataFrame]]:
    """
    Splits main df by 'lang' column, and creates new sub
    dataFrames

    returns:
        list: namedtuple (lang, df)
    """
    Sub_df = namedtuple('Sub_df', ['lang', 'df'])
    languages = df['lang'].unique()
    dataframes = [df[df['lang'] == lang] for lang in languages]
    return [Sub_df(lang, sub_df) for lang, sub_df in zip(languages, dataframes)]

In [7]:
spacy.load("de_core_news_sm")

# separate df for each language
dfs = df_lang(no_caps)

# remove nouns that begin with captial letters (except German)
temp = []
for sub_df in dfs:
    if sub_df.lang != 'German':
        temp.append(sub_df.df[~sub_df.df['noun'].str.istitle()])
    else: # otherwise, for German, pass into SpaCy and filter out Proper nouns
        words = pd.Series(sub_df.df['noun']).tolist()
        nlp = spacy.load("de_core_news_sm")
        text = " ".join(words) 
        nlp.max_length = len(text) 
        doc = nlp(text) 
        tokens = [token.text for token in doc if token.pos_ != 'PROPN']
        temp.append(sub_df.df[sub_df.df['noun'].isin(tokens)])
clean = pd.concat(temp)

In [8]:
clean

Unnamed: 0,noun,gender,lang
0,a,masculine,Polish
2,aalen,masculine,Polish
4,abacysta,masculine,Polish
5,abak,masculine,Polish
7,abakawir,masculine,Polish
...,...,...,...
328184,zurrona,feminine,Spanish
328185,zutana,feminine,Spanish
328186,zwingliana,feminine,Spanish
328187,zwingliana,feminine,Spanish


## 3 Transform and Encode data
#### 3.1 Create function to add # to all nouns
Make all nouns the same length, this is done by preprending '#' to nouns until they are a uniform length


In [9]:
# get length of longest noun
max_length = clean['noun'].str.len().max() # 38

def add_filler(word):
    """ preprends n amount #'s to a word 
    based on a max_length"""
    if len(word) < max_length: # if word len() is less than the max len()
        diff = max_length - len(word) # we subtract the current word len() by the max len(): diff
        return '#' * diff + word # we prepend n(diff) amount of '#'s to the word and then return it
    return word # if len() of word is NOT less then the max len, then just return it

# apply function to every value in column 'noun'
clean['noun'] = clean['noun'].apply(add_filler)
print(clean.head(5))

                                     noun     gender    lang
0  #####################################a  masculine  Polish
2  #################################aalen  masculine  Polish
4  ##############################abacysta  masculine  Polish
5  ##################################abak  masculine  Polish
7  ##############################abakawir  masculine  Polish


#### 3.2 
get a distribution of all languages and genders and get the lowest count

In [10]:
grouped = clean.groupby(['gender','lang']).size().unstack()
lowest_value = int(grouped.min().min())
print(distribution(clean))
print('lowest_value = ', lowest_value)


lang       French  German  Polish   Spanish
gender                                     
feminine   3801.0  3624.0  2555.0  116375.0
masculine  3911.0  2485.0  2681.0  119263.0
neuter        NaN  2392.0  4883.0       NaN
lowest_value =  2392


### 3.3 Reduce all language sub dfs and genders to a uniform amount (lowest_value)

In [11]:
reduced_df = clean.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
print(distribution(reduced_df))

lang       French  German  Polish  Spanish
gender                                    
feminine   2392.0  2392.0  2392.0   2392.0
masculine  2392.0  2392.0  2392.0   2392.0
neuter        NaN  2392.0  2392.0      NaN


  reduced_df = clean.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value


#### 3.3 Define an encoding function
aim is to grab n amount of letters in a noun (starting from the end), and passing it into a one-hot encoder that will represent said letter(s) as a vector.
1. grab n amount of letters from each noun (this will be encoded)
2. intialize an encoder
3. encode n amount of letters
4. convert results into a dataframe (transformed df)
5. re index reduced df
6. return new dataframe which is made of : reduced df and transformed df

In [47]:
def encode(reduced_df, n=0):
    to_be_encoded = reduced_df['noun'].str[-n:] # grab n amount of letters start from the end to encode only
    ohe = OneHotEncoder(sparse_output=False) # initialize the encoder
    transformed = ohe.fit_transform(to_be_encoded.to_numpy().reshape(-1, 1)) # encode
    transformed_df = pd.DataFrame(transformed) # convert to a dataframe
    reduced_df.reset_index(inplace=True, drop=True) # reset indexes
    return pd.concat([reduced_df, transformed_df], axis=1) # create new dataframe of reduced df and transformed df

## 4 Train, test, fit, score

#### 4.1 create function to train,fit, test all languages with n amount of encoding



In [13]:
def multi_train_per_lang(df):
    results = defaultdict(lambda :defaultdict(list)) # to store scores from ML Models. dict->dict->list
    max_length = df['noun'].str.len().max() # get longest noun in whole dataset
    dfs = df_lang(df) # break df into smalled dfs based on language: spanish_df, french_df etc
    for sub_df in dfs: # for each language df
        for n in range(1, max_length + 1): # 
            encoded_df = encode(sub_df.df, n) # encode n amount of letters for the ith langauge df
            X, y = get_X_y(encoded_df) # X is the vector representationf for n amount of letters, y are the labels
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

            # now that we have a train test split, we can plug them into our ML models
            # KNN
            knn = KNeighborsClassifier(n_neighbors=3) # initialize a KNN class, 3 neighbors
            knn.fit(X_train, y_train) # train it 
            results['KNN'][n].append((knn.score(X_test, y_test), sub_df.lang)) # append score into results dict, along with name of sub df (French, German, etc)

            # Perceptron
            p = Perceptron(random_state=42) # initialize a Perceptron class, random state 42
            p.fit(X_train, y_train) # train it
            results['Perceptron'][n].append((p.score(X_test, y_test), sub_df.lang)) # append score to results dict, along with name of sub df (French, German, etc)

    return results # return s scores of l sub dfs in df, of n amount of letters encode, for c amount of ML models. Ex: results[c][n]: (s,l)

## 4.2 Experiment 1: train and test each language individually on n amount of letters encoded 
## takes ~5 mins to run!!

In [14]:
# all_langs = multi_train_per_lang(reduced_df)
# all_langs

### 4.3 Show results for KNN

In [15]:
# knn_results = all_langs['KNN']
# knn_df = pd.DataFrame.from_dict(knn_results, orient="index", columns=['Polish', 'German', 'French', 'Spanish'])
# knn_iterative_df = knn_df.applymap(lambda x: x[0]) # grab first element in the tuple (score, langauage)
# knn_iterative_df.to_csv('../data/knn_per_language_results.csv') # save to csv
# knn_iterative_df # display dataframe

### 4.4 Show results for Perceptron

In [16]:
# perceptron_results = all_langs['Perceptron']
# per_df = pd.DataFrame.from_dict(perceptron_results, orient="index", columns=['Polish', 'German', 'French', 'Spanish'])
# per_iterative_df = per_df.applymap(lambda x: x[0]) # grab first element in the tuple (score, langauage)
# per_iterative_df.to_csv('../data/perceptron_per_language_results.csv') # save as csv
# per_iterative_df # display df


## 4.5 Expermiment 2: train on one lang, and test on a different langauage (shuffle)

In [62]:
def multi_train_per_lang_shuffle(df, max_length):
    results = defaultdict(lambda :defaultdict(list))
    dfs = df_lang(df)
    training_data = defaultdict()
    testing_data = defaultdict()
    for sub_df in dfs: # for each language
        trans_df = encode(sub_df.df, max_length) # encode
        X, y = get_X_y(trans_df) # get X and y
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) # split
        training_data[sub_df.lang] = (X_train, y_train) # add the x and y train for each language into a dict
        testing_data[sub_df.lang] = (X_test, y_test) # add the x and y testing for each language into a dict
        
    for train_lang in training_data: # for every language training
        for test_lang in testing_data: # for every language testing
       
            X_train__, y_train__ = training_data[train_lang]
            X_test__, y_test__ = testing_data[test_lang]
            try:
                knn = KNeighborsClassifier(n_neighbors=3) # initialize a KNN
                knn.fit(X_train__, y_train__) # fit it with training
                results['KNN'][train_lang] = (knn.score(X_test__, y_test__), test_lang ) # test it
            except:
                continue

            try:
                p = Perceptron(random_state=42) # initialize a Perceptron
                p.fit(X_train__, y_train__) # fit it with training
                results['Perceptron'][train_lang] = (p.score(X_test__, y_test__), test_lang ) # test it
            except:
                continue

    return results

### 4.6 Run experiment on only 4 letters encoded

In [64]:
shuffled = multi_train_per_lang_shuffle(reduced_df, 4)
shuffled


defaultdict(<function __main__.multi_train_per_lang_shuffle.<locals>.<lambda>()>,
            {'KNN': defaultdict(list,
                         {'French': (0.6864548494983278, 'French'),
                          'German': (0.7644927536231884, 'German'),
                          'Polish': (0.7198996655518395, 'Polish'),
                          'Spanish': (0.6513377926421404, 'Spanish')}),
             'Perceptron': defaultdict(list,
                         {'French': (0.7976588628762542, 'French'),
                          'German': (0.7920847268673356, 'German'),
                          'Polish': (0.8678929765886287, 'Polish'),
                          'Spanish': (0.7190635451505016, 'Spanish')})})

### 4.7 show results of KNN

In [65]:
knn_res = shuffled['KNN']
data = {key: [result[0] if result[1] == key else None for result in knn_res.values()] for key in knn_res.keys()}
knn_res_df = pd.DataFrame(data, index=data.keys())
knn_res_df.to_csv('../data/knn_res_lang_shuffle_4_encoding.csv') # save as csv
knn_res_df


Unnamed: 0,French,German,Polish,Spanish
French,0.686455,,,
German,,0.764493,,
Polish,,,0.7199,
Spanish,,,,0.651338


### 4.8 Show results of Perceptron

In [66]:
per_res = shuffled['Perceptron']
data = {key: [result[0] if result[1] == key else None for result in per_res.values()] for key in per_res.keys()}
per_res_df = pd.DataFrame(data, index=data.keys())
per_res_df.to_csv('../data/perceptron_res_lang_shuffle_4_encoding.csv') # save as csv
per_res_df

Unnamed: 0,French,German,Polish,Spanish
French,0.797659,,,
German,,0.792085,,
Polish,,,0.867893,
Spanish,,,,0.719064


## 4.9 Experiment 3: Train and test on WHOLE dataset as one

In [67]:
encoded_df = encode(reduced_df, 4) #  4 letters encoded
X, y = get_X_y(encoded_df) # X is the vector representationf for n amount of letters, y are the labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# KNN
knn = KNeighborsClassifier(n_neighbors=3) # initialize a KNN class, 3 neighbors
knn.fit(X_train, y_train) # train it 
knn_score = knn.score(X_test, y_test) # get score

# Perceptron
p = Perceptron(random_state=42) # initialize a Perceptron class, random state 42
p.fit(X_train, y_train) # train it
p_score = p.score(X_test, y_test) # get score


### 4.10 result of KNN being trained and tested on whole dataset

In [68]:
knn_score

0.8321488294314381

### 4.11 result of Perceptron being trained and tested on whole dataset

In [69]:
p_score

0.8342391304347826