In [1]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import sys
sys.path.insert(0, '../src/')
import pandas as pd
import spacy
from collections import defaultdict
from data_cleaning import raw_json_to_clean_df, split_df, df_lang
from data_transformation import transform, get_X_y, distribution
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import OneHotEncoder

from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
import numpy as np
import matplotlib.pyplot as plt

## 1. Data Collection

We begin by scrapping Wiktionary.org for *feminine*, *masculine*, and *neuter* nouns in *Polish*, *German*, *Spanish*, and *French*.


In [3]:
# run webscrapper

## 2. Data Cleaning

The raw data in json format must be cleaned: removing nouns with *spaces*, *hyphens*, *numbers*, *abbreviations*, *initials* and finally those that are *proper nouns*.

In [4]:
# read json file and load it as a DataFrame
path = '../data/raw_scraped_data.json'
raw_df = pd.read_json(path)
raw_df

Unnamed: 0,noun,gender,lang
0,a,masculine,Polish
1,Aalborg,masculine,Polish
2,aalen,masculine,Polish
3,Aarhus,masculine,Polish
4,abacysta,masculine,Polish
...,...,...,...
328184,zurrona,feminine,Spanish
328185,zutana,feminine,Spanish
328186,zwingliana,feminine,Spanish
328187,zwingliana,feminine,Spanish


In [5]:
no_nums = raw_df[raw_df['noun'].str.contains('1|2|3|4|5|6|7|8|9|0')]
no_nums

Unnamed: 0,noun,gender,lang
723,AC45,feminine,French
724,AC72,feminine,French
2000,10-Eck,neuter,German
2002,11-Eck,neuter,German
2003,12-Eck,neuter,German
...,...,...,...
235408,impresora 3D,feminine,Spanish
299042,TV3,feminine,Spanish
299237,TV3,feminine,Spanish
299437,TV3,feminine,Spanish


In [6]:
# remove numbers, hypens, spaces, and periods
no_nums = raw_df[(~raw_df['noun'].str.contains('-| |\.|1|2|3|4|5|6|7|8|9|0'))]
print(no_nums.shape)
# remove full uppercase
no_caps = no_nums[(~no_nums['noun'].str.isupper())]
print(no_caps.shape)


(301257, 3)
(299192, 3)


Break df into each language to: 
- remove duplicates in each language
- remove nouns that begin with a capital letter (except for German) 

In [11]:
# separate df for each language
dfs = df_lang(no_caps)

# remove nouns that begin with captial letters (except German)
temp = []
for sub_df in dfs:
    if sub_df.lang != 'German':
        temp.append(sub_df.df[~sub_df.df['noun'].str.istitle()])
    else: # otherwise, for German, pass into SpaCy and filter out Proper nouns
        words = pd.Series(sub_df.df['noun']).tolist()
        text = " ".join(words)
        nlp_de = spacy.load("de_core_news_sm")
        nlp_de.max_length = len(text) 
        doc = nlp_de(text) 
        tokens = [token.text for token in doc if token.pos_ != 'PROPN']
        temp.append(sub_df.df[sub_df.df['noun'].isin(tokens)])
clean = pd.concat(temp)

# remove duplicates
# no_dups = no_title.drop_duplicates(subset=['noun', 'lang'], keep=False)
# print(no_dups.shape)

We know want to pass in our dataframe into SpaCy to filter out proper nouns 
(this should've been acheived on all but German already, but better to be safe than sorry)

In [12]:
clean

Unnamed: 0,noun,gender,lang
0,a,masculine,Polish
2,aalen,masculine,Polish
4,abacysta,masculine,Polish
5,abak,masculine,Polish
7,abakawir,masculine,Polish
...,...,...,...
328184,zurrona,feminine,Spanish
328185,zutana,feminine,Spanish
328186,zwingliana,feminine,Spanish
328187,zwingliana,feminine,Spanish


## 3 Transform and Encode Data
reduce data down to an even amount of examples per language and per gender

encode last 3 letters of each noun

In [13]:
# get length of longest noun
max_length = clean['noun'].str.len().max()

def add_filler(word):
    if len(word) < max_length:
        diff = max_length - len(word)
        return '#' * diff + word
    return word

# apply function to every value in column 'noun'

clean['noun'] = clean['noun'].apply(add_filler)
print(clean.shape)

(261970, 3)


In [16]:
grouped = clean.groupby(['gender','lang']).size().unstack()
lowest_value = int(grouped.min().min())
print('lowest_value = ', lowest_value)
print(distribution(clean))

lowest_value =  2392
lang       French  German  Polish   Spanish
gender                                     
feminine   3801.0  3624.0  2555.0  116375.0
masculine  3911.0  2485.0  2681.0  119263.0
neuter        NaN  2392.0  4883.0       NaN


In [21]:
def trans(dframe, n=0):
    reduced_df = dframe.groupby(['lang', 'gender'])['noun', 'gender', 'lang'].sample(n=lowest_value) # reduce each language and gender by lowest_value
    to_be_encoded = reduced_df['noun'].str[-n:] # grab n amount of letters start from the end to encode only
    ohe = OneHotEncoder(sparse=False) # initialize the encoder
    transformed = ohe.fit_transform(to_be_encoded.to_numpy().reshape(-1, 1)) # encode
    transformed_df = pd.DataFrame(transformed) # convert to a dataframe
    reduced_df.reset_index(inplace=True, drop=True) # reset indexes
    return pd.concat([reduced_df, transformed_df], axis=1) # create new dataframe of reduced df and transformed df

In [22]:
# trans_df = trans(clean, 3) # taking only the last three letters
# trans_df.head(5)

## 4 Train Data

## 4.1 Define X and y

In [None]:
# X, y = get_X_y(trans_df)

## 4.2 Split data into training and testing

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### 4.3 K-NN

In [None]:
# knn = KNeighborsClassifier(n_neighbors=3)
# knn.fit(X_train, y_train)
# knn.score(X_test, y_test)

### 4.4 Perceptron

In [None]:
# p = Perceptron(random_state=42)
# p.fit(X_train, y_train)
# p.score(X_test, y_test)

Running K-NN and Percetron one by one from last letter until the whole word is encoded

In [None]:
# from collections import defaultdict
# def multi_train(df):
#     results = defaultdict(list)
#     max_length = df['noun'].str.len().max()
#     for n in range(1, max_length + 1):
#         trans_df = trans(df, n) # taking only n amount of letters
#         X, y = get_X_y(trans_df)
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


        
#         knn = KNeighborsClassifier(n_neighbors=3)
#         knn.fit(X_train, y_train)
#         results['KNN'].append(knn.score(X_test, y_test))


#         p = Perceptron(random_state=42)
#         p.fit(X_train, y_train)
#         p.score(X_test, y_test)
#         results['Perceptron'].append(p.score(X_test, y_test))

#     return results

In [None]:
# res = multi_train(clean)
# res

Just Spanish, being trained and tested on Spanish

In [28]:
def multi_train_per_lang(df):
    results = defaultdict(lambda :defaultdict(list))
    max_length = df['noun'].str.len().max()
    dfs = df_lang(df)
    for sub_df in dfs:
        for n in range(1, max_length + 1):
            trans_df = trans(sub_df.df, n) # taking only n amount of letters
            X, y = get_X_y(trans_df)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


            
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train, y_train)
            results['KNN'][n].append((knn.score(X_test, y_test), sub_df.lang))


            p = Perceptron(random_state=42)
            p.fit(X_train, y_train)
            results['Perceptron'][n].append((p.score(X_test, y_test), sub_df.lang))

    return results

In [29]:
all_langs = multi_train_per_lang(clean)
all_langs

ValueError: Cannot subset columns with a tuple with more than one element. Use a list instead.

In [23]:
# modified version of the above block
def multi_train_scores_per_lang(df):
    knn_scores = []
    p_scores = []
    mlp_scores = []

    results = defaultdict(lambda :defaultdict(list))
    max_length = df['noun'].str.len().max()
    dfs = df_lang(df)
    
    for sub_df in dfs:
        for n in range(1, max_length + 1):
            trans_df = trans(sub_df.df, n) # taking only n amount of letters
            X, y = get_X_y(trans_df)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

            # KNN
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train, y_train)
            results['KNN'][n].append((knn.score(X_test, y_test), sub_df.lang))
            knn_scores.append(knn.score(X_test, y_test))

            # Perceptron
            p = Perceptron(random_state=42)
            p.fit(X_train, y_train)
            results['Perceptron'][n].append((p.score(X_test, y_test), sub_df.lang))
            p_scores.append(p.score(X_test, y_test))

            # MLP
            mlp = MLPClassifier(hidden_layer_sizes=(1500,),
                        random_state=42,
                        learning_rate_init=0.01)
            mlp.fit(X_train,y_train)
            mlp_scores.append(mlp.score(X_test, y_test))

    return results, knn_scores, p_scores, mlp_scores

In [27]:
(results, knn_scores, p_scores, mlp_scores) = multi_train_scores_per_lang(clean)

ValueError: Cannot subset columns with a tuple with more than one element. Use a list instead.

In [None]:
# idk = pd.DataFrame.from_dict(all_langs, orient="index")
# idk
knn_results = all_langs['KNN']
knn_df = pd.DataFrame.from_dict(knn_results, orient="index", columns=['Polish', 'German', 'French', 'Spanish'])
knn_iterative_df = knn_df.applymap(lambda x: x[0])
knn_iterative_df.to_csv('../data/knn_results.csv')
knn_iterative_df

Unnamed: 0,Polish,German,French,Spanish
1,0.940111,0.658774,0.545455,0.833856
2,0.935933,0.755571,0.772205,0.863114
3,0.852368,0.779248,0.759666,0.809822
4,0.789694,0.755571,0.815047,0.818182
5,0.68663,0.772981,0.770115,0.673981
6,0.754875,0.659471,0.557994,0.609195
7,0.414345,0.598886,0.647858,0.569488
8,0.591226,0.491643,0.547544,0.60815
9,0.471448,0.472841,0.559039,0.551724
10,0.415738,0.369081,0.492163,0.593521


In [None]:
perceptron_results = all_langs['Perceptron']
per_df = pd.DataFrame.from_dict(perceptron_results, orient="index", columns=['Polish', 'German', 'French', 'Spanish'])
per_iterative_df = per_df.applymap(lambda x: x[0])
per_iterative_df.to_csv('../data/perceptron_results.csv')
per_iterative_df


Unnamed: 0,Polish,German,French,Spanish
1,0.942897,0.37117,0.695925,0.757576
2,0.949164,0.781337,0.829676,0.768025
3,0.952646,0.758357,0.84326,0.823406
4,0.905989,0.821727,0.807732,0.819227
5,0.826602,0.727716,0.76698,0.676071
6,0.665738,0.660864,0.597701,0.611285
7,0.612813,0.599582,0.639498,0.654127
8,0.564067,0.531337,0.547544,0.574713
9,0.512535,0.471448,0.562173,0.551724
10,0.523677,0.503482,0.490073,0.578892


In [None]:
def multi_train_per_lang_shuffle(df):
    results = defaultdict(lambda :defaultdict(list))
    # max_length = df['noun'].str.len().max()
    max_length = 3
    dfs = df_lang(df)
    something = defaultdict(lambda: defaultdict(list))
    for sub_df in dfs:
        for n in range(1, max_length + 1):
            trans_df = trans(sub_df.df, n) # taking only n amount of letters
            X, y = get_X_y(trans_df)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            something[sub_df.lang][n] = [X_train, X_test, y_train, y_test]

    # now that we have the X and Y for all iterations, for each language we will fit them

    
            
            # knn = KNeighborsClassifier(n_neighbors=3)
            # knn.fit(X_train, y_train)
            # results['KNN'][n].append((knn.score(X_test, y_test), sub_df.lang))


            # p = Perceptron(random_state=42)
            # p.fit(X_train, y_train)
            # p.score(X_test, y_test)
            # results['Perceptron'][n].append((p.score(X_test, y_test), sub_df.lang))

    return results

## 5 Graphs

In [None]:
# graphs for each language trained and tested on itself
# Plot the accuracies for each dataset
classifiers = ["KNN", "Perceptron", "MLP"]
x = np.arange(len(classifiers))  # the label locations
width = 0.4  # the width of the bars

plt.bar(x, knn_scores, width, color="DarkSlateGray", label="MLP performance")        # plot of KNN performance
plt.bar(x+width, p_scores, width, color="#80b3b3", label="baseline performance")     # plot of Perceptron performance
plt.bar(x+width, mlp_scores, width, color="DarkGreen", label="baseline performance")   # plot of MLP performance
plt.title('Accuracy of MLP classifier on different datasets')
plt.xlabel('Dataset')
plt.ylabel('Accuracy')
plt.xticks(x+width/2, classifiers)
plt.legend()
plt.show()