In [11]:
import os
import sys
sys.path.insert(0, '../src/')
from data_cleaning import raw_json_to_clean_df
from data_transformation import transform, get_X_y, distribution
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron


## 1. Data Collection

We begin by scraping Wiktionary.org for *feminine*, *masculine*, and *neuter* nouns in *Polish*, *German*, *Spanish*, and *French*.


In [None]:
# run the web scraper

## 2. Data Cleaning

The raw data in JSON format must be cleaned by removing nouns containing *spaces*, *hyphens*, or *numbers*, nouns that are *abbreviations* or *initials*, and finally those that are *proper nouns*.

In [12]:
# Read the raw scraped JSON file and load it as a cleaned DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
path = '../data/raw_scraped_data.json'
# raw_json_to_clean_df is imported from the local data_cleaning module;
# presumably it applies the cleaning rules listed in the section text -- TODO confirm.
df = raw_json_to_clean_df(path)

In [13]:
# Preview the first five cleaned rows (head() defaults to n=5).
df.head()

Unnamed: 0,noun,lemma,gender,lang
0,aalen,aalir,masculine,Polish
1,abacysta,abacysta,masculine,Polish
2,abatacept,abatacept,masculine,Polish
3,abatacept,abatacept,masculine,French
4,abatacept,abatacept,masculine,Polish


## 3. Transform and Encode Data

- Reduce the data to an equal number of examples per language and per gender.
- Encode the last 3 letters of each noun.

In [14]:
# Balance the examples per language/gender and encode the noun endings
# (as described in this section's notes), then preview the first five rows;
# head() defaults to n=5.
trans_df = transform(df)
trans_df.head()

  lowest = distribution(df).min()[0]
  reduced_df = df.groupby(['lang', 'gender'])['noun', 'lemma', 'gender', 'lang'].sample(n=lowest)


Unnamed: 0,noun,lemma,gender,lang,0,1,2,3,4,5,...,5848,5849,5850,5851,5852,5853,5854,5855,5856,5857
0,badauderie,Badauderie,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,féculerie,féculeria,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,rando,rar,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,panthéiste,panthéiste,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tabagie,tabagi,feminine,French,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Train Data

### 4.1 Define X and y

In [15]:
# Split the transformed frame into model inputs X and targets y.
# NOTE(review): presumably X holds the encoded columns and y the label to
# predict -- confirm against data_transformation.get_X_y.
X, y = get_X_y(trans_df)

### 4.2 Split data into training and testing

In [16]:
# Hold out 20% of the samples for evaluation. The split is seeded so that
# Restart & Run All reproduces the same partition and therefore the same
# scores; 42 matches the seed already used for the Perceptron in this notebook.
# NOTE: scores recorded from earlier, unseeded runs will not match exactly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### 4.3 K-NN

In [17]:
# Build a 3-nearest-neighbours classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# Mean accuracy on the held-out test split.
knn.score(X_test, y_test)

0.8468354430379746

### 4.4 Perceptron

In [18]:
# Build a Perceptron with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
p = Perceptron(random_state=42).fit(X_train, y_train)
# Mean accuracy on the held-out test split.
p.score(X_test, y_test)

0.8981012658227848