In [2]:
import pandas as pd

In [116]:
df = pd.read_csv('data/gender_dataset.csv')
df['gender'] = df['gender'].str[:1]
df.head()

Unnamed: 0,surname,name,patronymic,gender
0,Репкина,Ирина,Александровна,Ж
1,Белоусов,Михаил,Александрович,М
2,Талышкин,Роман,Александрович,М
3,Ванюжина,Алевтина,Михайловна,Ж
4,Узун,Пётр,Дмитриевич,М


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 634362 entries, 0 to 634361
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   surname     634361 non-null  object
 1   name        634361 non-null  object
 2   patronymic  614221 non-null  object
 3   gender      634342 non-null  object
dtypes: object(4)
memory usage: 19.4+ MB


In [117]:
names_df = df[df[['name', 'gender']].duplicated(keep='first')].reset_index()[['name', 'gender']]
surnames_df = df[df[['surname', 'gender']].duplicated(keep='first')].reset_index()[['surname', 'gender']]
patronymics_df = df[df[['patronymic', 'gender']].duplicated(keep='first')].reset_index()[['patronymic', 'gender']]

In [34]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")


In [118]:
def preprocess(df, column):
    df[column] = df[column].str.lower()
    df[column] = df[column].str.replace('ё', 'е')

preprocess(names_df, 'name')
preprocess(surnames_df, 'surname')
preprocess(patronymics_df, 'patronymic')

In [119]:
surnames_df.loc[~(
    surnames_df.surname.str.endswith('ов') | 
    surnames_df.surname.str.endswith('ова') |
    surnames_df.surname.str.endswith('ев') | 
    surnames_df.surname.str.endswith('ева') |
    surnames_df.surname.str.endswith('ий') |
    surnames_df.surname.str.endswith('ая') |
    surnames_df.surname.str.endswith('ин') |
    surnames_df.surname.str.endswith('ина') |
    surnames_df.surname.str.endswith('ын') |
    surnames_df.surname.str.endswith('ына') |
    surnames_df.surname.str.contains('кызы') |
    surnames_df.surname.str.contains('кизи') |
    surnames_df.surname.str.contains('оглы') |
    surnames_df.surname.str.contains('оглы') |
    surnames_df.surname.str.contains('уулу') |
    surnames_df.surname.str.contains('оглу')
), ['gender']] = 'Н'

In [120]:
patronymics_df = patronymics_df[~patronymics_df.patronymic.isnull()]

In [32]:
def train(X, y, param_grids):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)    
    
    for model_name, params in param_grids.items():
        start_ts = datetime.now()
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(params['model'], params['param_grid'], cv=5, n_jobs=-3)
        grid_search.fit(X_train, y_train)
        print(f"Train took {datetime.now() - start_ts}")
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"Best Hyperparameters for {model_name}: {best_params}")
        
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy for {model_name}: {accuracy}\n")
    

## Names

In [41]:
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
tfidf_vectorizer = TfidfVectorizer()
hashing_vectorizer = HashingVectorizer(n_features=1000)

In [36]:
X = count_vectorizer.fit_transform(names_df['name'])
y = names_df['gender']

param_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 9.0, 9.5, 10.0, 10.5, 11.0, 15.0, 100.0], 'penalty': ['l1', 'l2']}
    },
#    'SVM': {
#        'model': SVC(),
#        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0], 'kernel': ['linear', 'rbf']}
#    }
}

train(X, y, param_grids)

Training Logistic Regression...
Train took 0:00:46.607270
Best Hyperparameters for Logistic Regression: {'C': 10.5, 'penalty': 'l2'}
Accuracy for Logistic Regression: 0.9941009735858357



In [45]:
X = tfidf_vectorizer.fit_transform(names_df['name'])
y = names_df['gender']

param_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 15.0, 90.0, 100.0, 110.0, 150.0], 'penalty': ['l1', 'l2']}
    },
    #'SVM': {
    #    'model': SVC(),
    #    'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0], 'kernel': ['linear', 'rbf']}
    #}
}

train(X, y, param_grids)

Training Logistic Regression...
Train took 0:00:37.405379
Best Hyperparameters for Logistic Regression: {'C': 100.0, 'penalty': 'l2'}
Accuracy for Logistic Regression: 0.9851456270796533



In [47]:
X = hashing_vectorizer.fit_transform(names_df['name'])
y = names_df['gender']

param_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'param_grid': {'C': [0.01, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 5.0, 10.0, 15.0, 100.0], 'penalty': ['l1', 'l2']}
    }
}

train(X, y, param_grids)

Training Logistic Regression...
Train took 0:00:53.661979
Best Hyperparameters for Logistic Regression: {'C': 0.15, 'penalty': 'l2'}
Accuracy for Logistic Regression: 0.9046543154089471



## Surname

In [121]:
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
tfidf_vectorizer = TfidfVectorizer()
hashing_vectorizer = HashingVectorizer(n_features=1000)

In [122]:
X = count_vectorizer.fit_transform(surnames_df['surname'])
y = surnames_df['gender']

param_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'param_grid': {'C': [0.01, 0.1, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.75, 1.0], 'penalty': ['l1', 'l2']}
    },
#    'SVM': {
#        'model': SVC(),
#        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0], 'kernel': ['linear', 'rbf']}
#    }
}

train(X, y, param_grids)

Training Logistic Regression...
Train took 0:00:37.612762
Best Hyperparameters for Logistic Regression: {'C': 0.75, 'penalty': 'l2'}
Accuracy for Logistic Regression: 0.9980303592789671



In [57]:
X = tfidf_vectorizer.fit_transform(surnames_df['surname'])
y = surnames_df['gender']

param_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'param_grid': {'C': [0.01, 0.1, 0.5, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 2.0, 5.0, 10.0, 100.0], 'penalty': ['l1', 'l2']}
    },
#    'SVM': {
#        'model': SVC(),
#        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0], 'kernel': ['linear', 'rbf']}
#    }
}

train(X, y, param_grids)

Training Logistic Regression...
Train took 0:01:01.519053
Best Hyperparameters for Logistic Regression: {'C': 1.1, 'penalty': 'l2'}
Accuracy for Logistic Regression: 0.9413026440622035



In [59]:
X = hashing_vectorizer.fit_transform(surnames_df['surname'])
y = surnames_df['gender']

param_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 4.5, 5.0, 5.5, 6.0, 7.0, 10.0, 100.0], 'penalty': ['l1', 'l2']}
    },
#    'SVM': {
#        'model': SVC(),
#        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0], 'kernel': ['linear', 'rbf']}
#    }
}

train(X, y, param_grids)

Training Logistic Regression...
Train took 0:00:34.576520
Best Hyperparameters for Logistic Regression: {'C': 5.0, 'penalty': 'l2'}
Accuracy for Logistic Regression: 0.668027884337747



## Patronymics

In [60]:
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
tfidf_vectorizer = TfidfVectorizer()
hashing_vectorizer = HashingVectorizer(n_features=1000)

In [68]:
X = count_vectorizer.fit_transform(patronymics_df['patronymic'])
y = patronymics_df['gender']

param_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0], 'penalty': ['l1', 'l2']}
    },
#    'SVM': {
#        'model': SVC(),
#        'param_grid': {'C': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0], 'kernel': ['linear', 'rbf']}
#    }
}

train(X, y, param_grids)

Training Logistic Regression...
Train took 0:00:29.149031
Best Hyperparameters for Logistic Regression: {'C': 0.1, 'penalty': 'l2'}
Accuracy for Logistic Regression: 0.9990505226480836



tfidf и hashing уже бессмысленно проверять :)

## Saving Best Models

In [71]:
import joblib

### Names

In [69]:
names_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
X_names = names_vectorizer.fit_transform(names_df['name'])
y_names = names_df['gender']
predictor_names = LogisticRegression(C=10.5, penalty='l2')
predictor_names.fit(X_names, y_names)

In [70]:
names = [
    'Елена','Шазодаон','Вера','Зиедабону','Дарья','Латофат',
    'Маржона','Гули','Александр','Наталья','Алевтина','Любовь',
    'Анастасия','Шахноза','Василиса'
]

names_transformed = names_vectorizer.transform(names)

for name, gender in zip(names, predictor_names.predict(names_transformed)):
    print(name, gender)


Елена Ж
Шазодаон Ж
Вера Ж
Зиедабону Ж
Дарья Ж
Латофат Ж
Маржона Ж
Гули Ж
Александр М
Наталья Ж
Алевтина Ж
Любовь Ж
Анастасия Ж
Шахноза Ж
Василиса Ж


In [73]:
joblib.dump(predictor_names, 'models/predictor_names.pkl')
joblib.dump(names_vectorizer, 'models/names_vectorizer.pkl')

['models/names_vectorizer.pkl']

### Surnames

### Surnames

In [123]:
surnames_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
X_surnames = surnames_vectorizer.fit_transform(surnames_df['surname'])
y_surnames = surnames_df['gender']
predictor_surnames = LogisticRegression(C=0.75, penalty='l2')
predictor_surnames.fit(X_surnames, y_surnames)

In [125]:
surnames = [
    'Иванова','Иванов','Петренко','Алиджанов','Муратов','Алиева',
    'Барулин', 'Батин', 'Бескровный', 'Бескровная', 'Вишар',
    'Гелета', 'Гетьман', 'Дарбинян', 'Кондр', 'Алмазбек Кызы',
    'Талантбек Кизи', 'Чей-то Уулу', 'Чей-то Оглы'
]

surnames_transformed = surnames_vectorizer.transform(surnames)

for surname, gender in zip(surnames, predictor_surnames.predict_proba(surnames_transformed)):
    print(surname, gender)

Иванова [9.98587258e-01 1.34452790e-03 6.82136185e-05]
Иванов [1.1617905e-05 9.9993503e-01 5.3351804e-05]
Петренко [3.97670606e-06 3.27182134e-06 9.99992751e-01]
Алиджанов [2.73881222e-06 9.99955064e-01 4.21967718e-05]
Муратов [5.36906141e-07 9.99991696e-01 7.76709137e-06]
Алиева [9.58008473e-01 4.18606899e-02 1.30837267e-04]
Барулин [2.76704731e-04 9.96872588e-01 2.85070773e-03]
Батин [4.40399121e-04 9.97573800e-01 1.98580038e-03]
Бескровный [7.98806926e-06 3.49266865e-03 9.96499343e-01]
Бескровная [9.94336896e-01 5.62435084e-03 3.87536303e-05]
Вишар [2.68948893e-05 2.88565769e-04 9.99684539e-01]
Гелета [8.14668258e-04 1.08998108e-03 9.98095351e-01]
Гетьман [7.43064002e-05 2.20535254e-03 9.97720341e-01]
Дарбинян [2.29764803e-06 3.47365686e-02 9.65261134e-01]
Кондр [1.24220996e-04 9.74336054e-04 9.98901443e-01]
Алмазбек Кызы [0.88713732 0.11069057 0.00217211]
Талантбек Кизи [0.05862149 0.26282346 0.67855506]
Чей-то Уулу [4.49280814e-05 9.88941598e-01 1.10134739e-02]
Чей-то Оглы [0.0034

In [133]:
joblib.dump(predictor_surnames, 'models/predictor_surnames.pkl')
joblib.dump(surnames_vectorizer, 'models/surnames_vectorizer.pkl')

['models/surnames_vectorizer.pkl']

### Patronymic

In [131]:
patronymics_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
X_patronymics = patronymics_vectorizer.fit_transform(patronymics_df['patronymic'])
y_patronymics = patronymics_df['gender']
predictor_patronymics = LogisticRegression(C=0.1, penalty='l2')
predictor_patronymics.fit(X_patronymics, y_patronymics)

In [132]:
patronymics = [
    'Михайловна','Несимович','Курбанович','Михайловна','Хажимуратович','Жафяровна',
    'Саяд кызы', 'Сурени', 'Хоакин', 'Маис Кызы', 'Махомотшакир',
    'Элхан Оглы', 'Тахир Кызы'
]

patronymics_transformed = patronymics_vectorizer.transform(patronymics)

for patronymic, gender in zip(patronymics, predictor_patronymics.predict_proba(patronymics_transformed)):
    print(patronymic, gender)

Михайловна [0.99885082 0.00114918]
Несимович [1.79582719e-04 9.99820417e-01]
Курбанович [9.09503952e-05 9.99909050e-01]
Михайловна [0.99885082 0.00114918]
Хажимуратович [6.46778559e-05 9.99935322e-01]
Жафяровна [9.99308781e-01 6.91218746e-04]
Саяд кызы [0.98465894 0.01534106]
Сурени [0.22053893 0.77946107]
Хоакин [0.25065265 0.74934735]
Маис Кызы [0.98001423 0.01998577]
Махомотшакир [0.09706086 0.90293914]
Элхан Оглы [0.00633219 0.99366781]
Тахир Кызы [0.97958744 0.02041256]


In [134]:
joblib.dump(predictor_patronymics, 'models/predictor_patronymics.pkl')
joblib.dump(patronymics_vectorizer, 'models/patronymics_vectorizer.pkl')

['models/patronymics_vectorizer.pkl']

## Interfaces

In [175]:
from typing import Tuple, Dict

def predict_gender_by_full_name(surname: str = None, name: str = None, patronymic: str = None) -> Tuple[str, Dict]:
    if not surname and not name and not patronymic:
        raise ValueError('All parameters are none')
    probabilities = {'М': [], 'Ж': []}
    if surname:
        surname_cleaned = surname.lower().replace('ё', 'е')
        surname_vector = surnames_vectorizer.transform([surname_cleaned])
        surname_probabilities = predictor_surnames.predict_proba(surname_vector)[0]
        surname_predictions = {class_: proba for class_, proba in zip(predictor_surnames.classes_, surname_probabilities)}
        if max(surname_predictions, key=surname_predictions.get) != 'Н':
            probabilities['М'].append(surname_predictions['М'])
            probabilities['Ж'].append(surname_predictions['Ж'])
    if name:
        name_cleaned = name.lower().replace('ё', 'е')
        name_vector = names_vectorizer.transform([name_cleaned])
        name_probabilities = predictor_names.predict_proba(name_vector)[0]
        name_predictions = {class_: proba for class_, proba in zip(predictor_names.classes_, name_probabilities)}
        probabilities['М'].append(name_predictions['М'])
        probabilities['Ж'].append(name_predictions['Ж'])
    if patronymic:
        patronymic_cleaned = patronymic.lower().replace('ё', 'е')
        patronymic_vector = patronymics_vectorizer.transform([patronymic_cleaned])
        patronymic_probabilities = predictor_patronymics.predict_proba(patronymic_vector)[0]
        patronymic_predictions = {class_: proba for class_, proba in zip(predictor_patronymics.classes_, patronymic_probabilities)}
        probabilities['М'].append(patronymic_predictions['М'])
        probabilities['Ж'].append(patronymic_predictions['Ж'])
    predictions = {gender: sum(probability)/len(probability) for gender, probability in probabilities.items()}
    return max(predictions, key=predictions.get), predictions

In [236]:
predict_gender_by_full_name('Иванов', 'Светлана', 'Игоревна')

('Ж', {'М': 0.3341178438863414, 'Ж': 0.665864372178992})

In [233]:
predict_gender_by_full_name('Серых', 'Алевтина', 'Валерьевна')

('Ж', {'М': 0.0010846766672457588, 'Ж': 0.9989153233327542})

In [234]:
predict_gender_by_full_name('Петренко', 'Александра')

('Ж', {'М': 0.002936363525910017, 'Ж': 0.99706363647409})

In [235]:
predict_gender_by_full_name('Петренко', 'Александр')

('М', {'М': 0.999633109123903, 'Ж': 0.00036689087609698134})

In [224]:
def predict_gender_for_dataframe(
    df: pd.DataFrame, 
    surname_label: str = 'surname', 
    name_label: str = 'name', 
    patronymic_label: str = 'patronymic', 
    gender_label = 'gender',
    inplace: bool = True
) -> pd.DataFrame:
    
    if surname_label in df.columns:
        surnames = df.loc[~df[surname_label].isnull(), surname_label].str.lower().str.replace('ё', 'е')
        surname_predictions = pd.DataFrame(
            predictor_surnames.predict_proba(surnames_vectorizer.transform(surnames)), 
            columns=predictor_surnames.classes_,
            index=surnames.index
        )
        surname_predictions = surname_predictions.loc[surname_predictions['Н'] != surname_predictions.max(axis=1)]
    else:
        surname_predictions = pd.DataFrame([], columns=predictor_surnames.classes_)
    if name_label in df.columns:
        names = df.loc[~df[name_label].isnull(), name_label].str.lower().str.replace('ё', 'е')
        name_predictions = pd.DataFrame(
            predictor_names.predict_proba(names_vectorizer.transform(names)), 
            columns=predictor_names.classes_,
            index=names.index
        )
    else:
        name_predictions = pd.DataFrame([], columns=predictor_names.classes_)
    if patronymic_label in df.columns:
        patronymics = df.loc[~df[patronymic_label].isnull(), patronymic_label].str.lower().str.replace('ё', 'е')
        patronymic_predictions = pd.DataFrame(
            predictor_patronymics.predict_proba(patronymics_vectorizer.transform(patronymics)), 
            columns=predictor_patronymics.classes_,
            index=patronymics.index
        )
    else:
        patronymic_predictions = pd.DataFrame([], columns=predictor_patronymics.classes_)
    predictions = surname_predictions.join(name_predictions, how='outer', rsuffix='_name').join(patronymic_predictions, how='outer', lsuffix='_surname', rsuffix='_patronymic')
    predictions['М'] = predictions[['М_surname', 'М_name', 'М_patronymic']].mean(axis=1)
    predictions['Ж'] = predictions[['Ж_surname', 'Ж_name', 'Ж_patronymic']].mean(axis=1)
    if inplace:
        df[gender_label] = pd.Series(dtype=str)
        df.loc[predictions[predictions['М'] >= predictions['Ж']].index, gender_label] = 'М'
        df.loc[predictions[predictions['М'] < predictions['Ж']].index, gender_label] = 'Ж'
        return None
    else:
        new_df = df.copy()
        new_df[gender_label] = pd.Series(dtype=str)
        new_df.loc[predictions[predictions['М'] >= predictions['Ж']].index, gender_label] = 'М'
        new_df.loc[predictions[predictions['М'] < predictions['Ж']].index, gender_label] = 'Ж'
        return new_df

In [222]:
predict_gender_for_dataframe(df, gender_label='predicted_gender', inplace=False)

Unnamed: 0,surname,name,patronymic,gender,predicted_gender
0,Репкина,Ирина,Александровна,Ж,Ж
1,Белоусов,Михаил,Александрович,М,М
2,Талышкин,Роман,Александрович,М,М
3,Ванюжина,Алевтина,Михайловна,Ж,Ж
4,Узун,Пётр,Дмитриевич,М,М
...,...,...,...,...,...
634357,Мамедова,Наталья,Ивановна,Ж,Ж
634358,Каимова,Айсулу,,Ж,Ж
634359,Гиренко,Анна,Петровна,Ж,Ж
634360,Козакова,Екатерина,Петровна,Ж,Ж


In [225]:
predict_gender_for_dataframe(df, gender_label='predicted_gender')

In [226]:
df.head()

Unnamed: 0,surname,name,patronymic,gender,predicted_gender
0,Репкина,Ирина,Александровна,Ж,Ж
1,Белоусов,Михаил,Александрович,М,М
2,Талышкин,Роман,Александрович,М,М
3,Ванюжина,Алевтина,Михайловна,Ж,Ж
4,Узун,Пётр,Дмитриевич,М,М


In [231]:
from numpy import round
print('Final accuracy: ', round(sum(df.gender == df.predicted_gender) / len(df) * 100, 2), '%')

Final accuracy:  99.57 %
