In [1]:
import numpy as np
np.random.seed(37) # Seed NumPy's global RNG so every run draws the same random numbers (the row shuffle later in the notebook relies on this)

In [2]:
# 1. Prepare the dataset. The names come from the `names` corpus bundled with NLTK.
from nltk.corpus import names


def labeled_names(fileid, label):
    """Return a list of (name, label) tuples, one per name in the given corpus file."""
    return [(name, label) for name in names.words(fileid)]


male_names = labeled_names('male.txt', 'male')
female_names = labeled_names('female.txt', 'female')

# Corpus size: 2943 male names and 5001 female names.
for group in (male_names, female_names):
    print(len(group))

# Peek at a few entries from each group.
print(male_names[:5])
print(female_names[:3])

# Stack both lists into a single array: column 0 holds the name (the source of
# the features), column 1 holds the gender label.
all_names = male_names + female_names
dataset = np.array(all_names)
print(dataset.shape)  # (7944, 2), as expected

2943
5001
[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male')]
[('Abagael', 'female'), ('Abagail', 'female'), ('Abbe', 'female')]
(7944, 2)


In [3]:
# Build the feature table.
# It is hard to know in advance how many trailing letters of a name correlate
# best with gender, so the last 1 to 4 letters are each stored as their own
# feature column. pandas makes this column-wise work straightforward.
import pandas as pd
dataset_df = pd.DataFrame(dataset, columns=['name', 'sex'])

# One lowercase suffix column per length: len1, len2, len3, len4.
# Names shorter than the requested length simply yield the whole name.
for n_letters in range(1, 5):
    suffix = dataset_df['name'].str[-n_letters:]
    dataset_df['len{}'.format(n_letters)] = suffix.str.lower()

print(dataset_df.head())  # quick sanity check of the new columns

    name   sex len1 len2 len3  len4
0  Aamir  male    r   ir  mir  amir
1  Aaron  male    n   on  ron  aron
2  Abbey  male    y   ey  bey  bbey
3  Abbie  male    e   ie  bie  bbie
4  Abbot  male    t   ot  bot  bbot


In [4]:
# Train one Naive Bayes classifier per suffix length and report how each one
# performs on a held-out test split.
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy


def to_featuresets(samples, feature_col, label_col=1):
    """Pair one feature column with the label column.

    Returns a list of ({'feature': value}, label) tuples, which is the
    list-of-tuples structure passed to NaiveBayesClassifier.train and
    nltk_accuracy below.
    """
    return [({'feature': value}, label)
            for value, label in zip(samples[:, feature_col], samples[:, label_col])]


# Shuffle the rows in place (using NumPy's global RNG), then keep the first
# 70% of the rows for training and the remainder for testing.
dataset = dataset_df.values
np.random.shuffle(dataset)
rows = int(len(dataset) * 0.7)
train_set = dataset[:rows]
test_set = dataset[rows:]

for i in range(1, 5):
    # Columns are name, sex, len1..len4, so the suffix of length i is column i + 1.
    column = i + 1

    train = to_featuresets(train_set, column)
    clf = NaiveBayesClassifier.train(train)

    # Evaluate this single-feature model on the test split.
    test = to_featuresets(test_set, column)
    acc = nltk_accuracy(clf, test)

    print('Number of suffix: {}, accuracy on test set: {:.2f}%'
          .format(i, 100*acc))

Number of suffix: 1, accuracy on test set: 76.05%
Number of suffix: 2, accuracy on test set: 77.89%
Number of suffix: 3, accuracy on test set: 75.80%
Number of suffix: 4, accuracy on test set: 71.56%
