In [2]:
%matplotlib inline 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
from sklearn import preprocessing

  _nan_object_mask = _nan_object_array != _nan_object_array


## Data Overview

Take a brief look at the original data: the first few rows and the data type of each feature.

In [3]:
# Read the play lines from a CSV file (path is relative to the working
# directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='Shakespeare.csv')
data.head(n=5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [4]:
# Show the pandas dtype of every column in the loaded frame.
data.dtypes

Dataline              int64
Play                 object
PlayerLinenumber    float64
ActSceneLine         object
Player               object
PlayerLine           object
dtype: object

## Data Preprocess

Drop the rows that contain NaN values; we will not use them, so they can simply be removed.

In [5]:
# Keep only the rows that have a value in every column (these are dropna's
# defaults, spelled out), then preview the first five remaining rows.
data = data.dropna(axis=0, how='any')
data.head(n=5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil


Convert 'Play' and 'Player' to the category data type.
Generate three new features ('Act', 'Scene' and 'Line') by splitting 'ActSceneLine'.

In [6]:
# Store the two label-like text columns as pandas categoricals.
data['Play'] = data['Play'].astype("category")
data['Player'] = data['Player'].astype("category")

# 'ActSceneLine' has the form "act.scene.line" (e.g. "1.1.1"). Split it on
# the dots into one column per part.
ActSceneLine = data['ActSceneLine'].str.split('.', expand=True)

# str.split() yields strings. These three columns are used as model features
# further down, so cast them to integers here instead of leaving object-dtype
# text for the estimators to coerce. A malformed value raises immediately.
data['Act'] = ActSceneLine[0].astype(int)
data['Scene'] = ActSceneLine[1].astype(int)
data['Line'] = ActSceneLine[2].astype(int)
data.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,Act,Scene,Line
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",1,1,1
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",1,1,2
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,1,1,3
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,1,1,4
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,1,1,5


In [7]:
# Empty mapping that will hold, per player, the words they speak.
Word2Players = dict()
# (rows, columns) of the cleaned frame.
data.shape

(105152, 9)

Count word frequencies to find the most frequently used words for each player.

In [8]:
# Build {player: {word: count}} over every spoken line.
#
# Start from an empty dict so the cell can be re-run safely: it never
# double-counts and never trips over values already replaced by sorted lists.
Word2Players = {}

# Walk the two columns in lockstep. Looking rows up with data.iloc[i] builds
# a new Series per access, which is very slow over ~100k rows.
for player, line in zip(data['Player'], data['PlayerLine']):
    counts = Word2Players.get(player)
    if counts is None:
        counts = Word2Players[player] = collections.defaultdict(int)
    # split() with no argument separates on any run of whitespace.
    for word in line.split():
        counts[word] += 1

In [9]:
# Replace each player's word->count mapping with a list of (word, count)
# pairs ordered from most to least frequent. The sort is stable, so words
# with equal counts keep their original relative order.
# After this cell the values are lists, not dicts.
for player, counts in Word2Players.items():
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
    Word2Players[player] = ranked

In [10]:
# Display the whole player -> [(word, count), ...] mapping.
# NOTE(review): this dumps every player and every word; showing only the top
# few words for a handful of players would keep the output readable.
Word2Players

{'A Lord': [('Sir,', 1), ('it', 1), ('I.', 1), ('was', 1)],
 'A Patrician': [('This', 1),
  ('his', 1),
  ('do', 1),
  ('has', 1),
  ('burn', 1),
  ('fortune.', 1),
  ('nobler.', 1),
  ('man', 1),
  ('and', 1),
  ('You', 1),
  ("marr'd", 1),
  ('too.', 1),
  ('the', 1),
  ('Ay,', 1)],
 'A Player': [('the', 2),
  ('your', 2),
  ('not,', 1),
  ('honour', 1),
  ('lordship', 1),
  ('antic', 1),
  ('duty.', 1),
  ('Soto', 1),
  ('veriest', 1),
  ('our', 1),
  ("'twas", 1),
  ('ourselves,', 1),
  ('he', 1),
  ('in', 1),
  ('means.', 1),
  ('I', 1),
  ('that', 1),
  ('we', 1),
  ('world.', 1),
  ('contain', 1),
  ('my', 1),
  ('to', 1),
  ('think', 1),
  ('please', 1),
  ('can', 1),
  ('So', 1),
  ('Were', 1),
  ('lord:', 1),
  ('Fear', 1),
  ('accept', 1)],
 'AARON': [('the', 95),
  ('and', 64),
  ('I', 64),
  ('of', 51),
  ('to', 48),
  ('And', 47),
  ('in', 37),
  ('a', 36),
  ('you', 36),
  ('that', 35),
  ('my', 32),
  ('this', 29),
  ('for', 23),
  ('not', 22),
  ('is', 21),
  ('with', 

From the result above, we found that a player's most frequently used words may not be a good representation of that player.

We will not use this feature.
Instead, we try some features that are easy to obtain.

In [11]:
# Number of whitespace-separated tokens in each spoken line.
line_tokens = data['PlayerLine'].str.split()
data['CountWord'] = line_tokens.str.len()

In [12]:
# Preview the frame, which now includes the 'CountWord' column.
data.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,Act,Scene,Line,CountWord
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",1,1,1,9
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",1,1,2,9
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,1,1,3,7
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,1,1,4,7
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,1,1,5,8


## Training Model and Classification

Generate the training and testing sets; the testing set is 20% of the data set.

In [13]:
# Replace the 'Play' and 'Player' labels with integer codes.
# One encoder object is refit for each column in turn, so once this cell has
# run it holds the classes of 'Player' (the last column encoded) only.
encoder = preprocessing.LabelEncoder()
for column in ('Play', 'Player'):
    data[column] = encoder.fit_transform(data[column])

In [14]:
# Model inputs are the six feature columns; the target is the 'Player' column.
feature = ["Play", "PlayerLinenumber", "Act", "Scene", "Line", "CountWord"]
data_X = data.loc[:, feature]
data_y = data.loc[:, "Player"]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(data_X, data_y, test_size=0.2, random_state=61)

# Keep a separate copy of the test features.
plotdata = X_test.copy()

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 10-tree random forest (fit() returns the estimator itself)
# and predict the held-out rows.
clf = RandomForestClassifier(n_estimators=10, random_state=61).fit(X_train, y_train)
predict = clf.predict(X_test)

# Print the feature names, their importances and the score on the test split.
for report_line in (feature, clf.feature_importances_, clf.score(X_test, y_test)):
    print(report_line)

['Play', 'PlayerLinenumber', 'Act', 'Scene', 'Line', 'CountWord']
[ 0.21382236  0.2757517   0.07014942  0.07807706  0.26618672  0.09601274]
0.730635728211


In [18]:
from sklearn.preprocessing import StandardScaler

# Give the scaler instance its own name. Rebinding `StandardScaler` to the
# instance would hide the class, so any later `StandardScaler()` call in the
# session would fail.
scaler = StandardScaler()

# Learn mean/variance from the training split only, then apply those same
# statistics to the test split so no test information leaks into the fit.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [19]:
# Same model family as before, now with 20 trees, trained on all CPU cores
# (n_jobs=-1) using the standardized features.
clf = RandomForestClassifier(n_estimators=20, random_state=61, n_jobs=-1).fit(X_train, y_train)
predict = clf.predict(X_test)

# Print the feature names, their importances and the score on the test split.
for report_line in (feature, clf.feature_importances_, clf.score(X_test, y_test)):
    print(report_line)

['Play', 'PlayerLinenumber', 'Act', 'Scene', 'Line', 'CountWord']
[ 0.21269518  0.27677936  0.07126743  0.07794601  0.26460632  0.0967057 ]
0.74242784461


In [20]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree with the same value as the random forests above. Without a
# fixed random_state the fitted tree, and therefore the printed importances
# and score, can change from run to run.
clf = DecisionTreeClassifier(random_state=61)
clf.fit(X_train, y_train)
predict = clf.predict(X_test)

# Print the feature names, their importances and the score on the test split.
print(feature)
print(clf.feature_importances_)
print(clf.score(X_test, y_test))

['Play', 'PlayerLinenumber', 'Act', 'Scene', 'Line', 'CountWord']
[ 0.10581103  0.30229513  0.10644448  0.12116882  0.28622533  0.07805521]
0.743045979744
