In [2]:
import pandas as pd
import os
import numpy as np
from math import cos, sin, atan2, sqrt, pi, radians, degrees, asin
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, MultiLabelBinarizer, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_squared_log_error, accuracy_score
from sklearn.model_selection import train_test_split

- Problem Transformation Methods (转换策略)
    - Binary Relevance（二元关联）
    - Classifier Chains (分类器链)
    - Label Powerset (LP法)
- Algorithm Adaptation (算法适应性策略)
    - [BP-MLL](https://github.com/vanHavel/bp-mll-tensorflow/blob/master/full_example.py)
    - [MLKNN]()

### 数据获取

In [3]:
df = pd.read_csv('../data/info-final.csv')
df.columns

Index(['c_name', 'c_nature', 'c_scale', 'w_place', 'w_field', 'w_experience',
       'education', 's_min', 's_max', 'vacancies'],
      dtype='object')

In [4]:
def sample_data(df_unsampled):
    df_other = df_unsampled.drop(df[df['c_nature'] == '民营'].index, axis = 0)
    df0 = df_unsampled.groupby(['c_nature','c_scale']).get_group(('民营','20人以下'))
    df1 = df_unsampled.groupby(['c_nature','c_scale']).get_group(('民营','20-99人')).sample(n=500)
    df2 = df_unsampled.groupby(['c_nature','c_scale']).get_group(('民营','100-499人')).sample(n=500)
    df3 = df_unsampled.groupby(['c_nature','c_scale']).get_group(('民营','500-999人')).sample(n=500)
    df4 = df_unsampled.groupby(['c_nature','c_scale']).get_group(('民营','1000-9999人')).sample(n=500)
    df5 = df_unsampled.groupby(['c_nature','c_scale']).get_group(('民营','10000人以上'))
    df_sampled = pd.concat([df_other, df0, df1, df2, df3, df4, df5], axis = 0)
    return df_sampled

In [5]:
company_info = pd.DataFrame(df, columns = ['c_nature','c_scale'])
features = df.drop(['c_name', 'c_nature', 'c_scale', 'w_field', 'vacancies'], axis = 1)

In [6]:
le = LabelEncoder()
features.w_place = le.fit_transform(features.w_place)
features.w_experience = le.fit_transform(features.w_experience)
features.education = le.fit_transform(features.education)
features.s_min = le.fit_transform(features.s_min)
features.s_max = le.fit_transform(features.s_max)
company_info.c_nature = le.fit_transform(company_info.c_nature)
company_info.c_scale = le.fit_transform(company_info.c_scale)

In [7]:
ohe = OneHotEncoder()
features = ohe.fit_transform(features).toarray()
company_info = ohe.fit_transform(company_info).toarray()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, company_info, test_size=0.2, random_state=2)

## Problem Transformation Methods (转换策略)
### Binary Relevance（二元关联）

In [9]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [171]:
# Decision Tree
classifier = BinaryRelevance(DecisionTreeClassifier())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.35876132930513593

In [155]:
# Naive Bayes
classifier = BinaryRelevance(GaussianNB())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.29632426988922456

In [164]:
# SVM
classifier = BinaryRelevance(SVC())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)
# metrics.f1_score(y_test, y_pred, average="micro")

0.32250755287009064

In [156]:
# Random Forest Classifier
classifier = BinaryRelevance(RandomForestClassifier())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.3680765357502518

### Classifier Chains (分类器链)

In [10]:
from skmultilearn.problem_transform import ClassifierChain

In [172]:
# Decision Tree
classifier = ClassifierChain(DecisionTreeClassifier())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.4521651560926485

In [159]:
# Naive Bayes
classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.3104229607250755

In [165]:
# SVM
classifier = ClassifierChain(SVC())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.45845921450151056

In [160]:
# Random Forest Classifier
classifier = ClassifierChain(RandomForestClassifier())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.46727089627391744

### Label Powerset (LP法)

In [11]:
from skmultilearn.problem_transform import LabelPowerset

In [173]:
# Decision Tree
classifier = LabelPowerset(DecisionTreeClassifier())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.4770896273917422

In [162]:
# Naive Bayes
classifier = LabelPowerset(GaussianNB())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.3053877139979859

In [166]:
# SVM
classifier = LabelPowerset(SVC())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.48464249748237664

In [163]:
# Random Forest Classifier
classifier = LabelPowerset(RandomForestClassifier())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

0.4783484390735146

## Problem Transformation Methods (转换策略)
### MLKNN

In [17]:
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV

In [None]:
# MLKNN
classifier = MLkNN(k=10)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test, y_pred)

In [None]:
# GridSearchCV
parameters = {'k': range(1,10), 's': [0.5, 0.7, 1.0]}
score = 'accuracy'

clf = GridSearchCV(MLkNN(), parameters, scoring=score)
clf.fit(X_train, y_train)
print (clf.best_params_, clf.best_score_)
y_pred = clf.predict(X_test)

accuracy_score(y_test, y_pred)

### BP_MLL

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from bpmll.bpmll import bp_mll_loss

In [24]:
n = X_train.shape[0]
dim_no = X_train.shape[1]
class_no = y_train.shape[1]

# create simple mlp
model = Sequential()
model.add(Dense(128, input_dim=dim_no, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(class_no, activation='sigmoid'))
model.compile(loss=bp_mll_loss, optimizer='adagrad', metrics=['accuracy'])
# train
model.fit(X_train, y_train, epochs=10)
# evaluation
y_pred = model.predict(X_test)
true_labels = np.argmax(y_test, axis=1)
pred_labels = np.argmax(y_pred, axis=1)
accuracy_score(true_labels, pred_labels)

Train on 15886 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.6737160120845922

## Results
- Binary Relevance（二元关联）  
DT - 0.35876132930513593  
NB - 0.29632426988922456  
SVM - 0.32250755287009064  
RFC - 0.3703423967774421
- Classifier Chains (分类器链)  
DT - 0.4521651560926485  
NB - 0.3104229607250755  
SVM - 0.45845921450151056  
RFC - 0.46727089627391744
- Label Powerset (LP法)  
DT - 0.4770896273917422  
NB - 0.3053877139979859  
SVM - 0.48464249748237664  
RFC - 0.4783484390735146
