In [1]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
import logging

# Configure the root logger once for the whole notebook: INFO and above, with
# every record carrying its timestamp, level, process name and thread name.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s',
    level=logging.INFO,
)
# Module-scoped logger shared by all cells below.
logger = logging.getLogger(__name__)


class TextClassifier:
    """Pair a text vectorizer with a classifier behind a single interface.

    Every method takes raw documents, converts them to features with
    ``vectorizer.transform`` and forwards the result to the classifier.
    """

    def __init__(self, vectorizer, classifier=None):
        """Store the vectorizer and the estimator to train.

        Args:
            vectorizer: Object exposing ``transform(x)``. This class only
                calls ``transform`` on it and never ``fit``, so the caller is
                responsible for fitting it beforehand.
            classifier: Estimator exposing ``fit``, ``predict`` and ``score``.
                When omitted, a new ``SVC(kernel="rbf")`` is created for this
                instance.
        """
        # A ``None`` sentinel is used instead of an estimator instance as the
        # default: a default instance is built once at definition time and
        # would be shared (and re-fitted) by every TextClassifier. An
        # explicitly supplied classifier is always honoured.
        if classifier is None:
            classifier = SVC(kernel="rbf")
        self.classifier = classifier
        self.vectorizer = vectorizer

    def features(self, x):
        """Return the vectorizer's feature representation of documents ``x``."""
        return self.vectorizer.transform(x)

    def fit(self, x, y):
        """Train the classifier on documents ``x`` with labels ``y``."""
        self.classifier.fit(self.features(x), y)

    def predict(self, x):
        """Return the classifier's predictions for documents ``x``."""
        return self.classifier.predict(self.features(x))

    def score(self, x, y):
        """Return ``classifier.score`` for documents ``x`` against labels ``y``."""
        return self.classifier.score(self.features(x), y)

    def get_f1_score(self, x, y):
        """Return the macro-averaged F1 of the predictions for ``x`` against ``y``."""
        return f1_score(y, self.predict(x), average='macro')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.externals import joblib
import os
import argparse

# Numeric settings. No cell visible in this notebook reads them; the names
# suggest a vocabulary cap, a sequence length and an embedding width for an
# embedding-based model -- TODO confirm before relying on them.
max_features, maxlen, embed_size = 200000, 1000, 300

# Locations of input artefacts, all under ../inputs.
EMBEDDING_FILE = "../inputs/wiki.zh.vec"
embed_npz = "../inputs/embedding.npz"
features_npz = "../inputs/features.npz"
data_npz = "../inputs/data.npz"

In [5]:
import pandas as pd
# File name under which the dictionary of trained classifiers is saved.
model_name = "baseline_model_dict.pkl"

# load train data
# Both splits are read eagerly into DataFrames, training first, then validation.
logger.info("start load data")
train_data_df, validate_data_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../inputs/train.csv", "../inputs/vali.csv")
)

2018-09-17 15:00:41,989 [INFO] <MainProcess> (MainThread) start load data


In [6]:
# The text to vectorize is the second column of the training frame (selected
# by position); the full list of column names is kept so later cells can
# iterate over the label columns.
content_train = train_data_df.iloc[:, 1]
columns = train_data_df.columns.tolist()

logger.info("start train feature extraction")
vectorizer_tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),   # unigrams through trigrams
    min_df=3,             # ignore terms found in fewer than 3 documents
    max_df=0.9,           # ignore terms found in more than 90% of documents
    norm='l2',
    # These three flags are documented as booleans; real bools are passed
    # instead of the integer 1 so parameter validation accepts them.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
vectorizer_tfidf.fit(content_train)
logger.info("complete train feature extraction models")

2018-09-17 15:01:19,113 [INFO] <MainProcess> (MainThread) start train feature extraction
2018-09-17 15:03:07,130 [INFO] <MainProcess> (MainThread) complete train feature extraction models


In [None]:
# model train
# Fit one independent TextClassifier per label column. Every column from
# index 2 onward is treated as a label; all classifiers share the same
# already-fitted TF-IDF vectorizer.
logger.info("start train model")
classifier_dict = {}
for column in columns[2:]:
    logger.info("start train %s model", column)
    text_classifier = TextClassifier(vectorizer=vectorizer_tfidf)
    label_train = train_data_df[column]
    text_classifier.fit(content_train, label_train)
    classifier_dict[column] = text_classifier
    logger.info("complete train %s model", column)

logger.info("complete train model")

2018-09-17 15:03:24,450 [INFO] <MainProcess> (MainThread) start train model
2018-09-17 15:03:24,451 [INFO] <MainProcess> (MainThread) start train location_traffic_convenience model
2018-09-17 17:55:18,697 [INFO] <MainProcess> (MainThread) complete train location_traffic_convenience model
2018-09-17 17:55:18,697 [INFO] <MainProcess> (MainThread) start train location_distance_from_business_district model
2018-09-17 21:16:31,408 [INFO] <MainProcess> (MainThread) complete train location_distance_from_business_district model
2018-09-17 21:16:31,409 [INFO] <MainProcess> (MainThread) start train location_easy_to_find model
2018-09-18 00:52:47,826 [INFO] <MainProcess> (MainThread) complete train location_easy_to_find model
2018-09-18 00:52:47,827 [INFO] <MainProcess> (MainThread) start train service_wait_time model
2018-09-18 02:15:55,640 [INFO] <MainProcess> (MainThread) complete train service_wait_time model
2018-09-18 02:15:55,641 [INFO] <MainProcess> (MainThread) start train service_waiter

In [None]:
# validate model
# The validation text is taken from the same column position as the training
# text and is passed to the classifiers without any extra preprocessing, so it
# goes through exactly the same pipeline the vectorizer was fitted on.
# NOTE(review): a `seg_words` step used to run here, but no such function is
# defined in this notebook and the training text was never segmented --
# confirm the CSV text needs no segmentation.
content_validate = validate_data_df.iloc[:, 1]

logger.info("start validate model")
f1_score_dict = {}
for column in columns[2:]:
    label_validate = validate_data_df[column]
    text_classifier = classifier_dict[column]
    # Store the result straight into the dict. Binding it to a global named
    # `f1_score` would replace the sklearn function that
    # TextClassifier.get_f1_score looks up, breaking the next iteration.
    f1_score_dict[column] = text_classifier.get_f1_score(content_validate, label_validate)

# Unweighted mean of the per-column macro-F1 values.
mean_f1 = np.mean(list(f1_score_dict.values()))
str_score = "\n" + "".join(
    "%s:%s\n" % (column, f1_score_dict[column]) for column in columns[2:]
)

logger.info("f1_scores: %s\n", str_score)
logger.info("f1_score: %s", mean_f1)
logger.info("complete validate model")

# save model
logger.info("start save model")
# NOTE(review): the output directory was never defined in this notebook (the
# previous line assigned the name to itself); "../models/" is a placeholder
# next to ../inputs -- confirm the intended location.
model_save_path = "../models/"
os.makedirs(model_save_path, exist_ok=True)

# os.path.join keeps the path valid whether or not the directory string ends
# with a separator.
joblib.dump(classifier_dict, os.path.join(model_save_path, model_name))
logger.info("complete save model")