In [2]:
import pandas as pd
import numpy as np
import jieba
import jieba.analyse
import pickle as pkl
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, classification_report
import sklearn.metrics as metrics
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.sparse import csr_matrix, hstack

In [4]:
def get_data():
    train = pd.read_csv('./../data/train/train.csv')
    test = pd.read_csv('./../data/test_public/test_public.csv')
    train = train.sample(frac=1) # ?
    train = train.reset_index(drop=True)
    
    data = pd.concat([train, test])

    lbe = LabelEncoder()
    lbe.fit(train['subject'])
    nb_classes = len(list(lbe.classes_))

    pkl.dump(lbe, open('label_encoder.pkl', 'wb'))
    subject = lbe.transform(train['subject'])

    y = []
    for i in train['sentiment_value'].values:
        y.append(i)
    y1 = []
    for i in subject:
        y1.append(i)
    print(np.array(y).reshape(-1, 1)[:,0])
    return data, train.shape[0], np.array(y).reshape(-1, 1)[:, 0], test['content_id'], np.array(y1).reshape(-1, 1)[:, 0]

In [70]:
def processing_data(data):
#     jieba.load_userdict('./../data/word_dict.txt')
    words = jieba.cut(data)
    return ' '.join(words)

In [71]:
def pre_process():
    data,nrw_train,y,test_id,y1 = get_data()
    data['cut_comment'] = data['content'].map(processing_data)
    print('TfidfVectorizer')
    tf = TfidfVectorizer(ngram_range=(1,2),analyzer='char')
    discuss_tf = tf.fit_transform(data['cut_comment'])
    print('HashingVectorizer')
    ha = HashingVectorizer(ngram_range=(1,1),lowercase=False)
    discuss_ha = ha.fit_transform(data['cut_comment'])
    data = hstack((discuss_tf,discuss_ha)).tocsr()
    return data[:nrw_train],data[nrw_train:],y,test_id,y1

In [72]:
X = pkl.load(open('X.pkl', 'rb'))
test = pkl.load(open('test.pkl', 'rb'))
y = pkl.load(open('y.pkl', 'rb'))
test_id = pkl.load(open('test_id.pkl', 'rb'))
y1 = pkl.load(open('y1.pkl', 'rb'))
# X,test,y,test_id,y1= pre_process()
# pkl.dump(X, open('X.pkl', 'wb'))
# pkl.dump(test, open('test.pkl', 'wb'))
# pkl.dump(y, open('y.pkl', 'wb'))
# pkl.dump(test_id, open('test_id.pkl', 'wb'))
# pkl.dump(y1, open('y1.pkl', 'wb'))

[ 0  0 -1 ...,  0  1 -1]
TfidfVectorizer
HashingVectorizer


In [73]:
N = 10
kf = StratifiedKFold(n_splits=N, random_state=2018).split(X,y)

In [74]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=0.5)
# from sklearn.linear_model import SGDClassifier
# clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter= 5, random_state=42)
# clf = svm.SVC(C=1)
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier()
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB()

In [75]:
y_train_oofp = np.zeros_like(y, dtype='float64')
y_train_oofp1 = np.zeros_like(y, dtype='float64')

y_test_oofp = np.zeros((test.shape[0], N))
y_test_oofp_1 = np.zeros((test.shape[0], N))

In [76]:
def micro_avg_f1(y_true, y_pred):
    return metrics.f1_score(y_true, y_pred, average='micro')

In [77]:
acc = 0
vcc = 0
for i ,(train_fold,test_fold) in enumerate(kf):
    print("i = ", i)
    print('-'*20 + "trainning" + '-'*20)
    X_train, X_validate, label_train, label_validate,  label_1_train, label_1_validate,= X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold], y1[train_fold], y1[test_fold]
    clf.fit(X_train, label_train)
    print('-'*20 + "predicting" + '-'*20)
    val_ = clf.predict(X_validate)
    y_train_oofp[test_fold] = val_
    print('sentiment_value_f1:%f' % micro_avg_f1(label_validate, val_))
    acc += micro_avg_f1(label_validate, val_)
    result = clf.predict(test)
    y_test_oofp[:, i] = result
    clf.fit(X_train, label_1_train)
    val_1 = clf.predict(X_validate)
    y_train_oofp1[test_fold] = val_
    vcc += micro_avg_f1(label_1_validate, val_1)
    result = clf.predict(test)
    y_test_oofp_1[:, i] = result
print(acc/N)
print(vcc/N)

i =  0
--------------------trainning--------------------
--------------------predicting--------------------
sentiment_value_f1:0.704819
i =  1
--------------------trainning--------------------
--------------------predicting--------------------
sentiment_value_f1:0.691457
i =  2
--------------------trainning--------------------
--------------------predicting--------------------
sentiment_value_f1:0.703518
i =  3
--------------------trainning--------------------
--------------------predicting--------------------
sentiment_value_f1:0.701508
i =  4
--------------------trainning--------------------
--------------------predicting--------------------
sentiment_value_f1:0.696482
i =  5
--------------------trainning--------------------
--------------------predicting--------------------
sentiment_value_f1:0.713568
i =  6
--------------------trainning--------------------
--------------------predicting--------------------
sentiment_value_f1:0.705231
i =  7
--------------------trainning------------

In [78]:
lbl = pkl.load(open('label_encoder.pkl','rb'))
res_2 = []
for i in range(y_test_oofp_1.shape[0]):
    tmp = []
    for j in range(N):
        tmp.append(int(y_test_oofp_1[i][j]))
    word_counts = Counter(tmp)
    yes = word_counts.most_common(1)
    res_2.append(lbl.inverse_transform(yes[0][0]))

In [79]:
res = []
for i in range(y_test_oofp.shape[0]):
    tmp = []
    for j in range(N):
        tmp.append(y_test_oofp[i][j])
    res.append(max(set(tmp), key=tmp.count))

In [80]:
print(len(res))
result = pd.DataFrame()
result['content_id'] = list(test_id)

result['subject'] = list(res_2)
result['subject'] = result['subject']

result['sentiment_value'] = list(res)
result['sentiment_value'] = result['sentiment_value'].astype(int)

result['sentiment_word'] = ''
result.to_csv('submit.csv',index=False)

2364
