In [19]:
# coding=utf-8

import os
import pandas as pd
import nltk
from collections import Counter
import numpy as np

ROOTDIR = os.path.abspath(os.path.abspath(os.path.join(os.getcwd(), "..")))
DATADIR = os.path.join(ROOTDIR, 'data')
MODELDIR = os.path.join(ROOTDIR, 'model')

In [117]:
from gensim import corpora
import json
import jieba
from tqdm import tqdm 
from itertools import chain

def split_sentence(article):
    p_b = 0
    p_a = 0
    re = []
    for i in range(len(article)):
        if article[i] in ['。','？','！','…','；','，','.','?','!',';',':',',']:
            p_a = i
            if article[p_b:p_a] != '':
                re.append(article[p_b:p_a])
            p_b = i + 1
    if len(re) == 0:
        return [article]
    else:
        return re
    
def clean_text(tokenized_list, sw, punct):
    new_list = []
    print('clean text ...')
    for doc in tqdm(tokenized_list):
        cleaned_doc = [token.lower() for token in doc if token.lower() not in chain(punct, sw)]
        if len(cleaned_doc) > 3:
            new_list.append(cleaned_doc)
    return new_list

def make_cleaned(tokenized):
    dictionary = corpora.Dictionary(tokenized)
    print ('before:\t',len(dictionary))
    only_once_key=[]
    for key in dictionary.iterkeys():
        if dictionary.dfs[key]==1:
            only_once_key.append(key)
    dictionary.filter_tokens(bad_ids=only_once_key)
    print ('after:\t',len(dictionary))
    
    frequent_tokenized = []
    print('remove low frequnce ...')
    for sentence in tqdm(tokenized):
        ind_list = dictionary.doc2idx(sentence)
        words = []
        for i,ind in enumerate(ind_list):
            if ind != -1:
                words.append(sentence[i])
        frequent_tokenized.append(words)
    punct =  '《》，。？/：；‘’“”{}【】、|—·！~ \n'+'!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    sw = []
    with open (DATADIR + "/cn_stopwords.txt", 'r', encoding='utf8') as f:
        sw = f.read().split('\n')
    
    return clean_text(frequent_tokenized, sw, punct)

def get_tokenized_txt(path,enc):
    tokenized = []
    for file in tqdm(os.listdir(path)):
        with open(path+ '\\' + file, 'r',encoding = enc) as f:
            try:
                lines = f.read().split('\n')
            except UnicodeDecodeError:
                continue
            else:
                for line in lines:
                    if line != '\n':
                        sentences = split_sentence(line)
                        for s in sentences:
                            tokenized.append(list(jieba.cut(s, HMM=False)))
    return tokenized

def get_tokenized_json(path):
    tokenized = []
    print('get tokenized ...')
    for file in tqdm(os.listdir(path)):
        with open(path+ '\\' + file, 'r', encoding='utf8') as f:
            for line in f.readlines():
                sentences = []
                paragraphs = json.loads(line)['text'].split('\n\n')[1:]
                for paragraph in paragraphs:
                    if paragraph == ' ' or paragraph == '' or paragraph == '\n':
                        break
                    tokenized.append(list(jieba.cut(paragraph, HMM=False)))
    return tokenized

In [95]:
d2_path = DATADIR  + '/轻小说'
d2_cleaned = make_cleaned(get_tokenized_txt(d2_path,'utf-8'))

100%|████████████████████████████████████████████████████████████████████████████████| 123/123 [00:10<00:00, 11.99it/s]
  0%|                                                                                       | 0/419468 [00:00<?, ?it/s]

before:	 48355
after:	 33127
remove low frequnce ...


100%|██████████████████████████████████████████████████████████████████████| 419468/419468 [00:01<00:00, 233443.09it/s]
  0%|▎                                                                        | 1483/419468 [00:00<00:28, 14777.27it/s]

clean text ...


100%|███████████████████████████████████████████████████████████████████████| 419468/419468 [00:30<00:00, 13929.06it/s]


In [96]:
wiki_path = DATADIR  + '/维基'
wiki_cleaned = make_cleaned(get_tokenized_json(wiki_path))

  0%|                                                                                           | 0/50 [00:00<?, ?it/s]

get tokenized ...


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:34<00:00,  1.44it/s]
  0%|                                                                                       | 0/105587 [00:00<?, ?it/s]

before:	 179191
after:	 113665
remove low frequnce ...


100%|███████████████████████████████████████████████████████████████████████| 105587/105587 [00:03<00:00, 30532.43it/s]
  0%|                                                                           | 120/105587 [00:00<01:29, 1179.87it/s]

clean text ...


100%|████████████████████████████████████████████████████████████████████████| 105587/105587 [01:40<00:00, 1048.39it/s]


In [97]:
weibo_path = DATADIR  + '/口语'
weibo_cleaned = make_cleaned(get_tokenized_txt(weibo_path,'utf-8'))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:20<00:00,  2.04s/it]
  4%|██▉                                                                    | 31580/757965 [00:00<00:02, 313441.81it/s]

before:	 49813
after:	 38964
remove low frequnce ...


100%|██████████████████████████████████████████████████████████████████████| 757965/757965 [00:03<00:00, 224320.52it/s]
  0%|▏                                                                        | 2131/757965 [00:00<00:35, 21149.72it/s]

clean text ...


100%|███████████████████████████████████████████████████████████████████████| 757965/757965 [00:34<00:00, 21661.74it/s]


In [98]:
tra_path = DATADIR  + '/文学'
tra_cleaned = make_cleaned(get_tokenized_txt(tra_path,'utf-8'))

100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:24<00:00,  1.26it/s]
  0%|                                                                                      | 0/1050038 [00:00<?, ?it/s]

before:	 94478
after:	 70427
remove low frequnce ...


100%|████████████████████████████████████████████████████████████████████| 1050038/1050038 [00:04<00:00, 224405.72it/s]
  0%|                                                                        | 1560/1050038 [00:00<01:07, 15482.63it/s]

clean text ...


100%|█████████████████████████████████████████████████████████████████████| 1050038/1050038 [01:08<00:00, 15308.08it/s]


In [99]:
from gensim import models

all_text_data = d2_cleaned + weibo_cleaned + wiki_cleaned + tra_cleaned

w2v_model = models.Word2Vec(all_text_data,
                        size = 300,
                        window = 5,
                        min_count = 1,
                        sg = 0,
                        alpha = 0.025,
                        iter=10,
                        batch_words = 10000)

In [100]:
w2v_model.save(MODELDIR + "/word2vec_style.model")

In [101]:
dictionary = corpora.Dictionary(d2_cleaned + weibo_cleaned + wiki_cleaned + tra_cleaned)
dictionary.save(MODELDIR + '/dictionary_style.dict')

In [None]:
#加载模型
from gensim import models

w2v_model = models.Word2Vec.load(MODELDIR + "/word2vec_style.model")
dictionary = corpora.Dictionary.load(MODELDIR + '/dictionary_style.dict')

In [102]:
vec_d2 = np.array([w2v_model.wv[q].sum(axis=0)/len(q) for q in d2_cleaned])
vec_weibo = np.array([w2v_model.wv[q].sum(axis=0)/len(q) for q in weibo_cleaned])
vec_wiki = np.array([w2v_model.wv[q].sum(axis=0)/len(q) for q in wiki_cleaned])
vec_tra = np.array([w2v_model.wv[q].sum(axis=0)/len(q) for q in tra_cleaned])

# 文学性分析

In [103]:
print(vec_tra.shape)
print(vec_wiki.shape)
print(vec_d2.shape)
print(vec_weibo.shape)

(440016, 300)
(103659, 300)
(189782, 300)
(183608, 300)


In [104]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)

X = np.concatenate((vec_d2,vec_weibo,vec_wiki,vec_tra),axis = 0)
y =  np.concatenate((np.zeros(vec_d2.shape[0] + vec_weibo.shape[0] + vec_wiki.shape[0]),np.ones(vec_tra.shape[0])),axis = 0)

print('before undersample:\t',sorted(Counter(y).items()))

X_resampled, y_resampled = rus.fit_resample(X, y)

print('after undersample:\t',sorted(Counter(y_resampled).items()))

y_resampled = y_resampled.reshape(-1,1)

before undersample:	 [(0.0, 477049), (1.0, 440016)]
after undersample:	 [(0.0, 440016), (1.0, 440016)]


In [105]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

seed = 42
test_size = 0.33

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=test_size, random_state=seed)

In [106]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr = lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

# check the accuracy on the training set
lr.score(X_test, y_test)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8540103508475919

In [154]:
def cal_score(article,classifier,dictionary,w2v_model):
    sentences = split_sentence(article)
    
    tokenized = []
    for sentence in sentences:
        tokenized.append(list(jieba.cut(sentence, HMM=False)))
    cleaned = []
    for sentence in tokenized:
        ind_list = dictionary.doc2idx(sentence)
        words = []
        for i,ind in enumerate(ind_list):
            if ind != -1:
                words.append(sentence[i])
        if len(words)!=0:
            cleaned.append(words)
    
    vec = np.array([w2v_model.wv[q].sum(axis=0)/len(q) for q in cleaned])
    y_pred = classifier.predict(vec)
    return y_pred.sum()/y_pred.size

In [155]:
s = '这看似宁静的大洋里隐藏着不安；那也许是远在洋底的不安的灵魂们；这广阔无垠的牧场起伏不定；多少人永远安身的墓地呀；他们为了自己的梦想而来；为了自己的梦想而死；他们留在这里就像是留在了自己的梦乡；他们翻来覆去；搅得无际的洋面波涛汹涌；这太平洋是世界的心胸；它包裹着我们藉以生存的一切；印度洋和大西洋不过是它的两条手臂；加利福尼亚的护堤更是它不屑摧毁的孩子的沙器'
cal_score(s,lr,dictionary,w2v_model)

0.5833333333333334

In [156]:
s = '"那是自然了，我自己的命运当然要由我自己掌握，区区疾病是不能奈我何的。"'
cal_score(s,lr,dictionary,w2v_model)

0.6666666666666666

In [157]:
s = '为什么会变成这样呢……第一次有了喜欢的人。有了能做一辈子朋友的人。两件快乐事情重合在一起。而这两份快乐，又给我带来更多的快乐。得到的，本该是像梦境一般幸福的时间……但是，为什么，会变成这样呢……'
cal_score(s,lr,dictionary,w2v_model)

0.1111111111111111

# 二次元分析

In [138]:
rus = RandomUnderSampler(random_state=0)

X = np.concatenate((vec_weibo,vec_wiki,vec_tra,vec_d2),axis = 0)
y =  np.concatenate((np.zeros(vec_tra.shape[0] + vec_weibo.shape[0] + vec_wiki.shape[0]),np.ones(vec_d2.shape[0])),axis = 0)

print('before undersample:\t',sorted(Counter(y).items()))

X_resampled, y_resampled = rus.fit_resample(X, y)

print('after undersample:\t',sorted(Counter(y_resampled).items()))

y_resampled = y_resampled.reshape(-1,1)

before undersample:	 [(0.0, 727283), (1.0, 189782)]
after undersample:	 [(0.0, 189782), (1.0, 189782)]


In [139]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=test_size, random_state=seed)

d2_lr = LogisticRegression()
d2_lr = d2_lr.fit(X_train, y_train)

y_pred = d2_lr.predict(X_test)

# check the accuracy on the training set
d2_lr.score(X_test, y_test)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8495652937560376

In [158]:
s = '为什么，还要出现在我的面前。为什么，同意我呆在你身边。为什么，总是说些不清不楚的话。为什么，总是摆着一副好像对我一点也不讨厌的态度。为什么，没有来听我的钢琴。为什么，不能接受我啊'
cal_score(s,d2_lr,dictionary,w2v_model)

0.4

In [159]:
s = '桥都麻袋,这样子讲话有什么错吗?吶,告诉我啊。搜噶,你们已经不喜欢了啊…真是冷酷的人呢,果咩纳塞,让你看到不愉快的东西了。像我这样的人,果然消失就好了呢。也许只有在二次元的世界里,才有真正的美好存在的吧,吶?'
cal_score(s,d2_lr,dictionary,w2v_model)

0.46153846153846156

# 口语化分析

In [142]:
rus = RandomUnderSampler(random_state=0)

X = np.concatenate((vec_wiki,vec_tra,vec_d2,vec_weibo),axis = 0)
y =  np.concatenate((np.zeros(vec_tra.shape[0] + vec_d2.shape[0] + vec_wiki.shape[0]),np.ones(vec_weibo.shape[0])),axis = 0)

print('before undersample:\t',sorted(Counter(y).items()))

X_resampled, y_resampled = rus.fit_resample(X, y)

print('after undersample:\t',sorted(Counter(y_resampled).items()))

y_resampled = y_resampled.reshape(-1,1)

before undersample:	 [(0.0, 733457), (1.0, 183608)]
after undersample:	 [(0.0, 183608), (1.0, 183608)]


In [143]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=test_size, random_state=seed)

oral_lr = LogisticRegression()
oral_lr = oral_lr.fit(X_train, y_train)

y_pred = oral_lr.predict(X_test)

# check the accuracy on the training set
oral_lr.score(X_test, y_test)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8490782459441171

In [160]:
s = '为什么，还要出现在我的面前。为什么，同意我呆在你身边。为什么，总是说些不清不楚的话。为什么，总是摆着一副好像对我一点也不讨厌的态度。为什么，没有来听我的钢琴。为什么，不能接受我啊'
cal_score(s,oral_lr,dictionary,w2v_model)

0.0

## 学术性分析

In [145]:
rus = RandomUnderSampler(random_state=0)

X = np.concatenate((vec_weibo,vec_tra,vec_d2,vec_wiki),axis = 0)
y =  np.concatenate((np.zeros(vec_tra.shape[0] + vec_d2.shape[0] + vec_weibo.shape[0]),np.ones(vec_wiki.shape[0])),axis = 0)

print('before undersample:\t',sorted(Counter(y).items()))

X_resampled, y_resampled = rus.fit_resample(X, y)

print('after undersample:\t',sorted(Counter(y_resampled).items()))

y_resampled = y_resampled.reshape(-1,1)

before undersample:	 [(0.0, 813406), (1.0, 103659)]
after undersample:	 [(0.0, 103659), (1.0, 103659)]


In [146]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=test_size, random_state=seed)

sch_lr = LogisticRegression()
sch_lr = sch_lr.fit(X_train, y_train)

y_pred = sch_lr.predict(X_test)

# check the accuracy on the training set
sch_lr.score(X_test, y_test)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9341664839581963

In [161]:
s = '自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此，这一领域的研究将涉及自然语言，即人们日常使用的语言，所以它与语言学的研究有着密切的联系，但又有重要的区别。自然语言处理并不是一般地研究自然语言，而在于研制能有效地实现自然语言通信的计算机系统，特别是其中的软件系统。因而它是计算机科学的一部分。'
cal_score(s,sch_lr,dictionary,w2v_model)

1.0

## 文学性分析

In [148]:
rus = RandomUnderSampler(random_state=0)

X = np.concatenate((vec_weibo,vec_d2,vec_wiki,vec_tra),axis = 0)
y =  np.concatenate((np.zeros(vec_wiki.shape[0] + vec_d2.shape[0] + vec_weibo.shape[0]),np.ones(vec_tra.shape[0])),axis = 0)

print('before undersample:\t',sorted(Counter(y).items()))

X_resampled, y_resampled = rus.fit_resample(X, y)

print('after undersample:\t',sorted(Counter(y_resampled).items()))

y_resampled = y_resampled.reshape(-1,1)

before undersample:	 [(0.0, 477049), (1.0, 440016)]
after undersample:	 [(0.0, 440016), (1.0, 440016)]


In [149]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=test_size, random_state=seed)

lite_lr = LogisticRegression()
lite_lr = lite_lr.fit(X_train, y_train)

y_pred = lite_lr.predict(X_test)

# check the accuracy on the training set
lite_lr.score(X_test, y_test)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8539001621839393

In [162]:
s = '不到两分钟，甚至更短，他已将全部烦恼给忘记了。就像大人们的烦恼也是烦恼一样，他忘记烦恼并不是因为他的烦恼对他不怎么沉重和难受，而是因为一种新的、更强烈的兴趣暂时压倒并驱散了他心中的烦闷——就像大人们在新奇感受的兴奋之时，也会暂时忘却自己的不幸一样。这种新产生的兴趣就是一种新的吹口哨方法，它很有价值，是刚从一个黑人那学到的，现在他正要一心练习练习又不想被别人打扰。这声音很特别，像小鸟的叫声，一种流畅而委婉的音调。在吹这个调子的时候，舌头断断续续地抵住口腔的上腭——读者若曾经也是孩子的话，也许还记得该怎样吹这种口哨。汤姆学得很勤奋，练得很专心，很快就掌握了其中要领。于是他沿街大步流星地走着，口中吹着口哨，心里乐滋滋的，那股乐劲如同天文学家发现了新行星时一般，仅就乐的程度之深之强烈而言，此时的汤姆绝对比天文学家还要兴奋。'
cal_score(s,lite_lr,dictionary,w2v_model)

0.5769230769230769

# 题目文本风格分析

In [194]:
import pandas as pd

df = pd.read_csv(os.path.join(DATADIR, 'question.csv'), nrows=100000)

df

Unnamed: 0,questionId,questionTitle,content,difficulty,totalAccepted,totalSubmission,acRate,categoryTitle,likes,translatedTitle,translatedContent
0,1,Two Sum,"Given an array of integers, return indices of ...",Easy,1179287,2411098,48.9%,Algorithms,8576,两数之和,给定一个整数数组 nums 和一个目标值 target，请你在该数组中找出和为目标值的那 两...
1,2,Add Two Numbers,You are given two non-empty linked lists repre...,Medium,467429,1239593,37.7%,Algorithms,4544,两数相加,给出两个 非空 的链表用来表示两个非负的整数。其中，它们各自的位数是按照 逆序 的方式存储的...
2,3,Longest Substring Without Repeating Characters,"Given a string, find the length of the longest...",Medium,551133,1576459,35.0%,Algorithms,3918,无重复字符的最长子串,"给定一个字符串，请你找出其中不含有重复字符的 最长子串 的长度。\n示例 1:\n输入: ""..."
3,4,Median of Two Sorted Arrays,There are two sorted arrays nums1 and nums2 of...,Hard,218703,570697,38.3%,Algorithms,2857,寻找两个正序数组的中位数,给定两个大小为 m 和 n 的正序（从小到大）数组 nums1 和 nums2。\n请你找出...
4,5,Longest Palindromic Substring,"Given a string s, find the longest palindromic...",Medium,308517,996465,31.0%,Algorithms,2388,最长回文子串,给定一个字符串 s，找到 s 中最长的回文子串。你可以假设 s 的最大长度为 1000。\n...
...,...,...,...,...,...,...,...,...,...,...,...
1152,1622,Max Value of Equation,Given an array points containing the coordinat...,Hard,993,2755,36.0%,Algorithms,7,满足不等式的最大值,给你一个数组 points 和一个整数 k 。数组中每个元素都表示二维平面上的点的坐标，并按...
1153,1626,Can Make Arithmetic Progression From Sequence,Given an array of numbers arr. A sequence of n...,Easy,4876,5818,83.8%,Algorithms,0,判断能否形成等差数列,给你一个数字数组 arr 。\n如果一个数列中，任意相邻两项的差总等于同一个常数，那么这个数...
1154,1627,Last Moment Before All Ants Fall Out of a Plank,We have a wooden plank of the length n units. ...,Medium,3405,7526,45.2%,Algorithms,8,所有蚂蚁掉下来前的最后一刻,有一块木板，长度为 n 个 单位 。一些蚂蚁在木板上移动，每只蚂蚁都以 每秒一个单位 的速度...
1155,1628,Count Submatrices With All Ones,Given a rows * columns matrix mat of ones and ...,Medium,2031,4409,46.1%,Algorithms,18,统计全 1 子矩形,给你一个只包含 0 和 1 的 rows * columns 矩阵 mat ，请你返回有多少...


In [164]:
texts = df.translatedContent

In [167]:
lite_score = [0.1 if cal_tra_score(q,lite_lr,dictionary,w2v_model)<0.1 else cal_tra_score(q,lite_lr,dictionary,w2v_model) for q in texts]
sch_score = [0.1 if cal_tra_score(q,sch_lr,dictionary,w2v_model)<0.1 else cal_tra_score(q,sch_lr,dictionary,w2v_model) for q in texts]
d2_score = [0.1 if cal_tra_score(q,d2_lr,dictionary,w2v_model)<0.1 else cal_tra_score(q,d2_lr,dictionary,w2v_model) for q in texts]
oral_score = [0.1 if cal_tra_score(q,oral_lr,dictionary,w2v_model)<0.1 else cal_tra_score(q,oral_lr,dictionary,w2v_model) for q in texts]

In [180]:
test_dict = {'questionId':df.questionId,'lite_score':lite_score,'sch_score':sch_score,'d2_score':d2_score,'oral_score':oral_score}
result_df = pd.DataFrame(data=test_dict)

In [181]:
result_df

Unnamed: 0,questionId,lite_score,sch_score,d2_score,oral_score
0,1,0.200000,0.900000,0.500000,0.200000
1,2,0.142857,0.857143,0.571429,0.142857
2,3,0.208333,0.875000,0.541667,0.100000
3,4,0.500000,0.750000,0.250000,0.125000
4,5,0.111111,0.777778,0.666667,0.333333
...,...,...,...,...,...
1152,1622,0.161290,0.677419,0.129032,0.419355
1153,1626,0.142857,0.785714,0.428571,0.100000
1154,1627,0.183673,0.530612,0.448980,0.142857
1155,1628,0.200000,0.500000,0.400000,0.100000


In [184]:
result_df.to_csv(os.path.join(DATADIR, 'leetcode_result.csv'),index=0)