# code

Analyze and process the text data stored in CSV files, following these steps:
1. Use pandas to read each CSV file into a DataFrame.
2. Convert the DataFrame column into plain Python objects (a list of strings).
3. Write the processing functions (e.g. data cleaning and word segmentation).
4. Call the processing functions on the loaded data to obtain the new, processed data.

In [13]:
import pandas as pd
from harvesttext import HarvestText
import pyhanlp
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

def csv_read_by_id(uid):
    """Load one user's posts from their CSV and return them cleaned.

    The file read is ``D://weibo//<uid>//<uid>.csv`` (the CSV named after the
    user inside that user's folder). Only the "正文" column is loaded; every
    value is passed through ``clean_txt`` and terminated with a newline.

    Args:
        uid: User id as a string; used as both the folder name and file stem.

    Returns:
        A list of strings, one cleaned post per item, each ending in ``'\\n'``.
    """
    csv_path = "D://weibo//" + uid + "//" + uid + ".csv"
    # Only the "正文" column is requested.
    frame = pd.read_csv(csv_path, usecols=["正文"], dtype={"id": str})
    return [str(clean_txt(post)) + '\n' for post in frame["正文"]]

def is_chinese(uchar):
    """Return True when *uchar* lies in the range U+4E00..U+9FA5 (inclusive).

    This is the test the file uses to decide whether a character is a
    Chinese character.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

def only_chinese(contents):
    """Return the characters of *contents* accepted by ``is_chinese``.

    Characters are kept in their original order; everything else is dropped.
    An input with no accepted characters yields the empty string.
    """
    return ''.join(ch for ch in contents if is_chinese(ch))

def clean_txt(s):
    """Clean one raw post so that only Chinese characters remain.

    Pipeline, in order:
      1. ``str(s)`` is passed through HanLP's ``CharTable.convert``
         (presumably a character-normalisation table; see the HanLP docs).
      2. ``HarvestText.clean_text`` is applied with ``emoji=True`` and
         ``weibo_topic=True``.
      3. ``only_chinese`` drops every remaining non-Chinese character.

    Args:
        s: Raw cell value; it is converted with ``str()`` before processing.

    Returns:
        The cleaned string.
    """
    harvest = HarvestText()
    char_table = pyhanlp.JClass('com.hankcs.hanlp.dictionary.other.CharTable')
    normalized = char_table.convert(str(s))
    # Removes links, @mentions, e-mail addresses, topics and emoji.
    stripped = harvest.clean_text(normalized, emoji=True, weibo_topic=True)
    return only_chinese(stripped)

def write_list_in_txt(fn, content):
    """Write an iterable of strings to *fn* as UTF-8 with a BOM.

    The file is created or truncated. No line separators are added, so each
    item must already carry its own trailing newline if one is wanted.

    Args:
        fn: Path of the file to write.
        content: Iterable of strings, passed to ``writelines``.
    """
    # The context manager guarantees the handle is closed even when
    # writelines() raises (e.g. on a non-string item).
    with open(fn, 'w', encoding='utf-8-sig') as file_handle:
        file_handle.writelines(content)
    
def write_byte_in_txt(fn, content):
    """Write the string *content* to *fn* as raw UTF-8 bytes (no BOM).

    The file is opened in binary mode and created or truncated.

    Args:
        fn: Path of the file to write.
        content: Text to store; it is encoded with UTF-8 before writing.
    """
    # The context manager guarantees the handle is closed even when
    # encode() or write() raises.
    with open(fn, "wb") as file_handle:
        file_handle.write(content.encode('utf-8'))
               
def cut_stopwords(tokens, stopwords_path=r'D:\corpus\stopwords.txt'):
    """Flatten *tokens* and drop empty strings and stop words.

    The previous implementation read the stop-word file as bytes and compared
    each byte (an int) with ``'\\n'``, so its stop-word collection was always
    empty and nothing was ever filtered. The file is now read as text, one
    stop word per line, into a set.

    Args:
        tokens: Iterable of token sequences (e.g. one list of words per
            post). The sequences are flattened into one list.
        stopwords_path: Path of the stop-word list, one entry per line.
            Defaults to the location the original code used.

    Returns:
        A flat list of the tokens that are neither empty nor stop words,
        in their original order.
    """
    # NOTE(review): the stop-word file is assumed to be UTF-8 (a BOM is
    # tolerated) -- confirm the actual encoding of the file on disk.
    with open(stopwords_path, encoding='utf-8-sig') as stop_file:
        stop_words = {line.strip() for line in stop_file if line.strip()}
    return [t for token in tokens for t in token
            if t != '' and t not in stop_words]

In [14]:
import pandas as pd
import jieba # 使用jieba分词
def reduce_nan(uid):
    """Extract the third CSV field (the "正文" column) of a user's posts.

    Reads ``D://weibo//<uid>//<uid>.csv`` (the CSV named after the user inside
    that user's folder) and returns the third field of every row, stripped of
    trailing whitespace and terminated with a newline. The header row, whose
    third field is the literal ``'正文'``, is skipped.

    The file is parsed with the ``csv`` module instead of ``str.split(',')``,
    so quoted fields containing commas are read correctly. Blank lines and
    rows with fewer than three fields are skipped instead of raising
    ``IndexError``.

    NOTE(review): the name and the original docstring suggest empty posts
    should be removed, but (as before) a row with an empty third field still
    yields a bare newline -- confirm the intended behaviour.

    Args:
        uid: User id as a string; used as both the folder name and file stem.

    Returns:
        A list of strings, one per data row, each ending in ``'\\n'``.
    """
    import csv  # local import keeps this block self-contained

    file_name = "D://weibo//" + uid + "//" + uid + ".csv"
    list_line = []
    # newline='' is what the csv module requires for correct quoted-field handling.
    with open(file_name, 'r', encoding='utf8', newline='') as file:
        for row in csv.reader(file):
            if len(row) < 3:
                continue
            text = row[2]
            if text != '正文':
                list_line.append(text.rstrip() + '\n')
    return list_line

def count_weibo(file_name):
    """Return the number of lines in the UTF-8 text file *file_name*.

    Args:
        file_name: Path of the file to count.

    Returns:
        The line count as an int (0 for an empty file).
    """
    with open(file_name, 'r', encoding='utf8') as handle:
        return sum(1 for _ in handle)

# Preprocessing driver: for every user id, load the posts and segment them
# into words, writing the result to D://result//<uid>.txt.
# NOTE(review): `data` is not defined in this cell -- the read below is
# commented out, so the loop relies on a `data` DataFrame that already exists
# in the notebook session. Confirm where it comes from.
#path = r'D:\weibo\users.csv'
#data = pd.read_csv(path, usecols=["用户id"], dtype={"id": str})# keep only this column, read as strings

#counting = 0
for n in data["用户id"]:

    i = str(n)
    
    filename = "D://corpus//" + i + ".txt"
    output = "D://result//" + i + ".txt"
    txt_list = csv_read_by_id(i)

    # Write the data to a txt file, one Weibo post per line.
    # NOTE(review): the txt_list returned by csv_read_by_id is never written
    # in this cell; `filename` must already exist, and it is only rewritten
    # below with the output of reduce_nan.
    
    with open(filename,"rb") as f:
        for j in f.read():
            #print(j)
            #print(type(j))
           
            # Iterating over bytes yields ints; 239 is 0xEF, the first byte of
            # the UTF-8 BOM (presumably what this is meant to detect -- note
            # that 0xEF can also occur inside other UTF-8 sequences).
            # NOTE(review): there is no break after a successful rewrite, so
            # the rewrite is repeated for every 0xEF byte in the data read.
            if j == 239:
                try:
                    txt_list = reduce_nan(i)
                    write_list_in_txt(filename,txt_list)  
                except:
                    # On any failure, print the user id and stop scanning this file.
                    print(i)
                    break  
            
    
    #counting += count_weibo(filename)
    
    
    # Segment this user's post text (parallel mode is not enabled because it
    # is not supported on Windows).
    # NOTE(review): `content` is not assigned anywhere in this cell; it must
    # come from earlier notebook state -- confirm what text it should hold.
    word = " ".join(jieba.cut(content))# accurate mode (the default); tokens are joined with spaces
    # Stop-word removal (combining the Chinese stop-word list, the HIT
    # stop-word list and the Sichuan University Machine Intelligence Lab
    # stop-word list).
    # Stop words are normally removed after segmentation in NLP pipelines.
    # word2vec, however, relies on context, and that context may consist of
    # stop words, so stop-word removal can be skipped for word2vec.
    #words = " ".join(cut_stopwords(word))
    write_byte_in_txt(output,word)
    #training(output)
    
#print(counting)
print('OK')


OK
