In [13]:
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

from textblob import TextBlob
from textblob.translate import NotTranslated
from multiprocessing import Pool
from itertools import repeat

import jieba
jieba.load_userdict("../WordCut/userdict.txt")

import gc

# Names of the models to process; each name is substituted into the
# '../../data/{}/' path template to locate that model's CSV files.
model_list = [
    'CutDebt',
    'IDClassifier',
    'IfKnowDebtor',
    'Installment',
    'SetDueDay',
    'WillingToPay',
]

In [14]:
# Combine the data sets that can be used together.
# The path template is defined before the loop because it is also needed
# after the loop; it must exist even when model_list is empty.
path = '../../data/{}/'

# Start every model's combined file as a plain copy of its mock-up data.
for each_model in model_list:
    model_dir = path.format(each_model)
    data = pd.read_csv(model_dir + 'mock_up_data1.csv', encoding='utf8')
    data.to_csv(model_dir + 'combined_mock_up_data.csv', index=False, encoding='utf8')

# CutDebt and Installment exchange their label-0 rows: each one's combined
# file is extended with the other's label-0 rows.
cut_file = path.format('CutDebt') + 'combined_mock_up_data.csv'
ins_file = path.format('Installment') + 'combined_mock_up_data.csv'

data_cut = pd.read_csv(cut_file, encoding='utf8')
data_ins = pd.read_csv(ins_file, encoding='utf8')

# Select the label-0 rows BEFORE concatenating so neither frame picks up
# rows that were just borrowed from the other.
temp_cut = data_cut[data_cut.label == 0]
temp_ins = data_ins[data_ins.label == 0]

data_cut = pd.concat([data_cut, temp_ins], ignore_index=True)
data_ins = pd.concat([data_ins, temp_cut], ignore_index=True)

data_cut.to_csv(cut_file, index=False, encoding='utf8')
# Each model must be saved from its own frame: writing data_cut here would
# replace Installment's own rows with CutDebt's.
data_ins.to_csv(ins_file, index=False, encoding='utf8')

# Release the large frames; kernel memory otherwise persists across cells.
del data_cut
del data_ins
gc.collect()

7365

In [15]:
def translate(comment, from_lang, to_lang):
    """Back-translate ``comment``: from_lang -> to_lang -> from_lang.

    Parameters
    ----------
    comment : str or bytes
        Text to paraphrase. Anything with a ``decode`` method is decoded
        as UTF-8 first.
    from_lang : str
        Language code the final text is translated back into.
    to_lang : str
        Language code of the intermediate translation.

    Returns
    -------
    str
        The round-tripped text, or the original text unchanged when
        TextBlob raises ``NotTranslated`` at either step.
    """
    if hasattr(comment, "decode"):
        comment = comment.decode("utf-8")

    original = TextBlob(comment)
    try:
        # Build the whole round trip before committing to it. If only the
        # forward step succeeded, returning that intermediate result would
        # put to_lang text into a from_lang data set.
        round_trip = original.translate(to=to_lang).translate(to=from_lang)
    except NotTranslated:
        return str(original)

    return str(round_trip)

def translate_csv(df,col,from_lang,to_lang,num_pol=10):
    """Return a copy of ``df`` with column ``col`` back-translated.

    Every value of ``df[col]`` is passed to ``translate`` together with
    ``from_lang`` and ``to_lang``, using a pool of worker processes.

    Language codes reference:
        https://developers.google.com/translate/v2/using_rest#language-params

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    col : str
        Name of the text column to translate.
    from_lang : str
        Language code forwarded to ``translate`` as ``from_lang``.
    to_lang : str
        Language code forwarded to ``translate`` as ``to_lang``.
    num_pol : int, default 10
        Number of worker processes; values above 1000 are capped at 1000.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``col`` holds the translated text. The
        column is moved to the last position and the index labels are
        converted to ``str`` (a side effect of ``rename(index=str)``).
    """

    df = df.copy()

    num_pol = min(num_pol, 1000)
    comment_pool = df[col].values
    new_col_name = col + '_' + to_lang

    # The context manager shuts the workers down when the block exits, so
    # repeated calls do not accumulate idle processes. starmap blocks until
    # every result is available, so nothing is lost at exit.
    with Pool(num_pol) as p:
        df[new_col_name] = p.starmap(translate, zip(comment_pool, repeat(from_lang),repeat(to_lang)))

    # Swap the original column for the translated one under the same name.
    df = df.drop([col], axis = 1)
    df = df.rename(index=str, columns={new_col_name:col})
    return df

def cut_words(text):
    """Segment ``text`` with ``jieba.cut(..., cut_all=False)`` and return
    the resulting tokens joined by single spaces."""
    return " ".join(token for token in jieba.cut(text, cut_all=False))

# Characters that clean() blanks out. string.punctuation is passed through
# re.escape so every ASCII punctuation mark stays literal inside the
# character class; unescaped, its "\]" pair is read as an escaped "]" and
# the backslash itself is never matched.
_CLEAN_PATTERN = re.compile(f'[{re.escape(string.punctuation)}“”¨«»®´·º ½¾¿¡§£₤‘’，]')

def clean(text):
    """Replace each punctuation/symbol character in ``text`` with a space.

    Covers all of ``string.punctuation`` plus the extra quote, currency and
    symbol characters listed in ``_CLEAN_PATTERN``. Each matched character
    becomes exactly one space; runs of spaces are not collapsed.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with every matched character replaced by ``' '``.
    """
    return _CLEAN_PATTERN.sub(' ', text)

def clean_label(label):
    """Convert ``label`` to an ``int`` using the built-in ``int()``."""
    as_int = int(label)
    return as_int

In [16]:
# Fixed seed so the shuffled output files are reproducible across runs.
SHUFFLE_SEED = 42

# (progress label, intermediate language) for each back-translation pass.
TRANSLATION_PASSES = (('1st', 'en'), ('2nd', 'fr'), ('3rd', 'th'), ('4th', 'lo'))

for each_model in model_list:
    print(each_model)
    path = '../../data/{}/'
    data = pd.read_csv(path.format(each_model) + 'combined_mock_up_data.csv', encoding='utf8')
    data = data.dropna()
    col = 'split_text'
    print('finish loading')

    # Translate and get more data: every pass back-translates the same
    # original rows, and all results are stacked under the originals.
    frames = [data]
    for ordinal, to_lang in TRANSLATION_PASSES:
        frames.append(translate_csv(data, col, from_lang='zh', to_lang=to_lang, num_pol=50))
        print('finish {} trans'.format(ordinal))
    data = pd.concat(frames, ignore_index=True)

    # cut words
    data[col] = data[col].apply(cut_words)
    print('finish cutting words')

    # cleaning and save
    data[col] = data[col].apply(clean)
    data['label'] = data['label'].apply(clean_label)

    # shuffle data (seeded, see SHUFFLE_SEED)
    data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
    print('finish shuffling')
    data.to_csv(path.format(each_model) + 'cleaned_mock_up_data.csv', index = False, encoding = 'utf8')

CutDebt
finish loading
finish 1st trans
finish 2nd trans
finish 3rd trans
finish 4th trans
finish cutting words
finish shuffling
IDClassifier
finish loading
finish 1st trans
finish 2nd trans
finish 3rd trans
finish 4th trans
finish cutting words
finish shuffling
IfKnowDebtor
finish loading
finish 1st trans
finish 2nd trans
finish 3rd trans
finish 4th trans
finish cutting words
finish shuffling
Installment
finish loading
finish 1st trans
finish 2nd trans
finish 3rd trans
finish 4th trans
finish cutting words
finish shuffling
SetDueDay
finish loading
finish 1st trans
finish 2nd trans
finish 3rd trans
finish 4th trans
finish cutting words
finish shuffling
WillingToPay
finish loading
finish 1st trans
finish 2nd trans
finish 3rd trans
finish 4th trans
finish cutting words
finish shuffling
