In [None]:
import numpy as np
import pandas as pd
import datatable as dt
import random, time, gc, copy

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

from tqdm import tqdm

import fasttext

In [None]:
# Load the tab-separated train / test CSV files into pandas DataFrames.
train_df = dt.fread('../DataSet/train_set.csv', sep='\t').to_pandas()
test_df = dt.fread('../DataSet/test_a.csv', sep='\t').to_pandas()

# fastText's supervised mode recognises labels by the '__label__' prefix.
train_df['label_ft'] = '__label__' + train_df['label'].astype(str)

# Write one "<text>\t<label>" example per line. The header row is omitted:
# fastText would otherwise read "text\tlabel_ft" as an unlabelled training
# line and add those two tokens to its vocabulary.
train_df[['text', 'label_ft']].to_csv(
    '../DataSet/FT_train.csv', index=False, header=False, sep='\t'
)

In [None]:
def my_gridsearch_cv(df, param_grid, kfold=10, random_state=42):
    """Exhaustive grid search scored by stratified K-fold cross-validation.

    Args:
        df: DataFrame handed unchanged to ``get_KFold_scores`` for every
            parameter combination.
        param_grid: Mapping of parameter name -> iterable of candidate values;
            expanded into individual combinations by ``get_gridsearch_params``.
        kfold: Number of stratified folds.
        random_state: Seed used when shuffling the data into folds, so that
            every parameter combination is scored on the same splits.

    Returns:
        Tuple ``(best_score, best_params)``: the highest mean fold score and an
        independent copy of the parameter dict that achieved it. If no
        combination yields a comparable score (empty grid, or every score is
        NaN) the result is ``(0.0, {})``.
    """
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=random_state)

    # Every concrete combination of the candidate values in the grid.
    params_combination = get_gridsearch_params(param_grid)

    # Start below any reachable score so that the first evaluated combination
    # is always recorded, even when its score is exactly 0.0.
    best_score = float('-inf')
    best_params = None
    for params in tqdm(params_combination):
        avg_score = get_KFold_scores(df, params, skf)
        # Strict '>' keeps the earliest combination on ties.
        if avg_score > best_score:
            best_score = avg_score
            best_params = copy.deepcopy(params)

    if best_params is None:
        # Nothing comparable was evaluated.
        return 0.0, dict()
    return best_score, best_params

In [None]:
def get_gridsearch_params(param_grid):
    """Expand a parameter grid into the list of all parameter combinations.

    Args:
        param_grid: Mapping of parameter name -> iterable of candidate values.

    Returns:
        A list of independent dicts, one per element of the Cartesian product
        of the candidate values. Keys appear in the order of ``param_grid``;
        the first parameter varies fastest and the last one slowest. An empty
        grid yields ``[{}]``; a parameter with no candidates yields ``[]``.
    """
    combinations = [dict()]  # holds every parameter combination built so far
    for name, candidates in param_grid.items():
        expanded = []
        for value in candidates:
            for base in combinations:
                # Deep copy (a shallow copy would make combinations share
                # state) so that each result can be modified independently.
                combo = copy.deepcopy(base)
                combo[name] = value
                expanded.append(combo)
        combinations = expanded
    return combinations

In [None]:
def get_KFold_scores(train_df, params, skf, fold_train_path='../DataSet/FT_train_fold.csv'):
    """Cross-validate one fastText parameter set and return its mean macro-F1.

    For every split produced by ``skf`` the training rows are dumped to
    ``fold_train_path``, a supervised fastText model is trained on that file
    with ``params``, and the held-out rows are scored with macro-averaged F1.

    Args:
        train_df: DataFrame with the columns ``text``, ``label`` and
            ``label_ft`` (the label carrying the ``__label__`` prefix).
        params: Keyword arguments forwarded to ``fasttext.train_supervised``.
        skf: Splitter exposing ``split(X, y)`` that yields
            ``(train_index, test_index)`` positional index arrays.
        fold_train_path: Scratch file rewritten for every fold. It is kept
            separate from ``../DataSet/FT_train.csv`` so the full training
            file is not overwritten by a fold subset.

    Returns:
        The mean of the per-fold macro-F1 scores.
    """
    scores = []
    for train_index, test_index in skf.split(train_df['text'], train_df['label_ft']):
        # header=False: a header row would be read by fastText as an
        # unlabelled training line and pollute the vocabulary.
        train_df[['text', 'label_ft']].iloc[train_index].to_csv(
            fold_train_path, index=False, header=False, sep='\t'
        )
        model = fasttext.train_supervised(fold_train_path, **params)

        # Passing a list predicts the whole validation fold in one call;
        # the first return value holds the predicted labels for each text.
        val_texts = train_df['text'].iloc[test_index].tolist()
        predicted_labels, _ = model.predict(val_texts)
        # '__label__3' -> '3', to compare against the raw labels as strings.
        val_pred = [labels[0].split('__')[-1] for labels in predicted_labels]

        score = f1_score(train_df['label'].values[test_index].astype(str), val_pred, average='macro')
        print(score, params)
        scores.append(score)

    mean_score = np.mean(scores)
    print('mean score: ', mean_score)
    return mean_score

In [None]:
# Candidate values for the grid search: every key maps to the list of values
# to try. Four parameters have three candidates each and three are fixed,
# giving 3 * 3 * 3 * 3 = 81 combinations in total.
tuned_parameters = dict(
    lr=[0.5, 0.1, 0.05],
    wordNgrams=[1, 2, 3],
    epoch=[20, 25, 30],
    dim=[50, 100, 150],
    loss=['hs'],
    minCount=[1],
    verbose=[2],
)

In [None]:
# Run the 10-fold grid search over the whole training frame and keep the
# best mean score together with the parameters that produced it.
best_score, best_params = my_gridsearch_cv(
    df=train_df,
    param_grid=tuned_parameters,
    kfold=10,
)

In [None]:
# Report the winning parameter set, then its score, one per line.
print(best_params, best_score, sep='\n')