In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import datatable as dt

In [3]:
# Load the tab-separated train and test sets with datatable and convert each
# to a pandas DataFrame.
train_df, test_df = (
    dt.fread(csv_path, sep='\t').to_pandas()
    for csv_path in (
        '/home/liuchh/kaggle/input/train_set.csv',
        '/home/liuchh/kaggle/input/test_a.csv',
    )
)

In [None]:
# Raw documents of each split, as NumPy arrays.
train_texts = train_df['text'].values
test_texts = test_df['text'].values

# Word-level TF-IDF over 1- to 3-grams, capped at 3000 features.
tfidf_params = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    use_idf=True,
    smooth_idf=True,
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1,3),
    max_features=3000,
)
tfidf = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are learned from train and test text together.
# NOTE(review): this lets test-set statistics influence the features; confirm
# that this is intended.
tfidf.fit(np.concatenate((train_texts, test_texts), axis=0))

# Project both splits into the shared TF-IDF feature space.
train_word_features = tfidf.transform(train_texts)
test_word_features = tfidf.transform(test_texts)

In [None]:
# Model inputs: TF-IDF matrices for both splits, plus the training labels.
X_train, X_test = train_word_features, test_word_features
y_train = train_df['label']

In [None]:
# 5-fold cross-validation. shuffle=True is required for random_state to have
# any effect; recent scikit-learn raises ValueError when random_state is set
# while shuffle is left at its default of False.
KF = KFold(n_splits=5, shuffle=True, random_state=7)

# NOTE(review): LightGBM documents that bagging_fraction only takes effect when
# bagging_freq is non-zero; bagging_freq is not set here, so row subsampling is
# presumably inactive. Confirm whether bagging was intended.
clf = LGBMClassifier(n_jobs=-1, feature_fraction=0.7, bagging_fraction=0.4, lambda_l1=0.001, lambda_l2=0.01, n_estimators=600)

# One vector of test-set predictions per fold. They are stacked column-wise
# after the loop, so no placeholder column is needed. A zero-initialised
# placeholder would otherwise be counted as an extra vote for label 0 when the
# columns are later combined by majority vote.
fold_test_preds = []
for KF_index, (train_index, valid_index) in enumerate(KF.split(X_train)):
    print('第', KF_index+1, '折交叉验证开始...')
    # Split features and labels for this fold. The indices from KFold are
    # positional, so the label Series is indexed with .iloc.
    x_train_, x_valid_ = X_train[train_index], X_train[valid_index]
    y_train_, y_valid_ = y_train.iloc[train_index], y_train.iloc[valid_index]
    # Re-fit the classifier on this fold's training part.
    clf.fit(x_train_, y_train_)
    # Score on the held-out part. The metric is macro-averaged F1, so the
    # printed label says so.
    val_pred = clf.predict(x_valid_)
    print("宏平均F1为：", f1_score(y_valid_, val_pred, average='macro'))

    # Keep this fold's predictions for the test set.
    fold_test_preds.append(clf.predict(X_test))

# Test-set predictions: one row per test sample, one column per fold.
test_pred = np.column_stack(fold_test_preds)

In [None]:
# Majority vote across the columns of test_pred: for each test sample, take the
# most frequent predicted label (np.argmax resolves ties to the smallest label).
preds = np.array([np.argmax(np.bincount(row_votes)) for row_votes in test_pred])

In [None]:
# Fill the sample-submission template with the voted labels and write the
# result without the index column.
# NOTE(review): the template is read from the filesystem root, unlike the other
# input files; confirm the path.
submission = pd.read_csv('/test_a_sample_submit.csv').assign(label=preds)
submission.to_csv('../output/LGBMClassifier_submission.csv', index=False)