In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

SEED = 2020
TRAIN_LEN = 180000
MAX_FEATURES = 3000

In [4]:
train_df = pd.read_csv('train_set.csv',sep='\t',nrows=200000)
test_df = pd.read_csv('test_a.csv',sep='\t')
sub_df = pd.read_csv('test_a_sample_submit.csv',sep='\t')

In [5]:
tfidf = TfidfVectorizer(max_features=3000) # ngram_range=(1,3), 
train_val = tfidf.fit_transform(train_df['text'])

In [6]:
clf_tfidf = LogisticRegression(penalty='l2',solver='sag',class_weight='balanced',multi_class='multinomial',n_jobs=-1)
clf_tfidf.fit(train_val[:TRAIN_LEN], train_df['label'].values[:TRAIN_LEN])



LogisticRegression(class_weight='balanced', multi_class='multinomial',
                   n_jobs=-1, solver='sag')

In [7]:
val_pred_tfidf = clf_tfidf.predict(train_val[TRAIN_LEN:])
print(f1_score(train_df['label'].values[TRAIN_LEN:], val_pred_tfidf, average='macro'))

0.8505738412407676


In [9]:
print(classification_report(train_df['label'].values[TRAIN_LEN:], val_pred_tfidf)) 

              precision    recall  f1-score   support

           0       0.94      0.82      0.87     37222
           1       0.92      0.87      0.89     35272
           2       0.98      0.94      0.96     30006
           3       0.91      0.91      0.91     21150
           4       0.77      0.89      0.83     14304
           5       0.78      0.86      0.82     11682
           6       0.89      0.91      0.90      9532
           7       0.77      0.86      0.81      8460
           8       0.81      0.88      0.84      7473
           9       0.78      0.90      0.84      5594
          10       0.83      0.88      0.85      4713
          11       0.78      0.88      0.83      2990
          12       0.77      0.94      0.84      1743
          13       0.58      0.91      0.71       859

    accuracy                           0.88    191000
   macro avg       0.82      0.89      0.85    191000
weighted avg       0.89      0.88      0.88    191000



In [None]:
scores = cross_val_score(clf_tfidf, train_val[TRAIN_LEN:] , train_df['label'].values[TRAIN_LEN:], cv=5)
scores.mean()

In [None]:
sub_df_X = tfidf.transform(test_df['text'])
pred_y = clf.predict(sub_df_X)
sub_df['label'] = pred_y