In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import nltk

# The Snowball stemmer created later in this notebook is built with
# ignore_stopwords=True, which relies on NLTK's 'stopwords' corpus.
# Only contact the download server when the corpus is not already installed,
# so the notebook can be re-run from a fresh kernel without network access.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier

In [None]:
# Locations of the three dataset splits, relative to the working directory.
train_path, val_path, test_path = (
    'data/train.csv',
    'data/val.csv',
    'data/test.csv',
)

# Random seed handed to the XGBoost classifier.
SEED = 42

In [None]:
# Read the training split from the CSV file at `train_path`.
df_train = pd.read_csv(train_path)
# Last expression of the cell: preview the first rows of the frame.
df_train.head()

In [None]:
# Read the validation split from the CSV file at `val_path`.
df_val = pd.read_csv(val_path)
# Last expression of the cell: preview the first rows of the frame.
df_val.head()

In [None]:
# Model input is the 'text' column; the prediction target is the 'label' column.
X_train = df_train['text']
y_train = df_train['label']

X_val = df_val['text']
y_val = df_val['label']

In [None]:
# Module-level English Snowball stemmer; StemmedCV.build_analyzer looks it up
# by name each time it stems a token.
# ignore_stopwords=True asks NLTK to leave stopwords unstemmed (see the
# SnowballStemmer documentation); this needs the NLTK 'stopwords' corpus.
stemmer = SnowballStemmer('english', ignore_stopwords=True)

In [None]:
class StemmedCV(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it emits.

    The stock analyzer built by ``CountVectorizer`` is run first; each token
    it yields is then passed through the module-level ``stemmer``.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to a list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Stem the tokens in the order the parent analyzer produced them.
            stems = []
            for token in base_analyzer(doc):
                stems.append(stemmer.stem(token))
            return stems

        return stemmed_analyzer

In [None]:
# Vectorizer instance used as the first pipeline step. StemmedCV defines no
# __init__ of its own, so stop_words='english' goes straight to CountVectorizer
# (scikit-learn's built-in English stop-word list, per its documentation).
stemmed_cv = StemmedCV(stop_words='english')

In [None]:
# Gradient-boosted tree classifier used as the final pipeline step.
# The seed is passed through `random_state`, the documented seeding parameter
# of XGBoost's scikit-learn wrapper, instead of the legacy `seed` keyword
# (flagged as deprecated by older XGBoost releases; this notebook's blanket
# warning filter would hide that notice).
xgb = XGBClassifier(max_depth=25, min_child_weight=1,
                    booster='gbtree', n_jobs=6, random_state=SEED)

In [None]:
# Steps run in order: stemmed token counts -> TF-IDF transform -> XGBoost.
pipeline = Pipeline(
    steps=[
        ('vect', stemmed_cv),
        ('tfidf', TfidfTransformer()),
        ('xgb', xgb),
    ],
)

In [None]:
# Fit every pipeline step (vectorizer, TF-IDF, classifier) on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predict labels for the validation texts with the fitted pipeline.
y_pred = pipeline.predict(X_val)

In [None]:
# Validation metrics. Every averaged score uses the same 'macro' scheme
# (unweighted mean of the per-class scores) so the three can be compared.
# Micro-averaged precision/recall on single-label predictions reduce to plain
# accuracy, so they would only repeat the value already stored in `accuracy`.
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='macro')
recall = recall_score(y_val, y_pred, average='macro')
f1 = f1_score(y_val, y_pred, average='macro')

In [None]:
# Report the validation scores, one per line, in the order:
# accuracy, precision, recall, F1.
for score in (accuracy, precision, recall, f1):
    print(score)

In [None]:
# Read the test split from the CSV file at `test_path`.
df_test = pd.read_csv(test_path)
# Last expression of the cell: preview the first rows of the frame.
df_test.head()

In [None]:
# Test-time model input: only the 'text' column is read from the test frame.
X_test = df_test['text']

In [None]:
# NOTE: despite the name, `y_test` holds the pipeline's predictions for the
# test texts, not ground-truth labels.
y_test = pipeline.predict(X_test)

In [None]:
# Wrap the test predictions in a DataFrame and write them out as a CSV file
# with a 'label' header and no index column.
submission = pd.DataFrame(data=y_test)
submission.to_csv(
    path_or_buf='data/submission.csv',
    index=False,
    header=['label'],
)