In [35]:
import pandas as pd

prefix = 'v2/'


def _load_split(path):
    """Read one TSV split and return a frame with ``text`` and ``label`` columns.

    Parameters
    ----------
    path : str
        Location of a tab-separated file.

    Returns
    -------
    pandas.DataFrame
        ``text``  - column 3 of the file with newline characters replaced
        by single spaces.
        ``label`` - column 5 of the file converted to integers.
    """
    raw = pd.read_csv(path, sep='\t', header=None)
    # The file is parsed without a header, so its first row (presumably the
    # column titles - TODO confirm) arrives as data and is dropped here.
    raw = raw.iloc[1:]
    return pd.DataFrame({
        'text': raw[3].replace(r'\n', ' ', regex=True),
        'label': raw[5].astype(int),
    })


# Both splits go through the same loader so their preprocessing cannot drift.
train_df = _load_split(prefix + 'training_v2.tsv')
eval_df = _load_split(prefix + 'dev_v2.tsv')

# Plain Python lists are what the scikit-learn text pipelines below consume.
X_train = train_df["text"].to_list()
Y_train = train_df["label"].to_list()

X_test = eval_df["text"].to_list()
Y_test = eval_df["label"].to_list()

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn a vocabulary from the training texts and encode them as a
# document-term count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
# Shown as (number of documents, vocabulary size).
X_train_counts.shape

(672, 4818)

In [37]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw token counts with TF-IDF; the matrix keeps the same shape
# as the count matrix it is computed from.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(672, 4818)

In [38]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF features.
# NOTE(review): `clf` is not referenced again in the visible cells; the
# pipeline defined afterwards trains its own classifier - confirm it is needed.
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

In [39]:
from sklearn.pipeline import Pipeline
# Bundle token counting, TF-IDF weighting and multinomial Naive Bayes into a
# single estimator that is trained directly on the raw training strings.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_train, Y_train)

In [47]:
import numpy as np
import sklearn
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import f1_score

# Score the Naive Bayes pipeline on the held-out dev split.
# The former `fetch_20newsgroups(...)` call was dropped: that name is not
# imported anywhere in the visible notebook (NameError on a fresh kernel) and
# its result was never used.
predicted = text_clf.predict(X_test)
f1_score(Y_test, predicted)

0.0

In [46]:
from sklearn.linear_model import SGDClassifier
# Same count -> TF-IDF front end, with an SGD-trained hinge-loss (linear SVM)
# classifier as the final step. The step name 'clf-svm' is what grid-search
# parameter keys must be prefixed with.
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=100000,
        random_state=42,  # fixed seed so repeated runs give the same model
    )),
])

# Assign the fitted estimator to `_` so the cell does not echo its repr.
_ = text_clf_svm.fit(X_train, Y_train)
predicted_svm = text_clf_svm.predict(X_test)
sklearn.metrics.f1_score(Y_test, predicted_svm)

0.5904761904761905

In [50]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid. Each key is '<pipeline step name>__<parameter>'.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}
# NOTE(review): no `scoring` argument is passed, so candidates are ranked by
# the estimator's default score rather than the F1 reported elsewhere in this
# notebook - confirm that is intended before comparing the numbers.
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, Y_train)

# Jupyter only displays a cell's final expression by default, so return both
# values as one tuple instead of relying on a bare expression mid-cell.
gs_clf_svm.best_score_, gs_clf_svm.best_params_

0.6949143173023771


{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [51]:
# Evaluate the grid-searched model on the dev split.
# Note: this rebinds `predicted_svm`, replacing the predictions produced by
# the un-tuned SVM pipeline.
predicted_svm = gs_clf_svm.predict(X_test)
sklearn.metrics.f1_score(Y_test, predicted_svm)

0.5523809523809524

In [55]:
# Requires the third-party `nltk` package in the kernel's environment.
import nltk
from nltk.stem.snowball import SnowballStemmer

# Fetch only the stop-word corpus, without prompting. A bare `nltk.download()`
# opens the interactive downloader and blocks "Restart & Run All";
# `ignore_stopwords=True` below is what needs this corpus.
nltk.download('stopwords', quiet=True)

# English Snowball stemmer that leaves stop words unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that runs the inherited analyzer, then stems.

        The returned callable takes one document and gives back a list with
        each token from the parent analyzer passed through the module-level
        ``stemmer``.
        """
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens

# Stemming vectorizer that also drops the built-in English stop-word list.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# Hinge-loss SGD pipeline fed by the stemming vectorizer, trained on the
# raw training strings.
text_svm_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=100000,
        random_state=42,
    )),
]).fit(X_train, Y_train)

# F1 on the dev split for the stemmed variant.
predicted_svm_stemmed = text_svm_stemmed.predict(X_test)
sklearn.metrics.f1_score(Y_test, predicted_svm_stemmed)


ModuleNotFoundError: No module named 'nltk'