In [23]:
import nltk
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle

In [4]:
# Training set: tab-separated; the file's own header row is replaced by
# the column names 'text' and 'label'.
train = pd.read_csv(
    'products_sentiment_train.tsv',
    sep='\t',
    header=0,
    names=['text', 'label'],
)

In [6]:
# Test set: tab-separated with a header row; the 'Id' column is dropped
# right after loading.
test = pd.read_csv(
    'products_sentiment_test.tsv',
    sep='\t',
    header=0,
).drop(columns='Id')

In [8]:
# Oversample label 0: duplicate the first 500 rows whose label is 0, append
# them to the training frame and shuffle with a fixed seed.
#
# * `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
# * The rows are picked with a boolean mask + `head(500)` instead of feeding
#   index *labels* to `iloc`; the two agree only while the index is a fresh
#   0..n-1 range.
#
# NOTE: this cell rebinds `train`, so running it twice oversamples twice;
# re-run the loading cell first.
label0_extra = train[train['label'] == 0].head(500)
train = shuffle(pd.concat([train, label0_extra]), random_state=10)

In [10]:
# Raw texts and their labels, pulled out of the training frame as arrays.
X, y = train['text'].values, train['label'].values

In [11]:
# Raw texts of the test set, as an array.
X_test = test.loc[:, 'text'].values

In [24]:
class DimensionTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that converts a sparse matrix to a dense array.

    Intended to sit between a sparse-output step (e.g. TF-IDF) and a step
    that requires dense input (e.g. PCA). Nothing is learned in ``fit``.

    Inheriting from ``BaseEstimator`` gives the step ``get_params`` /
    ``set_params`` and a readable repr, so it behaves like any other
    scikit-learn estimator inside a ``Pipeline``.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` (the transformer has no state)."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()``: the latter returns a
        ``numpy.matrix``, which downstream scikit-learn estimators reject in
        current releases. Input that is already dense is passed through
        ``numpy.asarray``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and return the dense version of ``X``."""
        return self.fit(X, y, **fit_params).transform(X)

In [13]:
# Word 1- to 5-gram counts, re-weighted with TF-IDF.
vectorizer_r = CountVectorizer(ngram_range=(1, 5))
X_r = vectorizer_r.fit_transform(X)
tfidf_r = TfidfTransformer()
freq_r = tfidf_r.fit_transform(X_r)

# PCA needs dense input. Densify once with `toarray()` and reuse the result
# for both fit and transform. The original built the dense matrix twice via
# `.A` and wrapped it in a DataFrame via `get_feature_names()`; both APIs are
# gone from current SciPy / scikit-learn. PCA does not use the column labels,
# so the projected values are unchanged.
freq_dense = freq_r.toarray()

pca = PCA(n_components=2500, svd_solver='full')
pca.fit(freq_dense)
reduced_pd = pca.transform(freq_dense)

# NOTE(review): the vectorizer, TF-IDF and PCA are fitted on every training
# row before cross-validation is run on `reduced_pd`, so the CV score below
# is computed on features that have already seen the validation folds.

In [14]:
# 5-fold cross-validated accuracy of a linear SVM on the PCA-projected features.
cv_estimator = LinearSVC(max_iter=400, loss='hinge', C=1.9, tol=0.001, random_state=777)
cv_fold_scores = cross_val_score(cv_estimator, reduced_pd, y, scoring='accuracy', cv=5)
score = cv_fold_scores.mean()
print(f"LinearSVC + PCA - {score}")

LinearSVC + PCA - 0.9106327212020033


In [25]:

# End-to-end model: raw text -> n-gram counts -> TF-IDF -> dense -> PCA -> linear SVM.
svc_pca = Pipeline(steps=[
    # word 1- to 5-grams; terms present in more than 90% of documents are dropped
    ('vectorizer', CountVectorizer(min_df=1, ngram_range=(1, 5), max_df=0.9, stop_words=None)),
    # TF-IDF re-weighting of the raw counts
    ('transformer', TfidfTransformer()),
    # sparse -> dense, as required by the PCA step
    ('dense', DimensionTransformer()),
    # projection onto 2500 principal components
    ('pca', PCA(n_components=2500, svd_solver='full')),
    # linear SVM with hinge loss
    ('classifier', LinearSVC(max_iter=400, loss='hinge', C=1.9, tol=0.001, random_state=777)),
])

In [26]:
# Fit every pipeline step on the full (oversampled) training texts and labels.
svc_pca.fit(X, y)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.9,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 5), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 <__main__.DimensionTransformer object at 0x1a23c7eb10>),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=2500,
                     random_state=None, svd_solver='full', tol=0.0,
                     whiten=False)),
                ('classifier',
                 LinearSVC(C

In [27]:
# Predict a label for every test text with the fitted pipeline.
svc_pca_pred = svc_pca.predict(X_test)

In [21]:
# Write the submission file with header "Id,y" and one "<row id>,<label>"
# line per test row.
#
# Ids are derived from the number of predictions instead of a hard-coded
# range(0, 500), and the two columns are written as a regular CSV rather than
# as one pre-joined "Id,y" string column with a space separator.
submission = pd.DataFrame({
    'Id': range(len(svc_pca_pred)),
    'y': svc_pca_pred,
})
submission.to_csv('svc_pca.csv', index=False)

![Kaggle screenshot](KaggleScreen.png)