In [None]:
from nltk.corpus import movie_reviews as mr
from collections import defaultdict
import numpy as np
import pandas as pd

from sklearn import metrics
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

import nltk.corpus
from scipy import sparse
import matplotlib.pyplot as plt
from sklearn.base import TransformerMixin

from mlxtend.preprocessing import DenseTransformer as DT
import os
import xgboost as xgb
from sklearn.utils import shuffle
import csv

import warnings
warnings.filterwarnings('ignore')

In [15]:
#import nltk
#nltk.download()

vectorizer: CountVectorizer
                'vectorizer__max_df' : [0.85, 0.9, 0.95, 1.0], 'vectorizer__min_df' : [1, 10, 20, 30],
                'vectorizer__ngram_range' : [(1, 1), (1, 2)]
            TfidfVectorizer

classifier: LogisticRegression(), LinearSVC(), SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
            RidgeClassifier(), 
            sklearn.naive_bayes import MultinomialNB
            sklearn.naive_bayes import BernoulliNB(binarize=0.0)
            
metrics: precision, recall, f1, roc_auc, all together - metrics.classification_report(), mean            

#### get dataset for train and prediction 

In [2]:
# Load the labelled training reviews (columns: "text", "label").
# The path is built with os.path.join so it works on any OS and does not rely on
# backslashes that Python would otherwise try to interpret as escape sequences.
train = pd.read_csv(os.path.join("data", "products_sentiment_train.tsv"),
                    names=["text", "label"], header=0, sep="\t")
#train = pd.read_csv("products_sentiment_train.tsv", encoding="utf-8", names = ["text", "label"], header = 0, sep="\t")

train.head()

In [3]:
#python 3; python 2 - no quoting
# Load the unlabelled reviews to predict. QUOTE_NONE makes the parser treat quote
# characters inside the reviews as ordinary text.
# The path is built with os.path.join so it is portable and free of backslash escapes.
test = pd.read_csv(os.path.join("data", "products_sentiment_test_copy.tsv"),
                   header=0, sep="\t", quoting=csv.QUOTE_NONE)
# The "Id" column is not a feature; drop it.
del test["Id"]

print(test.shape)
test.head()

In [15]:
# Quick look at the data: the first row as a one-row frame, the raw text of
# rows 0 and 2, then the dimensions of both datasets.
print(train.iloc[[0]])
for row_pos in (0, 2):
    print(train.iloc[row_pos, 0])
for frame in (train, test):
    print(frame.shape)

##### count the number of reviews with positive label 1

In [16]:
# Number of positive reviews. Summing the boolean mask counts the matching rows;
# calling .sum() on the filtered frame would also concatenate every review text.
print((train["label"] == 1).sum())
# Full class distribution, to make the imbalance visible.
print(train["label"].value_counts())
print(type(train["label"]))

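##### classes are unbalanced - add 500 repeated negative reviews (the first 500 negative rows of the train set) to the train dataset and shuffle it. This is expected to have a positive effect on the quality of the predictive model. Note: the duplicated rows can end up in both the training and the validation folds during cross-validation, so scores measured on the balanced dataset may be optimistic.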

In [4]:
# All negative reviews (label == 0).
zero_test_ind = train[train["label"] == 0]
# Oversample the minority class: append the first 500 negative reviews again and shuffle.
# pd.concat replaces DataFrame.append (removed in pandas 2.0), and the positional slice
# zero_test_ind.iloc[:500] avoids feeding index labels to .iloc.
# NOTE(review): the duplicated rows can fall into both train and validation folds
# of any later cross-validation on new_train.
new_train = shuffle(pd.concat([train, zero_test_ind.iloc[:500]]), random_state=10)

In [18]:
# Sanity checks for the oversampling step: which rows were duplicated, the size
# of the duplicated slice, and the resulting class sizes.
oversampled_idx = zero_test_ind.index[:500]
print(oversampled_idx)
print(train.iloc[list(oversampled_idx)].shape)

print(new_train.shape)
print(new_train.head())
print(" ")
for label_value in (0, 1):
    print(new_train[new_train["label"] == label_value].shape)

#### initial dataset: reviews - train["text"], labels - train["label"] <br> new balanced dataset: reviews - new_train["text"], labels - new_train["label"]

#### create token_matrix: (document number, word number); the values of this matrix are how many times each word appears in the document; then build the tf-idf matrix

In [5]:
# English stop-word list from NLTK; used later as one of the `vectorizer__stop_words` grid options.
# The NLTK "stopwords" corpus must be available locally (see the commented-out nltk.download() cell above).
stop_words =  nltk.corpus.stopwords.words('english')

In [6]:
#token_counts = CountVectorizer(stop_words=stop_words)
# Bag-of-words counts: one row per document, one column per vocabulary term.
cvect = CountVectorizer()
# Counts for the oversampled (balanced) dataset.
token_matrix = cvect.fit_transform(new_train["text"])
# The same vectorizer is re-fitted here, so from now on `cvect` holds the vocabulary learned from `train`.
# NOTE(review): new_train only repeats rows of train, so both fits should yield the same vocabulary - confirm.
token_matrix_ = cvect.fit_transform(train["text"])

print(token_matrix.shape)

In [7]:
# TF-IDF re-weighting of the raw token counts.
tfidf_transformer = TfidfTransformer()
# Balanced dataset.
frequency_counts = tfidf_transformer.fit_transform(token_matrix)
# Re-fits the same transformer on the original dataset; afterwards `tfidf_transformer`
# holds the weights learned from `token_matrix_`.
frequency_counts_ = tfidf_transformer.fit_transform(token_matrix_)

print(frequency_counts.shape)
print(frequency_counts_.shape)

In [28]:
# Feature matrices and their label vectors should agree on the number of rows.
for features, frame in ((token_matrix, new_train), (token_matrix_, train)):
    print(np.shape(features))
    print(np.shape(frame["label"]))

In [8]:
# Dense, column-labelled views of the count matrices (one column per vocabulary term).
feature_names = cvect.get_feature_names()
pd_token_matrix = pd.DataFrame(token_matrix.toarray(), columns=feature_names)
pd_token_matrix_ = pd.DataFrame(token_matrix_.toarray(), columns=feature_names)

print(pd_token_matrix.head())

In [67]:
# Preview of the TF-IDF weights for the balanced dataset as a labelled dense frame.
tfidf_frame = pd.DataFrame(frequency_counts.toarray(), columns=cvect.get_feature_names())
print(tfidf_frame.head())

#### functions for best parameters selection

In [9]:
def text_classifier(vectorizer, transformer, classifier):
    """Assemble a three-step text classification pipeline.

    The steps are named "vectorizer", "transformer" and "classifier" (in that
    order), so grid-search parameters can be addressed as
    ``vectorizer__<param>``, ``transformer__<param>`` and ``classifier__<param>``.

    Returns the resulting ``Pipeline``.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("transformer", transformer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

In [10]:
def estimator(classifier, parameters_grid, scorer, data, labels, cv=4):
    """Grid-search hyper-parameters of a CountVectorizer -> TfidfTransformer -> classifier pipeline.

    Parameters
    ----------
    classifier : estimator instance used as the final "classifier" step.
    parameters_grid : dict mapping pipeline parameter names
        (e.g. ``classifier__C``, ``vectorizer__min_df``) to candidate values.
    scorer : scoring name passed to ``GridSearchCV`` (e.g. ``'accuracy'``).
    data : iterable of raw text documents.
    labels : target labels aligned with ``data``.
    cv : cross-validation setting passed to ``GridSearchCV``; defaults to 4,
        the value that was previously hard-coded.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    pipeline = text_classifier(CountVectorizer(), TfidfTransformer(), classifier)

    grid_cv = GridSearchCV(pipeline, parameters_grid, scoring=scorer, cv=cv)
    grid_cv.fit(data, labels)

    return grid_cv

scorer in ['roc_auc', 'accuracy', 'average_precision', 'f1']

#### metrics - accuracy
#### select classifier among: <br> linear classifiers -  LogisticRegression, LinearSVC, SGDClassifier, RidgeClassifier <br> Bayes classifiers - MultinomialNB, BernoulliNB, GaussianNB <br> tree classifiers - DecisionTreeClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier <br> gradient boosting from sklearn library and XGBoost

In [29]:
# Same oversampled data as new_train but WITHOUT shuffling, to show how much row order
# affects cross-validation. `.ix` (removed in pandas 1.0) and DataFrame.append
# (removed in pandas 2.0) are replaced by a positional slice and pd.concat.
not_shufled_train = pd.concat([train, zero_test_ind.iloc[:500]])
print("SGDClassifier", " accuracy ", cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), SGDClassifier(random_state=1)), 
                                not_shufled_train["text"], not_shufled_train["label"], scoring='accuracy').mean())
print("SGDClassifier shuffled ", " accuracy ", cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), SGDClassifier(random_state=1)), 
                                new_train["text"], new_train["label"], scoring='accuracy').mean())

##### use shuffled data

In [26]:
%%time
for k, clf in {"LogisticRegression": LogisticRegression, "LinearSVC": LinearSVC, 
               "SGDClassifier": SGDClassifier, "RidgeClassifier": RidgeClassifier}.items():
    
    print("initial dataset: ")
    print(k, " accuracy ", cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), clf(random_state=1)), 
                                             train["text"], train["label"], scoring='accuracy').mean())
    print("ballanced dataset: ")
    print(k, " accuracy ", cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), clf(random_state=1)), 
                                new_train["text"], new_train["label"], scoring='accuracy').mean())

In [2]:
class DenseTransformer(TransformerMixin):
    """Pipeline step that converts a sparse matrix into a dense NumPy array.

    Needed in front of estimators that do not accept sparse input
    (used below with GaussianNB). The step is stateless.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return self so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return X as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter
        returns ``numpy.matrix``, which recent scikit-learn estimators reject.
        Input that is already dense is passed through ``np.asarray``.
        """
        if sparse.issparse(X):
            return X.toarray()
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and transform in one call."""
        return self.fit(X, y, **fit_params).transform(X)

In [28]:
%%time
for k, clf in {"MultinomialNB": MultinomialNB, "BernoulliNB": BernoulliNB}.items():
    print("initial dataset: ")
    print(k, ' accuracy ', cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), clf()), 
                                             train["text"], train["label"], scoring= 'accuracy').mean())
    print("ballanced dataset: ")
    print(k, ' accuracy ', cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), clf()), 
                                             new_train["text"], new_train["label"], scoring= 'accuracy').mean())

In [31]:
%%time
print("initial dataset, GaussianNB classifier accuracy: ", 
      cross_val_score(Pipeline([("vectorizer", CountVectorizer()), ("transformer", TfidfTransformer()),
                                ('to_dense', DenseTransformer()), ("classifier", GaussianNB())]), 
                                train["text"], train["label"], scoring='accuracy').mean())
print("ballanced dataset, GaussianNB classifier accuracy: ", 
      cross_val_score(Pipeline([("vectorizer", CountVectorizer()), ("transformer", TfidfTransformer()),
                                ('to_dense', DenseTransformer()), ("classifier", GaussianNB())]), 
                                new_train["text"], new_train["label"], scoring='accuracy').mean())

#### the balanced dataset shows better results

#### tune the linear classifiers' parameters, choose the best one, and after this tune the CountVectorizer() parameters

### Linear classifiers

In [14]:
# Search space for the CountVectorizer step; keys are addressed through the
# "vectorizer" step name of the pipeline.
parameters_grid_vectorizer = dict(
    vectorizer__max_df=[0.85, 0.9, 0.95, 1.0],
    vectorizer__min_df=[1, 10, 20],
    vectorizer__ngram_range=[(1, upper) for upper in range(1, 7)],
    # NLTK list, no stop-word removal, or scikit-learn's built-in English list.
    vectorizer__stop_words=[stop_words, None, "english"],
)

In [15]:
# Search spaces for the "classifier" step of each linear model.

# LogisticRegression
parameters_grid_lr = dict(
    classifier__C=[0.8, 1, 1.2, 1.4],
    classifier__max_iter=[60, 80, 100, 120],
    classifier__solver=['lbfgs', 'liblinear', 'sag'],
)

# SGDClassifier
parameters_grid_sgdc = dict(
    classifier__loss=["log", "hinge", "modified_huber"],
    classifier__penalty=["l1", "l2", "elasticnet"],
    classifier__n_iter=[4, 5, 6, 8, 10],
)

# LinearSVC
parameters_grid_lsvc = dict(
    classifier__loss=["hinge", "squared_hinge"],
    classifier__max_iter=[400, 500, 600, 800, 1000],
    classifier__tol=[1e-5, 1e-4, 1e-3],
    classifier__C=[0.9, 1.0, 1.1, 1.2],
)
#'classifier__penalty' : ["l1", "l2"],

# RidgeClassifier
parameters_grid_rc = dict(
    classifier__alpha=[0.6, 0.8, 1, 1.2, 2, 5],
    classifier__normalize=[True, False],
    classifier__tol=[0.0001, 0.0005, 0.001, 0.0015, 0.002],
    classifier__solver=["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag"],
)

In [18]:
# List every tunable parameter name of a vectorizer -> tf-idf -> LinearSVC pipeline.
pipeline_ = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("transformer", TfidfTransformer()),
    ("classifier", LinearSVC()),
])
pipeline_.get_params().keys()

In [64]:
%%time
LR_grid_search = estimator(LogisticRegression(random_state=1), parameters_grid_lr, 'accuracy', new_train["text"], 
                           new_train["label"])

In [79]:
# Best mean cross-validated accuracy and the parameters that achieved it.
print("LogisticRegression accuracy: ")
print(LR_grid_search.best_score_, LR_grid_search.best_params_, sep="\n")

In [69]:
%%time
SGDC_grid_search = estimator(SGDClassifier(random_state=1), parameters_grid_sgdc, 'accuracy', new_train["text"], 
                             new_train["label"])

In [72]:
# Best mean cross-validated accuracy and the parameters that achieved it.
print("SGDClassifier accuracy: ")
print(SGDC_grid_search.best_score_, SGDC_grid_search.best_params_, sep="\n")

In [36]:
%%time
LSVC_grid_search_1 = estimator(LinearSVC(random_state=1), parameters_grid_lsvc, 'accuracy', new_train["text"], 
                               new_train["label"])

In [37]:
# Best mean cross-validated accuracy and the parameters that achieved it.
print("LinearSVC accuracy: ")
print(LSVC_grid_search_1.best_score_, LSVC_grid_search_1.best_params_, sep="\n")

In [85]:
%%time
RC_grid_search = estimator(RidgeClassifier(random_state=1), parameters_grid_rc, 'accuracy', new_train["text"], 
                               new_train["label"])

In [86]:
# Best mean cross-validated accuracy and the parameters that achieved it.
print("RidgeClassifier accuracy: ")
print(RC_grid_search.best_score_, RC_grid_search.best_params_, sep="\n")

#### the SGDClassifier, RidgeClassifier and LinearSVC classifiers showed the best results with default parameters

#### tune the CountVectorizer() parameters for the previously selected SGDClassifier, RidgeClassifier and LinearSVC

In [87]:
%%time
SGDC_grid_search_ = estimator(SGDClassifier(n_iter=6, loss='log', penalty='l2', random_state=1), 
                              parameters_grid_vectorizer, 'accuracy', new_train["text"], new_train["label"])

In [88]:
# Best mean cross-validated accuracy and the vectorizer parameters that achieved it.
print("SGDClassifier_full accuracy: ")
print(SGDC_grid_search_.best_score_, SGDC_grid_search_.best_params_, sep="\n")

In [89]:
%%time
RC_grid_search_ = estimator(RidgeClassifier(tol=0.0001, solver='auto', alpha=0.6, normalize=True, random_state=1), 
                            parameters_grid_vectorizer, 'accuracy', new_train["text"], new_train["label"])

In [90]:
# Best mean cross-validated accuracy and the vectorizer parameters that achieved it.
print("RidgeClassifier_full accuracy: ")
print(RC_grid_search_.best_score_, RC_grid_search_.best_params_, sep="\n")

In [91]:
%%time
LSVC_grid_search_ = estimator(LinearSVC(max_iter=600, loss='squared_hinge', random_state=1), 
                              parameters_grid_vectorizer, 'accuracy', new_train["text"], new_train["label"])

In [92]:
# Best mean cross-validated accuracy and the vectorizer parameters that achieved it.
print("LinearSVC_full accuracy: ")
print(LSVC_grid_search_.best_score_, LSVC_grid_search_.best_params_, sep="\n")

In [55]:
%%time
LSVC_grid_search__1 = estimator(LinearSVC(max_iter=400, loss='squared_hinge', C= 1.1, tol=1e-05, random_state=1), 
                             parameters_grid_vectorizer, 'accuracy', new_train["text"], new_train["label"])

In [56]:
# Best mean cross-validated accuracy and the vectorizer parameters that achieved it.
print("LinearSVC__1full accuracy: ")
print(LSVC_grid_search__1.best_score_, LSVC_grid_search__1.best_params_, sep="\n")

### Best linear classifier - LinearSVC(max_iter=600, loss='squared_hinge', C= 1.1, tol=1e-05) with CountVectorizer(min_df=1, ngram_range=(1, 3), max_df=0.85, stop_words=None)   accuracy - 0.86 

### Decision Tree classifiers 

In [25]:
# Tree-based classifier classes compared in this section.
# NOTE(review): this list is not referenced by the cells visible below - confirm whether it is still needed.
tree_classifiers = [DecisionTreeClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier]

In [24]:
#pipeline__ = Pipeline(steps = [("classifier", GradientBoostingClassifier())])
#pipeline__.get_params().keys()

#### DecisionTreeClassifier
      ['classifier__min_impurity_split',  'classifier__max_leaf_nodes',  'classifier__max_features', 
      'classifier__min_samples_split',   'classifier__min_weight_fraction_leaf',    'classifier__splitter',
      'classifier__class_weight',   'steps',   'classifier__max_depth',   'classifier__min_samples_leaf',
      'classifier__presort',   'classifier',   'classifier__random_state',   'classifier__criterion']
#### RandomForestClassifier
      ['classifier__bootstrap',   'classifier__min_impurity_split',   'classifier__max_depth',   'classifier__max_features',
       'classifier__min_samples_split',   'classifier__min_samples_leaf',   'classifier__oob_score',
       'classifier__class_weight',   'classifier__n_estimators',   'classifier__max_leaf_nodes',   'classifier__random_state',
       'steps',   'classifier__warm_start',   'classifier__n_jobs',   'classifier__min_weight_fraction_leaf',   'classifier',
       'classifier__verbose',   'classifier__criterion']  
#### BaggingClassifier
      ['classifier__bootstrap',   'classifier__max_features',   'classifier__base_estimator',   'classifier__oob_score',
       'classifier__n_estimators',   'classifier__random_state',   'classifier__max_samples', 'classifier__bootstrap_features',
       'steps',   'classifier__warm_start',   'classifier__n_jobs',   'classifier',   'classifier__verbose']       
#### GradientBoostingClassifier
      ['classifier__min_impurity_split',   'classifier__max_features',   'classifier__subsample',   'classifier__max_depth',
       'classifier__alpha',   'classifier__min_samples_split',   'classifier__learning_rate',   'classifier__min_samples_leaf',
       'classifier__criterion',   'classifier__loss',   'classifier__n_estimators',   'classifier__max_leaf_nodes',
       'steps',   'classifier__warm_start',   'classifier__verbose',   'classifier__presort',   
       'classifier__min_weight_fraction_leaf',   'classifier',   'classifier__random_state',   'classifier__init']               

In [28]:
# Search spaces for the "classifier" step of the tree-based models.

# DecisionTreeClassifier
parameters_grid_dtc = dict(
    classifier__max_depth=range(6, 31, 4),
    classifier__min_samples_split=range(2, 4, 1),
    classifier__min_samples_leaf=[1, 2, 3],
    classifier__min_impurity_split=[10 ** (-exp) for exp in (10, 9, 8, 7)],
)

# RandomForestClassifier
parameters_grid_rfc = dict(
    classifier__n_estimators=range(2, 32, 5),
    classifier__max_depth=[None, *range(2, 22, 4)],
    classifier__min_samples_split=range(2, 4, 1),
    classifier__min_samples_leaf=[1, 2, 3],
    classifier__max_features=["auto", "sqrt", "log2"],
    classifier__min_impurity_split=[10 ** (-exp) for exp in (8, 7)],
)

# BaggingClassifier
parameters_grid_bc = dict(
    classifier__n_estimators=range(10, 100, 20),
    classifier__warm_start=[False, True],
    classifier__bootstrap_features=[False, True],
)


In [26]:
%%time
DTC_grid_search = estimator(DecisionTreeClassifier(random_state=1), parameters_grid_dtc, 'accuracy', 
                            new_train["text"], new_train["label"])

In [27]:
# Best mean cross-validated accuracy and the parameters that achieved it.
print("DecisionTreeClassifier accuracy: ")
print(DTC_grid_search.best_score_, DTC_grid_search.best_params_, sep="\n")

In [28]:
%%time
RFC_grid_search = estimator(RandomForestClassifier(random_state=1), parameters_grid_rfc, 'accuracy', 
                            new_train["text"], new_train["label"])

In [29]:
# Best mean cross-validated accuracy and the parameters that achieved it.
print("RandomForestClassifier accuracy: ")
print(RFC_grid_search.best_score_, RFC_grid_search.best_params_, sep="\n")

In [36]:
# 4-fold accuracy of a random forest with 32 trees, a value outside the
# n_estimators range of the grid above.
rfc_32 = RandomForestClassifier(n_estimators=32, min_samples_split=3, max_features='log2',
                                min_impurity_split=1e-08, random_state=1)
rfc_32_scores = cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), rfc_32),
                                new_train["text"], new_train["label"], scoring= 'accuracy', cv=4)
print(rfc_32_scores.mean())

In [30]:
%%time
BC_grid_search = estimator(BaggingClassifier(random_state=1), parameters_grid_bc, 'accuracy', 
                           new_train["text"], new_train["label"])

In [31]:
# Best mean cross-validated accuracy and the parameters that achieved it.
print("BaggingClassifier accuracy: ")
print(BC_grid_search.best_score_, BC_grid_search.best_params_, sep="\n")

#### best decision tree classifiers - RandomForestClassifier and BaggingClassifier

#### tune the CountVectorizer() parameters for the previously selected RandomForestClassifier and BaggingClassifier

In [24]:
%%time
RFC_grid_search_ = estimator(RandomForestClassifier(min_impurity_split=1e-08, max_features='log2', min_samples_split=3, 
                                                    n_estimators=27, min_samples_leaf=1, max_depth=None, random_state=1), 
                             parameters_grid_vectorizer, 'accuracy', new_train["text"], new_train["label"])

In [25]:
# Best mean cross-validated accuracy and the vectorizer parameters that achieved it.
print("RandomForestClassifier full accuracy: ")
print(RFC_grid_search_.best_score_, RFC_grid_search_.best_params_, sep="\n")

In [26]:
%%time
BC_grid_search_ = estimator(BaggingClassifier(bootstrap_features=True, n_estimators=50, warm_start=False, random_state=1), 
                            parameters_grid_vectorizer, 'accuracy', new_train["text"], new_train["label"])

In [27]:
# Best mean cross-validated accuracy and the vectorizer parameters that achieved it.
print("BaggingClassifier full accuracy: ")
print(BC_grid_search_.best_score_, BC_grid_search_.best_params_, sep="\n")

#### tuning CountVectorizer() parameters led to an increase of  BaggingClassifier score and decrease of RandomForestClassifier score

### Best decision tree classifier - RandomForestClassifier(n_estimators=32, min_samples_split=3,  max_features='log2', min_impurity_split=1e-08, random_state=1); accuracy - 0.8467; CountVectorizer() with default parameters

#### conda install -c rasbt mlxtend

##### gradient boosting classifier

In [29]:
# List the tunable parameter names of GradientBoostingClassifier as seen through a pipeline.
gbc_step = ("classifier", GradientBoostingClassifier())
pipeline__ = Pipeline(steps=[gbc_step])
pipeline__.get_params().keys()

#### Initial values: <br> 1)min_samples_split = 25 :  ~0.5-1% from total value <br> 2)min_samples_leaf = 50 <br> 3)max_depth = 6 <br> 4)max_features = "sqrt" <br> 5)subsample = 0.8

In [30]:
# Stage 1: number of boosting stages, 20..110 in steps of 10.
param_test1 = dict(classifier__n_estimators=range(20, 120, 10))

In [34]:
%%time
grid_cv_gbr_1 = GridSearchCV( Pipeline([("vectorizer", CountVectorizer()), 
                                        ("transformer", TfidfTransformer()),
                                        ('to_dense', DT()), 
                                        ("classifier", GradientBoostingClassifier(learning_rate=0.1, min_samples_split=25,
                                                                                  min_samples_leaf=50, max_depth=6,
                                                                                  max_features='sqrt', subsample=0.8, 
                                                                                  random_state=1))]),
                              param_grid = param_test1,  scoring = 'accuracy', cv = 4)
grid_cv_gbr_1.fit(np.array(new_train["text"]), np.array(new_train["label"]))

In [35]:
# GridSearchCV from sklearn.model_selection exposes per-candidate results through
# `cv_results_`; the old `grid_scores_` attribute is not available in scikit-learn >= 0.20.
print("all scores: ", list(zip(grid_cv_gbr_1.cv_results_["params"],
                               grid_cv_gbr_1.cv_results_["mean_test_score"])))
print("best score: ", grid_cv_gbr_1.best_score_)
print("best params: ", grid_cv_gbr_1.best_params_)

In [39]:
# Stage 2: tree depth and minimum samples required to split a node.
param_test2 = dict(
    classifier__max_depth=range(4, 20, 4),
    classifier__min_samples_split=range(10, 50, 10),
)

In [40]:
%%time
grid_cv_gbr_2 = GridSearchCV( Pipeline([("vectorizer", CountVectorizer()), 
                                        ("transformer", TfidfTransformer()),
                                        ('to_dense', DT()), 
                                        ("classifier", GradientBoostingClassifier(learning_rate=0.1, n_estimators= 80,
                                                                                  max_features='sqrt',subsample=0.8,
                                                                                  random_state=1))]),
                              param_grid = param_test2, n_jobs=4, scoring = 'accuracy', cv = 4)
grid_cv_gbr_2.fit(np.array(new_train["text"]), np.array(new_train["label"]))

In [41]:
# GridSearchCV from sklearn.model_selection exposes per-candidate results through
# `cv_results_`; the old `grid_scores_` attribute is not available in scikit-learn >= 0.20.
print("all scores: ", list(zip(grid_cv_gbr_2.cv_results_["params"],
                               grid_cv_gbr_2.cv_results_["mean_test_score"])))
print("best score: ", grid_cv_gbr_2.best_score_)
print("best params: ", grid_cv_gbr_2.best_params_)

In [68]:
# Stage 3: number of features considered per split, 390..410 in steps of 5.
param_test3 = dict(classifier__max_features=range(390, 415, 5))

In [70]:
%%time
grid_cv_gbr_3 = GridSearchCV( Pipeline([("vectorizer", CountVectorizer()), 
                                        ("transformer", TfidfTransformer()),
                                        ('to_dense', DT()), 
                                        ("classifier", GradientBoostingClassifier(learning_rate=0.1, n_estimators= 80 ,
                                                                                  max_depth=16, min_samples_split=30,
                                                                                  subsample=0.8,random_state=1))]),
                              param_grid = param_test3, n_jobs=4, scoring = 'accuracy', cv = 4)
grid_cv_gbr_3.fit(np.array(new_train["text"]), np.array(new_train["label"]))

In [71]:
# GridSearchCV from sklearn.model_selection exposes per-candidate results through
# `cv_results_`; the old `grid_scores_` attribute is not available in scikit-learn >= 0.20.
print("all scores: ", list(zip(grid_cv_gbr_3.cv_results_["params"],
                               grid_cv_gbr_3.cv_results_["mean_test_score"])))
print("best score: ", grid_cv_gbr_3.best_score_)
print("best params: ", grid_cv_gbr_3.best_params_)

In [73]:
# Stage 4: fraction of samples drawn for each boosting stage.
param_test4 = dict(classifier__subsample=[0.6, 0.7, 0.75, 0.8, 0.85, 0.9])

In [74]:
%%time
grid_cv_gbr_4 = GridSearchCV( Pipeline([("vectorizer", CountVectorizer()), 
                                        ("transformer", TfidfTransformer()),
                                        ('to_dense', DT()), 
                                        ("classifier", GradientBoostingClassifier(learning_rate=0.1, n_estimators= 80 ,
                                                                                  max_depth=16, min_samples_split=30,
                                                                                  max_features=390, random_state=1))]),
                              param_grid = param_test4, n_jobs=4, scoring = 'accuracy', cv = 4)
grid_cv_gbr_4.fit(np.array(new_train["text"]), np.array(new_train["label"]))

In [75]:
# GridSearchCV from sklearn.model_selection exposes per-candidate results through
# `cv_results_`; the old `grid_scores_` attribute is not available in scikit-learn >= 0.20.
print("all scores: ", list(zip(grid_cv_gbr_4.cv_results_["params"],
                               grid_cv_gbr_4.cv_results_["mean_test_score"])))
print("best score: ", grid_cv_gbr_4.best_score_)
print("best params: ", grid_cv_gbr_4.best_params_)

In [42]:
%%time
print("GradientBoostingClassifier learning_rate=0.05, n_estimators= 160", ' accuracy ', 
      cross_val_score(Pipeline( [("vectorizer", CountVectorizer()), 
                                 ("transformer", TfidfTransformer()),
                                 ('to_dense', DT()), 
                                 ("classifier", GradientBoostingClassifier(learning_rate=0.05, n_estimators= 160 ,
                                                                           max_depth=16, min_samples_split=30,max_features=390,
                                                                           subsample=0.8, random_state=1))] ), 
                      new_train["text"], new_train["label"], scoring= 'accuracy', cv = 4).mean())

#### best parameters set at this moment - GradientBoostingClassifier(learning_rate=0.1, n_estimators= 80, max_depth=16, min_samples_split=30, max_features=390, subsample=0.8, random_state=1)

#### start to tune vectorizer parameters

In [32]:
%%time
grid_cv_gbr_5 = GridSearchCV( Pipeline([("vectorizer", CountVectorizer()),
                                        ("transformer", TfidfTransformer()),
                                        ('to_dense', DT()), 
                                        ("classifier", GradientBoostingClassifier(learning_rate=0.1, n_estimators=80 ,
                                                                                  subsample=0.8, max_depth=16, 
                                                                                  min_samples_split=30,max_features=390, 
                                                                                  random_state=1))]),
                               param_grid =  {'vectorizer__max_df' : [0.85, 0.9, 0.95, 1.0]}, scoring = 'accuracy', cv = 4)
grid_cv_gbr_5.fit(np.array(new_train["text"]), np.array(new_train["label"]))

In [33]:
# GridSearchCV from sklearn.model_selection exposes per-candidate results through
# `cv_results_`; the old `grid_scores_` attribute is not available in scikit-learn >= 0.20.
print("all scores: ", list(zip(grid_cv_gbr_5.cv_results_["params"],
                               grid_cv_gbr_5.cv_results_["mean_test_score"])))
print("best score: ", grid_cv_gbr_5.best_score_)
print("best params: ", grid_cv_gbr_5.best_params_)

##### same result for max_df = 0.85, 0.9, 0.95 and 1

In [35]:
%%time
grid_cv_gbr_6 = GridSearchCV( Pipeline([("vectorizer", CountVectorizer()),
                                        ("transformer", TfidfTransformer()),
                                        ('to_dense', DT()), 
                                        ("classifier", GradientBoostingClassifier(learning_rate=0.1, n_estimators=80 ,
                                                                                  subsample=0.8, max_depth=16, 
                                                                                  min_samples_split=30, max_features='sqrt', 
                                                                                  random_state=1))]),
                              param_grid =  {'vectorizer__min_df' : [1, 10, 20]}, scoring = 'accuracy', cv = 4)
grid_cv_gbr_6.fit(np.array(new_train["text"]), np.array(new_train["label"]))

In [36]:
# GridSearchCV from sklearn.model_selection exposes per-candidate results through
# `cv_results_`; the old `grid_scores_` attribute is not available in scikit-learn >= 0.20.
print("all scores: ", list(zip(grid_cv_gbr_6.cv_results_["params"],
                               grid_cv_gbr_6.cv_results_["mean_test_score"])))
print("best score: ", grid_cv_gbr_6.best_score_)
print("best params: ", grid_cv_gbr_6.best_params_)

In [43]:
%%time
grid_cv_gbr_7 = GridSearchCV( Pipeline([("vectorizer", CountVectorizer()),
                                        ("transformer", TfidfTransformer()),
                                        ('to_dense', DT()), 
                                        ("classifier", GradientBoostingClassifier(learning_rate=0.1, n_estimators=80 ,
                                                                                  subsample=0.8, max_depth=16, 
                                                                                  min_samples_split=30, max_features='sqrt', 
                                                                                  random_state=1))]),
                              param_grid =  {'vectorizer__ngram_range' : [(1, 1), (1, 2), (1, 3)]}, 
                              scoring = 'accuracy', cv = 4, verbose=1)
grid_cv_gbr_7.fit(np.array(new_train["text"]), np.array(new_train["label"]))

In [74]:
# Best mean cross-validated accuracy and the n-gram range that achieved it.
for caption, value in (("best score: ", grid_cv_gbr_7.best_score_),
                       ("best params: ", grid_cv_gbr_7.best_params_)):
    print(caption, value)

In [76]:
%%time
grid_cv_gbr_8 = GridSearchCV( Pipeline([("vectorizer", CountVectorizer()),
                                        ("transformer", TfidfTransformer()),
                                        ('to_dense', DT()), 
                                        ("classifier", GradientBoostingClassifier(learning_rate=0.1, n_estimators=80 ,
                                                                                  subsample=0.8, max_depth=16, 
                                                                                  min_samples_split=30, max_features='sqrt', 
                                                                                  random_state=1))]),
                              param_grid =  {'vectorizer__stop_words' : [stop_words, None, "english"]}, 
                              scoring = 'accuracy', cv = 4)
grid_cv_gbr_8.fit(np.array(new_train["text"]), np.array(new_train["label"]))

In [77]:
# Best mean cross-validated accuracy and the stop-word setting that achieved it.
for caption, value in (("best score: ", grid_cv_gbr_8.best_score_),
                       ("best params: ", grid_cv_gbr_8.best_params_)):
    print(caption, value)

#### Best result - GradientBoostingClassifier(learning_rate=0.1, n_estimators= 80, max_depth=16, min_samples_split=30, max_features=390, subsample=0.8, random_state=1)

### XGBoost

In [29]:
#mingw_path = 'C:\\mingw64\\bin'
#mingw_path1 ='C:\\mingw64\\x86_64-w64-mingw32\\bin'
#os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [12]:
# Candidate tree counts for the pipeline's "classifier" step: 1, then 55..195 in steps of 10.
parameters_grid_xgbc = {'classifier__n_estimators' : [1] + list(range(55, 200, 10))}

In [13]:
# CountVectorizer -> TF-IDF -> XGBoost pipeline; the final expression lists the
# parameter names that can be addressed in a grid search.
xgb_pipeline_steps = [
    ("vectorizer", CountVectorizer()),
    ("transformer", TfidfTransformer()),
    ("classifier", xgb.XGBClassifier()),
]
pipeline_ = Pipeline(steps = xgb_pipeline_steps)
pipeline_.get_params().keys()

##### tune n_estimators

In [21]:
# NOTE(review): this defaultdict is never filled — the n_estimators tuning cell
# rebinds xgb_scoring to a fresh dict before storing any score.
xgb_scoring = defaultdict(dict)

In [22]:
# Tree counts to evaluate: 1, 26, 51, ..., 226.
n_trees = list(range(1, 250, 25)) 

In [23]:
%%time
xgb_scoring = {}
for n_tree in n_trees:
    estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=n_tree, min_child_weight=3, seed=0)
    score = cross_val_score(estimator, pd_token_matrix.as_matrix(), np.array(new_train["label"]), 
                            scoring = 'accuracy', n_jobs=-1, cv = 4).mean()    
    xgb_scoring[n_tree] = score
xgb_scoring = np.asmatrix(xgb_scoring)

In [24]:
# Show the mean CV accuracy recorded for each n_estimators value.
print(xgb_scoring)

[[{176: 0.7611176271811256, 1: 0.6310629618251823, 226: 0.7615131502416645, 51: 0.7319022036536413, 101: 0.7455079913164577, 151: 0.7591169820594741, 201: 0.7615157102482183, 26: 0.6998893462767264, 76: 0.7427099021053494, 126: 0.7551080117965103}]]

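##### best n_estimators: 176 (201 and 226 score only about 0.0004 higher, so accuracy has plateaued; 175 is used in the cells below)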

##### tune max_depth

In [32]:
# Tree depths to evaluate: 4, 6, ..., 22.
max_depth = range(4, 24, 2) 

In [33]:
# NOTE(review): redundant — the max_depth tuning cell re-initialises xgb_scoring_1 itself.
xgb_scoring_1 = {}

In [34]:
%%time
xgb_scoring_1 = {}
for depth in max_depth:
    print(depth, end=" ")
    estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=depth, n_estimators=175, min_child_weight=3, seed=0)
    score = cross_val_score(estimator, pd_token_matrix.as_matrix(), np.array(new_train["label"]), 
                            scoring = 'accuracy', n_jobs=-1, cv = 4).mean()    
    xgb_scoring_1[depth] = score
xgb_scoring_1 = np.asmatrix(xgb_scoring_1)

In [31]:
# Show the mean CV accuracy recorded for each max_depth value.
print(xgb_scoring_1)

[[{16: 0.7923228106823954, 18: 0.7987285737691487, 4: 0.7579144220529204, 6: 0.7635182784467928, 8: 0.767916365609896, 10: 0.7667202076677316, 12: 0.7763221409846809, 14: 0.7835202404358156}, {20: 0.7959253747849595, 22: 0.7943260055705742}]]

##### best max_depth: 18

##### tune min_child_weight

In [36]:
# min_child_weight values to evaluate: 1, 3, 5, 7.
min_child_weight = range(1,9,2)

In [37]:
# Accumulator filled by the tuning loop with one mean CV accuracy per min_child_weight value.
xgb_scoring_2 = {}

In [39]:
%%time
for weight in min_child_weight:
    print(weight, end=" ")
    estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=18, n_estimators=175, min_child_weight=weight, seed=0)
    score = cross_val_score(estimator, pd_token_matrix.as_matrix(), np.array(new_train["label"]), 
                            scoring = 'accuracy',n_jobs=-1 , cv = 4).mean()    
    xgb_scoring_2[weight] = score
xgb_scoring_2 = np.asmatrix(xgb_scoring_2)

In [40]:
# Show the mean CV accuracy recorded for each min_child_weight value.
print(xgb_scoring_2)

[[{1: 0.8251414250020479, 3: 0.7987285737691487, 5: 0.7667208425493569, 7: 0.7487124866879659}]]

##### best min_child_weight: 1

##### tune subsample

In [12]:
# Row-subsampling fractions to evaluate, plus an empty accumulator for their scores.
subsample = [0.6, 0.7, 0.8, 0.9]
xgb_scoring_3 = dict()

In [13]:
%%time
for i in subsample:
    print(i, end=" ")
    estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=18, n_estimators=175, min_child_weight=1, subsample=i, seed=0)
    score = cross_val_score(estimator, pd_token_matrix.as_matrix(), np.array(new_train["label"]), 
                            scoring = 'accuracy', n_jobs=-1 , cv = 4).mean()    
    xgb_scoring_3[i] = score
xgb_scoring_3 = np.asmatrix(xgb_scoring_3)

In [14]:
# Show the mean CV accuracy recorded for each subsample value.
print(xgb_scoring_3)

[[{0.6: 0.8187369460145818, 0.7: 0.8147356578192839, 0.9: 0.8203382239698533, 0.8: 0.8207375768001967}]]

##### best subsample: default

##### tune gamma

In [15]:
# NOTE(review): redundant — the cell that defines the gamma grid assigns xgb_scoring_4 = {} again.
xgb_scoring_4 = {}

In [16]:
# gamma values to evaluate, plus an empty accumulator for their scores.
gamma = [0.0, 0.1, 0.2, 0.3, 0.4]
xgb_scoring_4 = dict()

In [17]:
%%time
for i in gamma:
    print(i, end=" ")
    estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=18, n_estimators=175, min_child_weight=1, gamma=i, seed=0)
    score = cross_val_score(estimator, pd_token_matrix.as_matrix(), np.array(new_train["label"]), 
                            scoring = 'accuracy', n_jobs=-1 , cv = 4).mean()    
    xgb_scoring_4[i] = score
xgb_scoring_4 = np.asmatrix(xgb_scoring_4)

In [18]:
# Show the mean CV accuracy recorded for each gamma value.
print(xgb_scoring_4)

[[{0.0: 0.8251414250020479, 0.1: 0.8203426947652985, 0.2: 0.8239375829442124, 0.4: 0.8227350311296797, 0.3: 0.8255407880724175}]]

##### best gamma: 0.3

##### tune colsample_bytree

In [20]:
# colsample_bytree fractions to evaluate, plus an empty accumulator for their scores.
collsample = [0.6, 0.7, 0.8, 0.9]
xgb_scoring_5 = dict()

In [21]:
%%time
for i in collsample:
    print(i, end=" ")
    estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=18, n_estimators=175, min_child_weight=1, colsample_bytree=i, 
                                  gamma=0.3, seed=0)
    score = cross_val_score(estimator, pd_token_matrix.as_matrix(), np.array(new_train["label"]), 
                            scoring = 'accuracy', n_jobs=-1 , cv = 4).mean()    
    xgb_scoring_5[i] = score
xgb_scoring_5 = np.asmatrix(xgb_scoring_5)

In [22]:
# Show the mean CV accuracy recorded for each colsample_bytree value.
print(xgb_scoring_5)

[[{0.6: 0.8215337367903662, 0.7: 0.8207433439829606, 0.9: 0.8207446239862375, 0.8: 0.8167414127140165}]]

##### best colsample_bytree: default

##### tune regularization

In [23]:
# reg_alpha values to evaluate (passed as reg_alpha by the tuning loop), plus an
# empty accumulator for their scores.
reg = [1e-5, 1e-4, 1e-2, 1, 10]
xgb_scoring_6 = {}

In [24]:
%%time
for i in reg:
    print(i, end=" ")
    estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=18, n_estimators=175, min_child_weight=1, gamma=0.3, 
                                  reg_alpha=i, seed=0)
    score = cross_val_score(estimator, pd_token_matrix.as_matrix(), np.array(new_train["label"]), 
                            scoring = 'accuracy', n_jobs=-1 , cv = 4).mean()    
    xgb_scoring_6[i] = score
xgb_scoring_6 = np.asmatrix(xgb_scoring_6)

In [25]:
# Show the mean CV accuracy recorded for each reg_alpha value.
print(xgb_scoring_6)

##### best regularization: 0.0001

### Final best xgboost parameters: xgb.XGBClassifier(learning_rate=0.1, max_depth=18, n_estimators=175, min_child_weight=1, gamma=0.3, reg_alpha=0.0001, seed=0); <br> final score: 0.8287

In [71]:
%%time
print cross_val_score(xgb.XGBClassifier(learning_rate=0.2, max_depth=16, n_estimators=200, min_child_weight=3, seed=0), 
                      pd_token_matrix.as_matrix(), np.array(new_train["label"]), scoring = 'accuracy', n_jobs=-1, cv = 4).mean() 

### Test dataset

In [38]:
# Preview the test frame: its first rows, then its (rows, columns) shape on the next line.
print(test.head(), test.shape, sep="\n")

#### Best linear classifier: <br>  LinearSVC(max_iter=600, loss='squared_hinge') with CountVectorizer(min_df=1, ngram_range=(1, 3), max_df=0.85, stop_words=None); <br><br> Best tree classifier: <br>  RandomForestClassifier(n_estimators=32, min_samples_split=3, max_features='log2', min_impurity_split=1e-08, random_state=1) and GradientBoostingClassifier(learning_rate=0.05, n_estimators= 160 , max_depth=16,min_samples_split=30,max_features=390, subsample=0.8, random_state=1))

In [76]:
# Number of test texts that need a prediction.
print(test["text"].shape)

In [57]:
# Assemble a text_classifier from a 1- to 5-gram CountVectorizer, a TF-IDF transformer
# and a LinearSVC, then fit it on the full training set.
svc_vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 5), max_df=0.85, stop_words=None)
svc_estimator = LinearSVC(max_iter=400, loss='squared_hinge', C= 1.1, tol=1e-05, random_state=1)
Lin_SVC = text_classifier(vectorizer=svc_vectorizer,
                          transformer=TfidfTransformer(),
                          classifier=svc_estimator)

Lin_SVC.fit(new_train["text"], new_train["label"])
print(Lin_SVC)

##### make prediction

In [59]:
# Predict a label for every test text with the fitted Lin_SVC model and print the result.
lin_svc_result = Lin_SVC.predict(test["text"])
print(lin_svc_result)

In [60]:
# Write the submission file: an "Id,y" header followed by one "<row index>,<label>" line
# per prediction. A two-column frame lets pandas emit the comma-separated rows directly,
# and sizing the Id column from the predictions removes the hard-coded 500-row assumption.
lin_svc_submission = pd.DataFrame({"Id": np.arange(len(lin_svc_result)), "y": lin_svc_result},
                                  columns=["Id", "y"])
lin_svc_submission.to_csv("answer_lin_csv_4.csv", index=False)

In [43]:
# Assemble a text_classifier from a default CountVectorizer, a TF-IDF transformer and a
# random forest, then fit it on the full training set.
# NOTE(review): min_impurity_split is deprecated in newer scikit-learn releases — confirm the
# installed version still accepts it before re-running.
rf_class = text_classifier(vectorizer=CountVectorizer(), 
                transformer=TfidfTransformer(), 
                classifier=RandomForestClassifier(n_estimators=32, min_samples_split=3, max_features='log2', 
                                                  min_impurity_split=1e-08, random_state=1))

rf_class.fit(new_train["text"], new_train["label"])
# print() as a function: the Python 2 statement form is a SyntaxError in Python 3.
print(rf_class)

In [45]:
# Predict a label for every test text with the fitted random-forest model and print the result.
rf_result = rf_class.predict(test["text"])
# print() as a function: the Python 2 statement form is a SyntaxError in Python 3.
print(rf_result)

In [120]:
# Write the submission file: an "Id,y" header followed by one "<row index>,<label>" line
# per prediction. A two-column frame lets pandas emit the comma-separated rows directly,
# and sizing the Id column from the predictions removes the hard-coded 500-row assumption.
rf_submission = pd.DataFrame({"Id": np.arange(len(rf_result)), "y": rf_result},
                             columns=["Id", "y"])
rf_submission.to_csv("answer_rf_1.csv", index=False)

In [95]:
# Load the sample submission file (first row is the header).
# NOTE(review): sep="\t" looks suspicious for a .csv file — confirm the delimiter; with the
# wrong separator every row would be read as a single column.
res_example = pd.read_csv("products_sentiment_sample_submission.csv", header = 0, sep="\t")

In [96]:
# Display the sample submission frame.
res_example

### Best result:

In [None]:
# Configuration recorded as the best result, written as a text_classifier call in the same
# form as the Lin_SVC cell. The bare keyword lines were not valid Python: a trailing comma
# followed by indented continuation lines raises an IndentationError.
best_model = text_classifier(vectorizer=CountVectorizer(min_df=1, ngram_range=(1, 3), max_df=0.85, stop_words=None),
                             transformer=TfidfTransformer(),
                             classifier=LinearSVC(max_iter=400, loss='squared_hinge', C= 1.1, tol=1e-05, random_state=1))

In [39]:
# Render the saved result image from the data directory inline in the notebook.
from IPython.display import Image
Image("./data/result.jpg")

##### It may be possible to improve the result further with neural networks, or by transforming the features — for example, by reducing the dimensionality of the feature space

##### dimensionality reduction

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the dense form of frequency_counts onto 2500 principal components.
# PCA accepts a plain array, so the DataFrame wrapper is dropped; this also removes the
# dependency on CountVectorizer.get_feature_names(), which recent scikit-learn no longer has.
# fit_transform densifies the matrix once instead of once for fit and again for transform.
model = PCA(n_components=2500, svd_solver='full')
reduced_token_mtx = model.fit_transform(frequency_counts.A)
print(reduced_token_mtx.shape)

In [None]:
# Build TF-IDF features from unigrams and bigrams of the training texts, then reduce them
# to 2500 principal components.
cvect_r = CountVectorizer(ngram_range=(1, 2))
token_mtx_r = cvect_r.fit_transform(new_train["text"])

tfidf_r = TfidfTransformer()
freq_r = tfidf_r.fit_transform(token_mtx_r)

# Project the same matrix the PCA is fitted on, instead of `reduce_pd_token` (a name this
# cell never defines), so the cell is self-contained and the rows of reduced_pd line up
# with new_train["label"]. PCA takes a plain array, so no DataFrame or feature names are needed.
# NOTE(review): the PCA is fitted on all rows before cross-validation, so downstream CV
# scores may be slightly optimistic — consider moving the PCA into a Pipeline.
model_ = PCA(n_components=2500, svd_solver='full')
reduced_pd = model_.fit_transform(freq_r.toarray())

In [None]:
# Mean 4-fold CV accuracy of a random forest trained on the PCA-reduced features.
rf_reduced = RandomForestClassifier(n_estimators=36, min_samples_split=3,
                                    max_features='log2', min_impurity_split=1e-08,
                                    random_state=1)
rf_reduced_scores = cross_val_score(rf_reduced, reduced_pd, new_train["label"],
                                    scoring= 'accuracy', cv=4)
print(rf_reduced_scores.mean())