In [193]:
import pandas as pd
import numpy as np
import pprint as pp
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

import xgboost as xgb
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# uncomment and run it first!

# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [117]:
def process_data(data, func):
    pr_data = data.copy()
    index = data.shape[0]
    for i in range(index):
        pr_data[i] = " ".join([func(w) for w in data[i].split()])
    return pr_data


def build_pipe(vect, model, stopwords=None, ngram_range=(1,1), analyzer='word'):
    return Pipeline([("count", vect(stop_words=stopwords, ngram_range=ngram_range, analyzer=analyzer)),
                     ("model", model())])


def train_test_models(X_train, y_train, models_cls=None, vectorizer_cls=None, 
                      random_state=None, min_df=None,stopwords=None,
                      ngram_range=(1,1), analyzer='word',
                      vectorizer_names=None, model_names=None):
    results = list()
    mean = list()
    if not models_cls:
        models_cls = [LogisticRegression,
                      RandomForestClassifier,
                      LinearSVC,
                      SGDClassifier]
        model_names = ["LogReg", "RF Clas.", "LinearSVC", "SGD Clas."]
    if not vectorizer_cls:
        vectorizer_cls = [TfidfVectorizer, CountVectorizer]
        vectorizer_names = ["TfidfVec", "CntVec"]
       
    _vectorizer_names = iter(vectorizer_names)
    for vectorizer in vectorizer_cls:
        vector_name = _vectorizer_names.__next__()
        _model_names = iter(model_names)
        for model in models_cls:
            pipe = build_pipe(vectorizer, model, stopwords=stopwords,
                              ngram_range=ngram_range, analyzer=analyzer)
            score = cross_val_score(pipe, X_train, y_train, scoring='accuracy')
            mean.append(np.mean(score))
            results.append("model: {}, vectorizer: {}  scores: {}, mean: {}".format(_model_names.__next__(),
                                                                                       vector_name, score,
                                                                                       round(np.mean(score),4)))
            
    return results, mean


In [111]:
path = "/media/winter/vm/github/kaggle/Product_Sentiment/"
train = pd.read_csv(path + "products_sentiment_train.tsv", sep="\t", names=['text','bin'])
test = pd.read_csv(path + "products_sentiment_test.tsv", sep="\t",names=['text'])
sample = pd.read_csv(path + "products_sentiment_sample_submission.csv", sep="\t")

test.drop(test.index[[0]],axis=0, inplace=True)

# Default words
X_train = train['text']
y_train = train['bin']
kaggle_X_test = test["text"]

# prosessed words
lemma_X_train = process_data(train['text'], WordNetLemmatizer().lemmatize)
snow_X_train = process_data(train['text'], SnowballStemmer('english').stem)
porter_X_train = process_data(train['text'], PorterStemmer().stem)


In [26]:
# Each stem/lemm has different word handling
print(lemma_X_train[1], end='\n \n')
print(snow_X_train[1], end='\n \n')
print(porter_X_train[1])

i downloaded a trial version of computer associate ez firewall and antivirus and fell in love with a computer security system all over again .
 
i download a trial version of comput associ ez firewal and antivirus and fell in love with a comput secur system all over again .
 
i download a trial version of comput associ ez firewal and antiviru and fell in love with a comput secur system all over again .


In [27]:
print("CV+LogR", cross_val_score(build_pipe(CountVectorizer, LogisticRegression), 
                                X_train, y_train, scoring="accuracy"))
print("Tfidf+ RandForesCls", cross_val_score(build_pipe(TfidfVectorizer, RandomForestClassifier),
                                            X_train, y_train, scoring="accuracy"))
print("Tfidf+ LiearSVC",cross_val_score(build_pipe(TfidfVectorizer, LinearSVC), 
                                        X_train, y_train, scoring="accuracy"))
print("Tfidf+ SGDcls",cross_val_score(build_pipe(CountVectorizer, SGDClassifier),
                                      X_train, y_train, scoring="accuracy"))

CV+LogR [0.76611694 0.74512744 0.77777778]
Tfidf+ RandForesCls [0.69715142 0.69265367 0.71171171]
Tfidf+ LiearSVC [0.79610195 0.77061469 0.78378378]
Tfidf+ SGDcls [0.76611694 0.75712144 0.75525526]


In [118]:
"""
train_test_models(X_train, y_train, models_cls=None, vectorizer_names=None, 
                      random_state=None, min_df=None,stopwords=None,
                      ngram_range=(1,1), analyzer='word')
                      
                      
datasets = [lemma_X_train, snow_X_train, porter_X_train]

"""

lemma_test = train_test_models(lemma_X_train, y_train, min_df=5)
snow_test = train_test_models(snow_X_train, y_train, min_df=5)
porter_test = train_test_models(porter_X_train, y_train, min_df=5)
default_test = train_test_models(X_train, y_train, min_df=5)

I'm going to take and look at all of these model setups with different NL processing.   
I woudn't take RandomForest if I doesn't know that it can be heavely hyperparam setted.

In [127]:
pp.pprint(lemma_test[0], width=50, indent=1)
print("\n overall mean {}".format(np.mean(lemma_test[1])).upper())

['model: LogReg, vectorizer: TfidfVec  scores: '
 '[0.78710645 0.74062969 0.77027027], mean: '
 '0.766',
 'model: RF Clas., vectorizer: TfidfVec  scores: '
 '[0.71814093 0.70614693 0.7042042 ], mean: '
 '0.7095',
 'model: LinearSVC, vectorizer: TfidfVec  '
 'scores: [0.7916042  0.74812594 0.77177177], '
 'mean: 0.7705',
 'model: SGD Clas., vectorizer: TfidfVec  '
 'scores: [0.76611694 0.75262369 0.75975976], '
 'mean: 0.7595',
 'model: LogReg, vectorizer: CntVec  scores: '
 '[0.7916042  0.75112444 0.77327327], mean: '
 '0.772',
 'model: RF Clas., vectorizer: CntVec  scores: '
 '[0.70314843 0.70614693 0.73123123], mean: '
 '0.7135',
 'model: LinearSVC, vectorizer: CntVec  scores: '
 '[0.76911544 0.73613193 0.76126126], mean: '
 '0.7555',
 'model: SGD Clas., vectorizer: CntVec  scores: '
 '[0.73613193 0.73763118 0.74174174], mean: '
 '0.7385']

 OVERALL MEAN 0.7481267812039926


In [130]:
pp.pprint(snow_test[0], width=50, indent=1)
print("\n overall mean {}".format(np.mean(snow_test[1])).upper())

['model: LogReg, vectorizer: TfidfVec  scores: '
 '[0.7826087  0.74662669 0.76726727], mean: '
 '0.7655',
 'model: RF Clas., vectorizer: TfidfVec  scores: '
 '[0.72113943 0.68515742 0.73273273], mean: '
 '0.713',
 'model: LinearSVC, vectorizer: TfidfVec  '
 'scores: [0.79310345 0.75712144 0.79129129], '
 'mean: 0.7805',
 'model: SGD Clas., vectorizer: TfidfVec  '
 'scores: [0.76011994 0.75412294 0.75525526], '
 'mean: 0.7565',
 'model: LogReg, vectorizer: CntVec  scores: '
 '[0.7916042  0.74512744 0.78978979], mean: '
 '0.7755',
 'model: RF Clas., vectorizer: CntVec  scores: '
 '[0.73763118 0.70014993 0.74024024], mean: '
 '0.726',
 'model: LinearSVC, vectorizer: CntVec  scores: '
 '[0.78110945 0.74212894 0.75525526], mean: '
 '0.7595',
 'model: SGD Clas., vectorizer: CntVec  scores: '
 '[0.7826087  0.72413793 0.75675676], mean: '
 '0.7545']

 OVERALL MEAN 0.7538785974880428


In [131]:
pp.pprint(porter_test[0], width=50, indent=1)
print("\n overall mean {}".format(np.mean(porter_test[1])).upper())

['model: LogReg, vectorizer: TfidfVec  scores: '
 '[0.7826087  0.74962519 0.76726727], mean: '
 '0.7665',
 'model: RF Clas., vectorizer: TfidfVec  scores: '
 '[0.74362819 0.71814093 0.71771772], mean: '
 '0.7265',
 'model: LinearSVC, vectorizer: TfidfVec  '
 'scores: [0.7946027  0.75112444 0.77927928], '
 'mean: 0.775',
 'model: SGD Clas., vectorizer: TfidfVec  '
 'scores: [0.7886057  0.75712144 0.74624625], '
 'mean: 0.764',
 'model: LogReg, vectorizer: CntVec  scores: '
 '[0.79310345 0.73913043 0.78528529], mean: '
 '0.7725',
 'model: RF Clas., vectorizer: CntVec  scores: '
 '[0.72713643 0.69265367 0.70870871], mean: '
 '0.7095',
 'model: LinearSVC, vectorizer: CntVec  scores: '
 '[0.7856072  0.73613193 0.75225225], mean: '
 '0.758',
 'model: SGD Clas., vectorizer: CntVec  scores: '
 '[0.77211394 0.73613193 0.75675676], mean: '
 '0.755']

 OVERALL MEAN 0.7533741575158367


In [132]:
pp.pprint(default_test[0], width=50, indent=1)
print("\n overall mean {}".format(np.mean(default_test[1])).upper())

['model: LogReg, vectorizer: TfidfVec  scores: '
 '[0.76611694 0.73763118 0.76876877], mean: '
 '0.7575',
 'model: RF Clas., vectorizer: TfidfVec  scores: '
 '[0.72113943 0.68665667 0.73123123], mean: '
 '0.713',
 'model: LinearSVC, vectorizer: TfidfVec  '
 'scores: [0.78410795 0.75262369 0.76876877], '
 'mean: 0.7685',
 'model: SGD Clas., vectorizer: TfidfVec  '
 'scores: [0.77061469 0.74362819 0.75525526], '
 'mean: 0.7565',
 'model: LogReg, vectorizer: CntVec  scores: '
 '[0.7916042  0.74212894 0.78828829], mean: '
 '0.774',
 'model: RF Clas., vectorizer: CntVec  scores: '
 '[0.70914543 0.70014993 0.72522523], mean: '
 '0.7115',
 'model: LinearSVC, vectorizer: CntVec  scores: '
 '[0.76011994 0.72563718 0.76576577], mean: '
 '0.7505',
 'model: SGD Clas., vectorizer: CntVec  scores: '
 '[0.74812594 0.70164918 0.71921922], mean: '
 '0.723']

 OVERALL MEAN 0.7443167492830162


In [226]:
# I'm taking defaults, because hyperparameter optimization dosen't show improvements in a models.
# It seems snowball stemmer does better result, still 0.01 is not much. Poor improvements from NLP
# That means we can test another classificator models or imporove training dataset

stop_words=set(stopwords.words('english'))


log = LogisticRegression()
cntvec = TfidfVectorizer(min_df=2, ngram_range=(1,2))

In [227]:
# fit_transform X_train X_test data, train Logistic Regression 

X_train_ = X_train.copy()
y_train = y_train.copy()
kaggle_X_test_ = kaggle_X_test.copy()

cntvec.fit(X_train_)
X_train_ = cntvec.transform(X_train_)

log.fit(X_train_, y_train)
log_pred = log.predict(X_train_)

log_acc = accuracy_score(y_train, log_pred)

print("model score: ",log_acc)
print("vocab capacity:",len(cntvec.vocabulary_))
np.mean(cross_val_score(log, X_train_, y_train, scoring='accuracy', cv=5))

model score:  0.896
vocab capacity: 5367


0.7530068812930081

In [228]:
log_test_pred = pd.DataFrame({'y':log.predict(cntvec.transform(kaggle_X_test_))})
log_test_pred.index.name='id'
log_test_pred.to_csv("submission.csv", sep=',')

In [229]:
# Everything what I've tried seems not working much with such eazy kaggle comp.
# My thoughts - dataset is poor on expression words such as "like", "hate" and others bad/good words

coef = log.coef_.tolist()[0]
most_valuable_features = [coef.index(i) for i in sorted(coef)[:20]]
features = np.array(cntvec.get_feature_names())

pp.pprint(features[most_valuable_features])

array(['not', 'only', 'would', 'after', 'norton', 'they', 'but',
       'problem', 'does not', 'buttons', 'when', 'annoying', 'is not',
       'unfortunately', 'useless', 'times', 'to', 'sometimes', 'them',
       'their'], dtype='<U25')
