In [1]:
import re
import pprint as pp

import pandas as pd
import numpy as np

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

import xgboost as xgb
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# uncomment and run it first!

# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# %load nlp_process_data.py

def build_pipe(vect, model, stopwords=None, ngram_range=(1,1), analyzer='word'):
    return Pipeline([("count", vect(stop_words=stopwords, ngram_range=ngram_range, analyzer=analyzer)),
                     ("model", model())])


def train_test_models(X_train, y_train, models_cls=None, vectorizer_cls=None, 
                      random_state=None, min_df=None,stopwords=None,
                      ngram_range=(1,1), analyzer='word',
                      vectorizer_names=None, model_names=None):
    results = list()
    mean = list()
    if not models_cls or not vectorizer_names:
        models_cls = [LogisticRegression,
                      RandomForestClassifier,
                      LinearSVC,
                      SGDClassifier]
        model_names = ["LogReg", "RF Clas.", "LinearSVC", "SGD Clas."]
    if not vectorizer_cls or not model_names:
        vectorizer_cls = [TfidfVectorizer, CountVectorizer]
        vectorizer_names = ["TfidfVec", "CntVec"]
       
    _vectorizer_names = iter(vectorizer_names)
    for vectorizer in vectorizer_cls:
        vector_name = _vectorizer_names.__next__()
        _model_names = iter(model_names)
        for model in models_cls:
            pipe = build_pipe(vectorizer, model, stopwords=stopwords,
                              ngram_range=ngram_range, analyzer=analyzer)
            score = cross_val_score(pipe, X_train, y_train, scoring='accuracy')
            mean.append(np.mean(score))
            
            results.append("model: {}, vectorizer: {}  \
                            scores: {}, mean: {}".format(_model_names.__next__(),
                                                         vector_name, score,
                                                         round(np.mean(score),4)))    
    return results, mean

def text_preprocessing(series):
    for i in range(series.shape[0]):
        text = series.iloc[i]
        pattern = re.compile(r"(?u)\w+")
        series.iloc[i] = re.findall(pattern, text.lower())
    return series

def process_data(data, func):
    # Removing all words what is less than 3 letters
    series = data.copy()
    for i in range(data.shape[0]):
        series.iloc[i] = " ".join([func(w) for w in data.iloc[i]])
    return series

In [3]:
path = "/media/winter/vm/github/kaggle/Product_Sentiment/"
data = open(path+"test.csv", "r").readlines()

data = " ".join(data).split("</review>")[:-1]
X_test = [text.replace('<review>', "").replace("\n", "") for text in data]
stop_words = stopwords.words('russian')


data = pd.read_csv("reviews_yandexmarket_aws.csv", sep=',',header=None, names=['mark','rev'])
data2 = pd.read_csv("reviews_yandexmarket_old.csv", sep=',',header=None, names=['mark','rev'])
data3 = pd.read_csv("reviews_yandexmarket_1.csv", sep=',',header=None, names=['mark','rev'])
data4 = pd.read_csv("reviews_yandexmarket.csv", sep=',',header=None, names=['mark','rev'])
full_data = data.append([data2, data3, data4])
full_data.dropna(inplace=True)

# sample = full_data[full_data["mark"]==1].sample(528)
# sample = sample.append(full_data[full_data["mark"]==0])
# sample.reset_index(inplace=True)
# print(sample['mark'].value_counts())
print(full_data.shape)
print(full_data['mark'].value_counts())

(2311, 2)
1.0    1783
0.0     528
Name: mark, dtype: int64


In [4]:
X_train, y_train = full_data['rev'].copy(), full_data['mark'].copy()
X_test_ = X_test.copy() 

# sX_train corrected sampled X_train to sustain equality of frequency in binary classification
"""
WordNetLemmatizer().lemmatize
SnowballStemmer('russian').stem
PorterStemmer().stem
"""
X_train_ = text_preprocessing(X_train)
X_train_ = process_data(X_train_, PorterStemmer().stem)

## dict is had about 4k words
# X_sample_train, y_sample_train = sample['rev'].copy(), sample['mark'].copy()
# sX_train_ = text_preprocessing(X_sample_train)
# sX_train_ = process_data(sX_train_, PorterStemmer().stem)
# sy_train = sample['mark']

X_test_ = text_preprocessing(pd.Series(X_test))
X_test_ = process_data(pd.Series(X_test_), PorterStemmer().stem)

In [5]:
# model = RandomForestClassifier(n_estimators=250, max_depth=5)

# model = xgb.XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=250, min_child_weight=2,
#                           n_jobs=4)

model = LogisticRegression(max_iter=10000, penalty='l2', 
                           class_weight={0:1.15,1:1}, solver='liblinear', n_jobs=3)
vectorizer = CountVectorizer(ngram_range=(1,3), min_df=5, stop_words=stop_words)


X_train__ = X_train_.copy()
y_train = y_train.copy()
kaggle_X_test_ = X_test_.copy()

vectorizer.fit(X_train__)
X_train_transf = vectorizer.transform(X_train__)
model.fit(X_train_transf, y_train)
model_pred = model.predict(X_train_transf)

model_acc = accuracy_score(y_train, model_pred)
cross_acc = cross_val_score(model, X_train_transf, y_train, scoring='accuracy', cv=10)

print("model score: ", model_acc)
print("cross score: ", cross_acc,"mean: ", np.mean(cross_acc))
print("vocab capacity:",len(vectorizer.vocabulary_))

model score:  0.9943747295543055
cross score:  [0.75862069 0.83189655 0.93103448 0.94805195 0.8961039  0.995671
 0.995671   0.87012987 0.73478261 0.71304348] mean:  0.8675005516722158
vocab capacity: 8324


In [6]:
sub_predict = ['pos' if i else 'neg' for i in model.predict(vectorizer.transform(kaggle_X_test_))]
submit = pd.DataFrame({'y':sub_predict})
submit.index.name='id'
submit.to_csv("submission.csv", sep=',')

In [118]:
vectorizer.transform(X_train__)

<100x265 sparse matrix of type '<class 'numpy.int64'>'
	with 2397 stored elements in Compressed Sparse Row format>

In [None]:
max_depth (int) – Maximum tree depth for base learners.
learning_rate (float) – Boosting learning rate (xgb’s “eta”)
n_estimators (int) – Number of trees to fit.
verbosity (int) – The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
silent (boolean) – Whether to print messages while running boosting. Deprecated. Use verbosity instead.
objective (string or callable) – Specify the learning task and the corresponding learning objective or a custom objective function to be used (see note below).
booster (string) – Specify which booster to use: gbtree, gblinear or dart.
nthread (int) – Number of parallel threads used to run xgboost. (Deprecated, please use n_jobs)
n_jobs (int) – Number of parallel threads used to run xgboost. (replaces nthread)
gamma (float) – Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight (int) – Minimum sum of instance weight(hessian) needed in a child.
max_delta_step (int) – Maximum delta step we allow each tree’s weight estimation to be.
subsample (float) – Subsample ratio of the training instance.
colsample_bytree (float) – Subsample ratio of columns when constructing each tree.
colsample_bylevel (float) – Subsample ratio of columns for each level.
colsample_bynode (float) – Subsample ratio of columns for each split.
reg_alpha (float (xgb's alpha)) – L1 regularization term on weights
reg_lambda (float (xgb's lambda)) – L2 regularization term on weights
scale_pos_weight (float) – Balancing of positive and negative weights.
base_score – The initial prediction score of all instances, global bias.
seed (int) – Random number seed. (Deprecated, please use random_state)
random_state (int) – Random number seed. (replaces seed)
missing (float, optional) – Value in the data which needs to be present as a missing value. If None, defaults to np.nan.
importance_type (string, default "gain") – The feature importance type for the feature_importances_ property: either “gain”, “weight”, “cover”, “total_gain” or “total_cover”.


In [336]:
# test_1 = list()
# test_1.append(train_test_models(X_train_, y_train, ngram_range=(1,3), min_df=5))
# test_1.append(train_test_models(X_train_, y_train, ngram_range=(1,4), min_df=5))
# test_1.append(train_test_models(X_train_, y_train, ngram_range=(1,2), min_df=5))

In [367]:
pd.read_csv('submission.csv')

Unnamed: 0,id,y
0,0,pos
1,1,pos
2,2,pos
3,3,pos
4,4,pos
5,5,pos
6,6,pos
7,7,pos
8,8,pos
9,9,pos


In [159]:
%%writefile parse_yandex_market.py
# import pickle
# from multiprocessing import Pool
# import time
# import random
# import requests
# import bs4

# def parse_page(url):
#     time.sleep(random.randint(5,30))
    
#     text = requests.get(url)
#     text.encoding = "utf-8"
    
#     parser = bs4.BeautifulSoup(text.text, 'lxml')
#     data = parser.findAll('div', attrs={"class":"n-snippet-card-review__right"})
    
#     full_revs = [" ".join([t.text for t in i.find_all('dd')]) for i in data]

#     rating_revs = [1 if float(i.text) > 3 else 0  for i in 
#                    parser.findAll('div', attrs={'class':'rating__value'})]
    
#     return list(zip(full_revs, rating_revs))

# url = """https://market.yandex.ru/catalog--mobilnye-telefony-otzyvy-pokupatelei/54726/list?\
# show-reviews=1&page={}&local-offers-first=0&onstock=0&how=quality"""

# pool = Pool(3)
# url_list = [url.format(i) for i in range(2,51)]
# url_list_2 = [url.format(i) for i in range(52,151)]
# if __name__ == "__main__":
#     map_results = pool.map(parse_page, url_list)
#     with open('parsing_revievs_yandexMarket','wb') as out_f:
#         pickle.dump(map_results, out_f)

Overwriting parse_yandex_market.py


In [None]:
# %%writefile parse_YM.py
# import pickle
# from multiprocessing import Pool
# import time
# import random
# import requests
# import bs4

# def parse_page(url):
#     time.sleep(random.randint(15,120))
    
#     text = requests.get(url)
#     text.encoding = "utf-8"
    
#     parser = bs4.BeautifulSoup(text.text, 'lxml')
#     data = parser.findAll('div', attrs={"class":"n-snippet-card-review__right"})
    
#     full_revs = [" ".join([t.text for t in i.find_all('dd')]) for i in data]

#     rating_revs = [1 if float(i.text) > 3 else 0  for i in 
#                    parser.findAll('div', attrs={'class':'rating__value'})]
#     return list(zip(full_revs, rating_revs))

# url = """https://market.yandex.ru/catalog--mobilnye-telefony-otzyvy-pokupatelei/54726/list?\
# show-reviews=1&page={}&local-offers-first=0&onstock=0&how=quality"""

# new_url = """https://market.yandex.ru/catalog--mobilnye-telefony-otzyvy-pokupatelei/54726/list?\
# show-reviews=1&local-offers-first=0&onstock=0&deliveryincluded=0&page={}"""


# pool = Pool(10)
# url_list = [url.format(i) for i in range(2,51)]
# url_list_2 = [new_url.format(i) for i in range(1,52)]
# if __name__ == "__main__":
#     map_results = list(map(parse_page, url_list_2))
#     with open('parsing_revievs_yandexMarket_NEWS','wb') as out_f:
#         pickle.dump(map_results, out_f)

In [None]:
# import pickle
# with open('parsing_revievs_yandexMarket_NEWS','rb') as in_f:
#     new_data = pickle.load(in_f)
# unpack = [sent for nest in new_data for sent in nest]
# X_train_neg = [i[0] for i in unpack ]
# y_train_neg = [i[1] for i in unpack ]