In [1]:
import pandas as pd
import numpy as np
import pprint as pp
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

import xgboost as xgb
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Uncomment and run the lines below once to download the required NLTK corpora.

# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# %load nlp_process_data.py

def process_data(data, func):
    """Apply ``func`` to every whitespace-separated word of every text in ``data``.

    Each text is split with ``str.split()``, ``func`` is called on each word,
    and the results are re-joined with single spaces.

    Parameters
    ----------
    data : pandas.Series, 1-D numpy.ndarray, or other iterable of str
        The texts to process. The input is never modified.
    func : callable
        Called with one word (str); must return a str.

    Returns
    -------
    pandas.Series
        When ``data`` is a Series; the original index is preserved.
    numpy.ndarray
        When ``data`` is an ndarray. Object arrays stay object arrays; for
        fixed-width string arrays the width is re-derived from the results.
    list
        For any other iterable.
    """
    def _process_text(text):
        return " ".join(func(word) for word in text.split())

    if isinstance(data, pd.Series):
        # Element-wise mapping is positional, so it also works when the index
        # is not 0..n-1 (the label lookup ``data[i]`` fails in that case).
        return data.map(_process_text)

    processed = [_process_text(text) for text in data]

    if isinstance(data, np.ndarray):
        if not processed:
            return data.copy()
        # Building a new array (instead of assigning into a copy) avoids
        # truncating results that are longer than a fixed-width string dtype.
        return np.array(processed, dtype=object if data.dtype == object else None)

    return processed


def build_pipe(vect, model, stopwords=None, ngram_range=(1,1), analyzer='word'):
    """Assemble a two-step pipeline: a text vectorizer followed by a model.

    ``vect`` and ``model`` are classes (or factories), not instances.
    ``vect`` is called with ``stop_words``, ``ngram_range`` and ``analyzer``
    keyword arguments; ``model`` is called with no arguments. The steps are
    registered under the names "count" and "model".
    """
    vectorizer = vect(stop_words=stopwords, ngram_range=ngram_range, analyzer=analyzer)
    estimator = model()
    steps = [("count", vectorizer), ("model", estimator)]
    return Pipeline(steps)


def train_test_models(X_train, y_train, models_cls=None, vectorizer_cls=None,
                      random_state=None, min_df=None, stopwords=None,
                      ngram_range=(1,1), analyzer='word',
                      vectorizer_names=None, model_names=None):
    """Cross-validate every vectorizer/model combination and report accuracy.

    Parameters
    ----------
    X_train, y_train
        Training texts and labels, passed unchanged to ``cross_val_score``.
    models_cls : list of classes, optional
        Model classes, instantiated without arguments. Defaults to
        LogisticRegression, RandomForestClassifier, LinearSVC, SGDClassifier.
    vectorizer_cls : list of classes, optional
        Vectorizer classes. Defaults to TfidfVectorizer and CountVectorizer.
    random_state : optional
        When given, set on every model that exposes a ``random_state``
        parameter.
    min_df : optional
        When given, set on every vectorizer that exposes a ``min_df``
        parameter.
    stopwords, ngram_range, analyzer
        Forwarded to ``build_pipe``.
    vectorizer_names, model_names : list of str, optional
        Labels used in the report lines. When the classes are supplied
        without labels, the class names are used.

    Returns
    -------
    (results, mean) : tuple of two lists
        ``results`` holds one formatted report line per combination and
        ``mean`` the matching mean accuracies, vectorizer-major order.

    Raises
    ------
    ValueError
        If fewer labels than classes are supplied.
    """
    def _class_label(cls):
        return getattr(cls, "__name__", repr(cls))

    # Each default is decided by its own pair of arguments, so supplying
    # custom models never resets the vectorizers and vice versa.
    if not models_cls:
        models_cls = [LogisticRegression,
                      RandomForestClassifier,
                      LinearSVC,
                      SGDClassifier]
        model_names = ["LogReg", "RF Clas.", "LinearSVC", "SGD Clas."]
    elif not model_names:
        model_names = [_class_label(cls) for cls in models_cls]

    if not vectorizer_cls:
        vectorizer_cls = [TfidfVectorizer, CountVectorizer]
        vectorizer_names = ["TfidfVec", "CntVec"]
    elif not vectorizer_names:
        vectorizer_names = [_class_label(cls) for cls in vectorizer_cls]

    if len(model_names) < len(models_cls):
        raise ValueError("model_names has fewer entries than models_cls")
    if len(vectorizer_names) < len(vectorizer_cls):
        raise ValueError("vectorizer_names has fewer entries than vectorizer_cls")

    results = []
    mean = []
    for vectorizer, vector_name in zip(vectorizer_cls, vectorizer_names):
        for model, model_name in zip(models_cls, model_names):
            pipe = build_pipe(vectorizer, model, stopwords=stopwords,
                              ngram_range=ngram_range, analyzer=analyzer)

            # Apply the optional settings only where the step supports them;
            # the "count"/"model" prefixes are the step names used by build_pipe.
            available = pipe.get_params()
            overrides = {}
            if min_df is not None and "count__min_df" in available:
                overrides["count__min_df"] = min_df
            if random_state is not None and "model__random_state" in available:
                overrides["model__random_state"] = random_state
            if overrides:
                pipe.set_params(**overrides)

            score = cross_val_score(pipe, X_train, y_train, scoring='accuracy')
            score_mean = np.mean(score)
            mean.append(score_mean)
            results.append("model: {}, vectorizer: {}  scores: {}, mean: {}".format(
                model_name, vector_name, score, round(score_mean, 4)))

    return results, mean

In [3]:
# NOTE: absolute, machine-specific directory; adjust it when running elsewhere.
path = "/media/winter/vm/github/kaggle/Product_Sentiment/"
# Read through a context manager so the file handle is closed after reading.
# NOTE(review): no encoding is passed, so the platform default is used;
# confirm it matches the file.
with open(path+"test.csv", "r") as test_file:
    data = test_file.readlines()

In [5]:
# Glue the raw lines back into one string and cut it at every closing tag.
# NOTE: `data` is rebound here, so running this cell twice without first
# re-creating `data` gives a different result.
data = " ".join(data).split("</review>")
# Whatever follows the last closing tag is not a review; drop it.
del data[-1]
# Strip the opening tag and the line breaks from every review.
X_ = [review.replace('<review>', "").replace("\n", "") for review in data]
# Russian stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words('russian')

In [166]:
%%writefile parse_yandex_market.py

from multiprocessing import Pool
import time
import random
import requests
import bs4

def parse_page(url, timeout=30):
    """Download one page and extract (review text, label) pairs from it.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection raises instead of
        blocking the worker forever. Defaults to 30 seconds.

    Returns
    -------
    list of (str, int)
        The text is the space-joined content of the ``dd`` tags inside each
        ``div.n-snippet-card-review__right``. The label is 1 when the number
        in a ``div.rating__value`` is greater than 3, otherwise 0. Texts and
        labels are paired by position; surplus items on either side are
        dropped by ``zip``.
    """
    # Random 5-30 second pause before each request
    # (presumably to avoid hammering the site -- confirm).
    time.sleep(random.randint(5,30))

    response = requests.get(url, timeout=timeout)
    response.encoding = "utf-8"

    parser = bs4.BeautifulSoup(response.text, 'lxml')
    review_cards = parser.find_all('div', attrs={"class":"n-snippet-card-review__right"})

    full_revs = [" ".join(dd.text for dd in card.find_all('dd')) for card in review_cards]

    rating_tags = parser.find_all('div', attrs={'class':'rating__value'})
    rating_revs = [1 if float(tag.text) > 3 else 0 for tag in rating_tags]

    return list(zip(full_revs, rating_revs))

# The template is kept on one logical line: a line break inside the literal
# would become part of every generated URL.
url = ("https://market.yandex.ru/catalog--mobilnye-telefony-otzyvy-pokupatelei/54726/"
       "list?show-reviews=1&page={}&local-offers-first=0&onstock=0&how=quality")

# Listing pages 1 through 50.
url_list = [url.format(i) for i in range(1,51)]


def _format_reviews(pages):
    """Flatten per-page (review, label) pairs into "label<TAB>review" lines."""
    lines = []
    for page in pages:
        for review, label in page:
            # Collapse whitespace so that each review occupies exactly one line.
            lines.append("{}\t{}".format(label, " ".join(review.split())))
    return "\n".join(lines)


if __name__ == "__main__":
    # The pool is created under the guard so that importing this module
    # (as spawned worker processes do) never starts new processes.
    with Pool(3) as pool:
        map_results = pool.map(parse_page, url_list)
    # pool.map returns one list of tuples per page; it has to be flattened to
    # text before writing. UTF-8 is set explicitly for the output file.
    with open('parsing_revievs_yandexMarket.txt','w+', encoding='utf-8') as out_f:
        out_f.write(_format_reviews(map_results))

Overwriting parse_yandex_market.py


33
