In [109]:
import pandas as pd
import numpy as np
import tqdm

from datetime import datetime

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import googlemaps
import foursquare


from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import roc_auc_score

import nltk, string
from nltk.tokenize import punkt
from nltk.tokenize import word_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
import pymorphy2
from string import punctuation
exclude = set(punctuation + u'[]—«»–')

%matplotlib inline

**API Tokens**

In [83]:
gmaps = googlemaps.Client(key='')
foursquare_client_id = ''
foursquare_client_secret = ''

## Google places - подгружаем ЛУПы

In [87]:
google_places_ids_json = gmaps.places_radar(location=[55.7522200, 37.6155600], radius=50000, type='hospital')

In [150]:
data = pd.DataFrame()
count = 0
for res in google_places_ids_json.get('results'):
    data.loc[count, 'lat'] = res.get('geometry').get('location').get('lat')
    data.loc[count, 'long'] = res.get('geometry').get('location').get('lng')
    data.loc[count, 'place_id'] = res.get('place_id')
    count += 1

In [152]:
data.head(3)

Unnamed: 0,lat,long,place_id
0,55.664012,37.522861,ChIJxUJiGhlNtUYRjyucIeHbeVI
1,55.756777,37.638748,ChIJB437ifVKtUYR7EMUU6VApLg
2,55.755245,37.656547,ChIJ2acH14xKtUYRCFp01z3Cgbk


## Google places - подгружаем отзывы

In [102]:
def get_reviews_from_google(place_id, language = 'ru-RU'):
    
    '''
    
    Функция для загрузки отзывов о месте на карте
    
    params: place_id - id места
    params: language - язык 
    
    '''
    
    detail_info_about_place = gmaps.place(place_id, language)    
    
    # имя места
    if 'name' in detail_info_about_place['result']:
        place_name = detail_info_about_place['result']['name']
    else:
        place_name = None
    
    # общий рейтинг
    if 'rating' in detail_info_about_place['result']:
        place_rating = detail_info_about_place['result']['rating']
    else:
        place_rating = None
        
    # отзывы
    place_reviews = []
    if 'reviews' in detail_info_about_place['result']:
        for review in detail_info_about_place['result']['reviews']:
            place_reviews.append([review['text'], review['rating'], review['time']])
            
    # сайт
    if 'website' in detail_info_about_place['result']:
        place_website = detail_info_about_place['result']['website']
    else:
        place_website = None
            
    return place_name, place_rating, place_website, place_reviews

In [153]:
# загрузим отзывы

google_reviews_df = pd.DataFrame()
count = 0
for place_id in data['place_id'].unique():
    result = get_reviews_from_google(place_id)
    
    if result == None:
        continue
    else:
        reviews = result[3]
        
    for review in reviews:
        google_reviews_df.loc[count, 'place_id'] = place_id
        google_reviews_df.loc[count, 'place_name'] = result[0]
        google_reviews_df.loc[count, 'place_rating'] = result[1]
        google_reviews_df.loc[count, 'place_website'] = result[2]
        google_reviews_df.loc[count, 'text'] = review[0]
        google_reviews_df.loc[count, 'rating'] = review[1]
        google_reviews_df.loc[count, 'time'] = review[2]
        count += 1

In [154]:
google_reviews_df.head(3)

Unnamed: 0,place_id,place_name,place_rating,place_website,text,rating,time
0,ChIJxUJiGhlNtUYRjyucIeHbeVI,"Родильный дом №4, филиал №1 ГКБ им. В.В. Виног...",4.2,http://www.roddom4.ru/,"Рожала первый раз, пришли ночью, когда отошли ...",5.0,1500051000.0
1,ChIJxUJiGhlNtUYRjyucIeHbeVI,"Родильный дом №4, филиал №1 ГКБ им. В.В. Виног...",4.2,http://www.roddom4.ru/,Самый лучший роддом столицы!!! Самый лучший ко...,5.0,1504410000.0
2,ChIJxUJiGhlNtUYRjyucIeHbeVI,"Родильный дом №4, филиал №1 ГКБ им. В.В. Виног...",4.2,http://www.roddom4.ru/,"Роддом #4 порадовал своей приятной атмосферой,...",5.0,1480424000.0


In [155]:
google_reviews_df.to_csv('data/reviews.csv', sep=';')

## NLP - обучим классификатор тональности текста

In [19]:
# Возьмем нужные столбцы
google_reviews_df = google_reviews_df[google_reviews_df.text != '']

def make_reg(x):
    if x > 3:
        return 1
    return 0

xtr, xtt, ytr, ytt = train_test_split(google_reviews_df.text, google_reviews_df.rating.map(lambda x: make_reg(x)), test_size = 0.2)

In [20]:
def preprocess(text):
    
    '''
    
    Функция предобработки текста:
        1. Разбиваем на токены;
        2. Убираем стоп-слова;
        3. Лемматизируем.
        
    params text: исходный текст
    
    '''
    
    buf = ''.join(ch for ch in text if ch not in exclude)
    tokens = WhitespaceTokenizer().tokenize(buf.lower())
    lemmatizer = MorphAnalyzer()
    lemmas = []
    for t in tokens[:]:
        if not t in stopwords.words('russian'):
            try:
                lemma = lemmatizer.parse(t)[0].normal_form
            except: 
                lemma = t
            lemmas.append(lemma)
    return lemmas

In [21]:
# Разбиваем на n-граммы и векторизируем комментарии
vectorizer = TfidfVectorizer(ngram_range=(1,3), tokenizer=preprocess)
vectors = vectorizer.fit_transform(xtr).toarray()
vectors_test = vectorizer.transform(xtt).toarray()

print('Data prepared.')

Data prepared.


In [22]:
# Обучим классификатор
classifier = CatBoostClassifier()
classifier.fit(vectors, ytr)

print('Model fitted.')

Model fitted.


In [28]:
# Проверим результат
yt = 0
for i, v in enumerate(classifier.predict(vectors_test)):
    if v == ytt.values[i]:
        yt += 1
        
print('Accuracy:', "{0:.2f}%".format(yt / len(ytt) * 100))

Accuracy: 90.68%


In [114]:
# обучим на всей выборке
google_reviews_df.rating = google_reviews_df.rating.map(lambda x: make_reg(x))
classifier = CatBoostClassifier()
classifier.fit(vectorizer.transform(google_reviews_df.text).toarray(), google_reviews_df.rating)

In [32]:
# сохраним модель
classifier.save_model('models/text_tonality_model')

## 4Square - собираем дополнительные отзывы

In [121]:
def get_reviews_from_fsq(place_name):
    
    '''
    
    Функция для загрузки отзывов с foursquare.com
    
    params place_name: название места
    
    '''
    
    client = foursquare.Foursquare(client_id=foursquare_client_id, client_secret=foursquare_client_secret)

    venues = client.venues.search(params={'near': 'Moscow, Russia', 'query': place_name})
    reviews = []

    if len(venues['venues']) > 0 and venues['venues'][0]['stats']['tipCount'] > 0:
        venue_id = venues['venues'][0]['id']
        venue_tips = client.venues.tips(venue_id, params={'sort': 'recent', 'limit':500})
        for tip in venue_tips['tips']['items']: 
            reviews.append([tip['text'], tip['createdAt']])
    
    if len(venues['venues']) > 0:
        
        if 'url' in venues['venues'][0]:
            website = venues['venues'][0]['url']
        else:
            website = None
        
        if 'stats' in venues['venues'][0]:
            checkins = venues['venues'][0]['stats']['checkinsCount'] 
            users = venues['venues'][0]['stats']['usersCount'] 
        else:
            checkins = None
            users = None
            
    else:
        website = None
        checkins = None
        users = None
    
    return website, reviews, checkins, users

In [130]:
# загрузим отзывы с 4square

foursquare_df = pd.DataFrame(columns=['place_name', 'place_website', 'text', 'checkins', 'users'])
count = 0

for place_name in google_reviews_df.place_name.unique():
    result = get_reviews_from_fsq(place_name)
    
    foursquare_df.loc[count, 'place_name'] = place_name
    
    if result is None:
        continue
        
    foursquare_df.loc[count, 'place_website'] = result[0]
    foursquare_df.at[count, 'text'] = result[1]
    foursquare_df.loc[count, 'checkins'] = result[2]
    foursquare_df.loc[count, 'users'] = result[3]
    count += 1

In [141]:
foursquare_df.dropna().head(3)

Unnamed: 0,place_name,place_website,text,checkins,users
1,Национальный медицинский исследовательский цен...,https://www.gnicpm.ru,"[[В кабинете 101 институтская столовая, гардер...",67,31
3,МОБИЛМЕД,https://www.mobil-med.org,[[У них тут даже Густав Климт висит. Ненастоящ...,1148,802
5,"Семейный доктор, Поликлиника № 9",http://www.fdoctor.ru/clinics/michurinski/,[[Ухудшилось качество обслуживания! Новые докт...,1255,351


In [132]:
foursquare_df.to_csv('data/foursquare_reviews.csv', sep=';')

## Добавим отзывы в датафрейм

In [161]:
# добавим общий рейтинг, имя места, сайт
data = data.merge(google_reviews_df[['place_id', 'place_name', 'place_website', 'place_rating']].rename(columns={'rating':'Рейтинг'}), on=['place_id'], how = 'left').drop_duplicates().dropna()
data.head(3)

Unnamed: 0,lat,long,place_id,place_name,place_website,place_rating
0,55.664012,37.522861,ChIJxUJiGhlNtUYRjyucIeHbeVI,"Родильный дом №4, филиал №1 ГКБ им. В.В. Виног...",http://www.roddom4.ru/,4.2
5,55.756777,37.638748,ChIJB437ifVKtUYR7EMUU6VApLg,Национальный медицинский исследовательский цен...,http://www.gnicpm.ru/,4.3
10,55.755245,37.656547,ChIJ2acH14xKtUYRCFp01z3Cgbk,МедЦентрСервис,http://medtsentrservis.ru/,4.7


In [195]:
# добавим аггрегированный рейтинг по отзывам (4square)

for i, row in tqdm.tqdm(foursquare_df.iterrows(), total = foursquare_df.shape[0]):
    texts = row.text
    if len(texts) == 0:
        continue
        
    scores = []
    timestamps = []
    for text in texts:
        
        timestamps.append(datetime.fromtimestamp(text[1]).year)
        text = text[0]
        
        a = pd.DataFrame()
        a.loc[0, 'a'] = text
        
        scores.append(classifier.predict(vectorizer.transform(a.a).toarray()))
    
    
    max_year = np.max(timestamps)
    timestamp_coefs = []
    for timestamp in timestamps:
        timestamp_coefs.append(1 / (max_year + 1 - timestamp))
        
    final_scores = []
    for i in range(len(scores)):
        final_scores.append(scores[i] * timestamp_coefs[i])
        
    data.loc[data[data.place_name == row.place_name].index, 'review_mean'] = np.mean(final_scores)
    data.loc[data[data.place_name == row.place_name].index, 'review_std'] = np.std(final_scores)
    data.loc[data[data.place_name == row.place_name].index, 'checkins'] = row.checkins
    data.loc[data[data.place_name == row.place_name].index, 'users'] = row.users
    data.loc[data[data.place_name == row.place_name].index, 'place_website_4q'] = row.place_website

100%|████████████████████████████████████████████████████████████████████████████████| 147/147 [04:01<00:00,  1.64s/it]


In [198]:
data.head(3)

Unnamed: 0,lat,long,place_id,place_name,place_website,place_rating,review_mean,review_std,checkins,users,place_website_4q
0,55.664012,37.522861,ChIJxUJiGhlNtUYRjyucIeHbeVI,"Родильный дом №4, филиал №1 ГКБ им. В.В. Виног...",http://www.roddom4.ru/,4.2,,,,,
5,55.756777,37.638748,ChIJB437ifVKtUYR7EMUU6VApLg,Национальный медицинский исследовательский цен...,http://www.gnicpm.ru/,4.3,0.25,0.433013,67.0,31.0,https://www.gnicpm.ru
10,55.755245,37.656547,ChIJ2acH14xKtUYRCFp01z3Cgbk,МедЦентрСервис,http://medtsentrservis.ru/,4.7,,,,,


In [197]:
data.to_csv('data/data.csv', encoding='cp1251', sep=';', index = False)