In [22]:
import requests
import json
from secret import access_token, no_fetch_users

base_url = 'https://api.vk.com/method/'


def execute(method, params, timeout=30):
    """Call a VK API method and return an ``(err, response)`` pair.

    Args:
        method: Method name appended to ``base_url``, e.g. ``'users.get'``.
        params: Mapping of query parameters for the method. Values are
            converted with ``str()`` before being sent.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A tuple ``(err, response)``. On success ``err`` is ``None`` and
        ``response`` is the value stored under the ``'response'`` key of the
        decoded JSON body. On failure ``response`` is ``None`` and ``err`` is
        a string: ``'<error_code>: <error_msg>'`` when the body carries an
        ``'error'`` object, the JSON decoder's message when the body is not
        valid JSON, or ``'error while parsing: ...'`` for any other
        unexpected payload shape.

    Raises:
        requests.RequestException: Transport-level failures (connection
            errors, timeouts) are not converted into ``err`` and propagate.
    """
    query = {key: str(value) for key, value in params.items()}
    query['access_token'] = access_token
    query['v'] = '5.103'

    # Let requests build the query string so keys and values are
    # percent-encoded instead of being concatenated verbatim.
    raw = requests.get(base_url + method, params=query, timeout=timeout).text

    try:
        payload = json.loads(raw)
    except ValueError as e:
        # Report the decoder's own message rather than a follow-up TypeError.
        return str(e), None

    try:
        if 'response' in payload:
            return None, payload['response']
        error = payload['error']
        return str(error['error_code']) + ': ' + error['error_msg'], None
    except Exception as e:
        # Payload was valid JSON but had neither of the expected shapes.
        return f'error while parsing: {e}', None


# Quick check of the helper: request the profile with id 1 and show both
# halves of the returned pair, one per line.
error, result = execute('users.get', dict(user_ids=1))
print(error, result, sep='\n')


None
[{'id': 1, 'first_name': 'Pavel', 'last_name': 'Dourov', 'is_closed': False, 'can_access_closed': True}]


In [30]:
from tqdm.notebook import tqdm

# Paging configuration for messages.getConversations.
page_size = 200  # value sent as 'count' on every call
max_pages = 4    # upper bound on the number of pages requested

user_ids = []
failed_pages = []  # (offset, error text) for every page that could not be read
for page in tqdm(range(max_pages)):
    offset = page * page_size
    err, conv = execute('messages.getConversations', {'count': page_size, 'offset': offset})
    if err is not None:
        # Remember the failure instead of dropping it, so an incomplete
        # result is visible after the loop.
        failed_pages.append((offset, err))
        continue
    conv_list = conv['items']
    if len(conv_list) == 0:
        # An empty page means there is nothing left to fetch.
        break
    for con in conv_list:
        # NOTE(review): the peer id is used as a user id further down;
        # presumably peers can also be group chats or communities — confirm.
        user_id = con['conversation']['peer']['id']
        if user_id not in no_fetch_users:
            user_ids.append(user_id)

if failed_pages:
    print(f'{len(failed_pages)} page(s) failed (offset, error): {failed_pages}')

len(user_ids)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




799

In [31]:
import time
import pickle

# Suffixes appended to a message's text when its first attachment is a photo
# or a video, respectively. Both markers are five characters long.
attach_marker_photo = ' (PH)'
attach_marker_video = ' (VD)'

def is_text_worth(text):
    """Tell whether ``text`` should be kept.

    The text is kept only when it is non-empty and contains none of the
    substrings listed below.
    """
    unwanted_fragments = (
        'CRASH REPORT',
        'android.',
        '[service]',
        'DEVICE INFORMATION',
        'com.twoeightnine.root.xvii.',
        'okhttp3.internal.',
        '[longpoll]',
    )
    if len(text) == 0:
        return False
    for fragment in unwanted_fragments:
        if fragment in text:
            return False
    return True

# Attachment types that get flagged, mapped to the marker appended to the text.
marker_by_type = {
    'photo': attach_marker_photo,
    'video': attach_marker_video,
}

messages = []
failed_users = []  # (user_id, error text) for every history that could not be read
for user_id in tqdm(user_ids):
    err, conv = execute('messages.getHistory', {'count': 200, 'user_id': user_id})
    if err is None:
        for mess in conv['items']:
            # Keep only messages whose 'out' flag is 0
            # (presumably the incoming ones — confirm against the VK API).
            if mess['out'] != 0:
                continue
            text = mess['text']
            if not is_text_worth(text):
                continue
            attachments = mess['attachments']
            if len(attachments) != 0:
                # Only the first attachment decides the marker; other types add nothing.
                text += marker_by_type.get(attachments[0]['type'], '')
            messages.append(text)
    else:
        # Record the failure instead of skipping the user silently.
        failed_users.append((user_id, err))
    # Pause between requests (presumably to stay under the API rate limit).
    time.sleep(.33)

if failed_users:
    print(f'{len(failed_users)} of {len(user_ids)} histories failed; '
          f'first error: {failed_users[0]}')

HBox(children=(FloatProgress(value=0.0, max=799.0), HTML(value='')))




In [32]:
# Save the collected messages; the context manager closes (and flushes) the
# file before it is read back below.
with open('messages.pkl', 'wb') as pkl_out:
    pickle.dump(messages, pkl_out)

# CAUTION: pickle.load can execute arbitrary code. Only load 'messages.pkl'
# when it was produced by this notebook, never a file from an untrusted source.
with open('messages.pkl', 'rb') as pkl_in:
    messages = pickle.load(pkl_in)
len(messages), messages[:20]

(1634,
 ['Хай',
  'Как сделать свой стиль ?',
  'Если скоро не будет , то удалю приложение нафиг',
  'Окей',
  'Не работает',
  'Как сделать *был в сети недавно*',
  'Привет',
  'Салам',
  'Спасибо, помогло)',
  'Здравствуйте, не грузится список диалогов (PH)',
  'А, спасибо',
  'можно как нибудь поменять ник друзей а чате?',
  'Спасибо большое',
  'А можете объяснить?',
  'А что это за сообщество ?',
  'Привет',
  'Уже прочитал',
  ')',
  'Здравствуйте, а вы же надеюсь не крадёте мои данные?)))',
  'Крутое приложение!'])

In [33]:
import pandas as pd

# Split every stored message into its plain text and two 0/1 attachment flags.
# A marker only counts when it is the suffix of the message: that is where it
# was appended, and a suffix test cannot be fooled by text that merely
# contains " (PH)" or " (VD)" somewhere in the middle.
clean_messages = []
has_photo = []
has_video = []
for mess in messages:
    photo = mess.endswith(attach_marker_photo)
    video = mess.endswith(attach_marker_video)
    if photo:
        mess = mess[:-len(attach_marker_photo)]
    elif video:
        mess = mess[:-len(attach_marker_video)]
    clean_messages.append(mess)
    has_photo.append(1 if photo else 0)
    has_video.append(1 if video else 0)

# Build the frame in one call; column order matches the exported CSV header.
df = pd.DataFrame({
    'messages': clean_messages,
    'has_photo': has_photo,
    'has_video': has_video,
})
df.to_csv('messages.csv', index=False)
df.head(20)

Unnamed: 0,messages,has_photo,has_video
0,Хай,0,0
1,Как сделать свой стиль ?,0,0
2,"Если скоро не будет , то удалю приложение нафиг",0,0
3,Окей,0,0
4,Не работает,0,0
5,Как сделать *был в сети недавно*,0,0
6,Привет,0,0
7,Салам,0,0
8,"Спасибо, помогло)",0,0
9,"Здравствуйте, не грузится список диалогов",1,0
