# Загрузка текстовых данных из VK (посты и комментарии)

#### Импорт библиотек

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
import datetime
import time
from typing import List

#### Загрузка комментариев и вывод данных в формате DataFrame

Класс для загрузки и парсинга комментариев к посту

In [None]:
class VkCommentsParser:
    """Download the comments of one VK wall post and tabulate them.

    Every method is a static helper, so the class is used without creating
    an instance: ``VkCommentsParser.get_Comments(post_id, owner_id, token)``.
    """

    @staticmethod
    def unix_to_date(ts):
        """Return the Unix timestamp ``ts`` as a ``YYYY-MM-DD`` string in UTC."""
        # Timezone-aware replacement for the deprecated utcfromtimestamp().
        moment = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
        return moment.strftime('%Y-%m-%d')

    @staticmethod
    def nes_params(post_id, all_comments):
        """Extract the needed fields from one page of ``wall.getComments``.

        Args:
            post_id: id of the post the comments belong to; stored as the
                last element of every row.
            all_comments: the ``response`` object of the API reply; its
                ``items`` list holds the comments and ``profiles`` the
                authors' profiles.

        Returns:
            A dict mapping comment id to
            ``[first_name, last_name, date, likes, text, post_id]``.
            Comments whose text is 3 characters or shorter are skipped.
            The names are ``'NONE'`` when no profile matches the author.
        """
        # Index the profiles once instead of rescanning them per comment.
        authors = {
            profile['id']: [profile['first_name'], profile['last_name']]
            for profile in all_comments.get('profiles', [])
        }
        nes_dict = {}
        for comment in all_comments['items']:
            if len(comment['text']) <= 3:
                continue
            # A fresh placeholder per comment: the author of the previous
            # comment must not leak into one whose profile is missing.
            first_name, last_name = authors.get(comment['from_id'], ['NONE', 'NONE'])
            nes_dict[comment['id']] = [
                first_name,
                last_name,
                VkCommentsParser.unix_to_date(comment['date']),
                comment['likes']['count'],
                comment['text'],
                post_id,
            ]
        return nes_dict

    @staticmethod
    def get_Comments(post_id, owner_id, token, nes_dict=None):
        """Fetch the comments of a post, 100 per request, up to offset 500.

        Args:
            post_id: id of the wall post.
            owner_id: id of the wall owner.
            token: VK API access token.
            nes_dict: optional dict to update in place; a new dict is
                created when omitted.

        Returns:
            The dict of rows produced by ``nes_params`` for all pages.

        Raises:
            KeyError: if the API reply has no ``response`` key (for example
                when it carries an ``error`` object instead).
        """
        # A mutable default would be shared between calls, so create it here.
        if nes_dict is None:
            nes_dict = {}
        count = 100
        for offset in range(0, 500, count):
            reply = requests.get(
                'https://api.vk.com/method/wall.getComments',
                params={
                    'access_token': token,
                    'v': '5.131',
                    'owner_id': owner_id,
                    'post_id': post_id,
                    'need_likes': 1,
                    'count': count,
                    'offset': offset,
                    'extended': 1,
                },
                timeout=15,  # do not hang forever on a stalled connection
            ).json()
            if 'response' not in reply:
                raise KeyError(
                    "VK API reply has no 'response': {}".format(reply.get('error'))
                )
            data_comments = reply['response']
            # An empty page means there is nothing left to download.
            if not data_comments['items']:
                break
            nes_dict.update(VkCommentsParser.nes_params(post_id, data_comments))
            time.sleep(0.5)
        return nes_dict

    @staticmethod
    def to_df(nes_dict):
        """Turn the dict returned by ``get_Comments`` into a DataFrame.

        The comment ids become the index; the columns are ``name``,
        ``last_name``, ``date``, ``likes``, ``text`` and ``post_id``.
        """
        return pd.DataFrame.from_dict(
            nes_dict,
            orient='index',
            columns=['name', 'last_name', 'date', 'likes', 'text', 'post_id'],
        )

Вызов метода и вывод полученных данных.
Необходимо ввести ключ-токен приложения для загрузки данных.

In [None]:
import getpass
import os

# The access token is a secret, so it is not stored in the notebook: it is
# read from the VK_TOKEN environment variable or asked for interactively.
token = os.environ.get("VK_TOKEN") or getpass.getpass("VK access token: ")
owner_id = "-51988192"
post_id = "10796"

# Download the comments of the post
result = VkCommentsParser.get_Comments(post_id, owner_id, token)

print(result)

{10799: ['Elena', 'Goncharenko', '2024-01-26', 0, 'А кто ответственный за организацию концерта?'], 10801: ['Natalia', 'Sergeeva', '2024-01-26', 0, 'мою маму блокадницу, 90 лет в этом году будет, никто никуда не приглашал']}


#### Загрузка постов и вывод данных в формате DataFrame

Класс для загрузки постов группы и комментариев к ним

In [None]:
class PostGetter:
    """Download comments for the posts of a VK wall and tabulate them.

    Every method is a static helper, so the class is normally used without
    an instance: ``PostGetter.run(owner_id, token)``.
    """

    API_VERSION = 5.131
    API_VERISON = API_VERSION  # original (misspelled) name, kept for callers
    OFFSET_STEP = 100
    OFFSET_LIMIT = 700
    COUNT_ITEMS = 100
    SLEEP_TIME = 0.5
    TIMEOUT_LIMIT = 15

    # Column layout of the rows built by nes_params().
    _COLUMNS = ["name", "last_name", "date", "text", "post_id"]

    def __init__(self):
        """Nothing to initialise; present so the class can be instantiated."""

    @staticmethod
    def _call(method, params):
        """Call a VK API method and return the ``response`` part of the reply.

        Raises:
            KeyError: if the reply has no ``response`` key (for example when
                it carries an ``error`` object instead).
        """
        reply = requests.get(
            "https://api.vk.com/method/" + method,
            params=params,
            timeout=PostGetter.TIMEOUT_LIMIT,
        ).json()
        if "response" not in reply:
            raise KeyError(
                "VK API reply to {} has no 'response': {}".format(
                    method, reply.get("error")
                )
            )
        return reply["response"]

    @staticmethod
    def get_group_post_ids(owner_id, your_token) -> List[int]:
        """Return the ids of the wall posts of ``owner_id``.

        Pages of ``COUNT_ITEMS`` posts are requested from ``wall.get`` until
        the offset reaches ``OFFSET_LIMIT`` or a page comes back empty.
        """
        post_ids = []
        for offset in range(0, PostGetter.OFFSET_LIMIT, PostGetter.OFFSET_STEP):
            page = PostGetter._call(
                "wall.get",
                {
                    "access_token": your_token,
                    "v": PostGetter.API_VERISON,
                    "owner_id": owner_id,
                    "count": PostGetter.COUNT_ITEMS,
                    "offset": offset,
                },
            )
            # An empty page means the wall has no more posts.
            if not page["items"]:
                break
            post_ids.extend(item["id"] for item in page["items"])
            # Same pause between requests that get_Comments uses.
            time.sleep(PostGetter.SLEEP_TIME)
        return post_ids

    @staticmethod
    def unix_to_date(ts):
        """Return the Unix timestamp ``ts`` as a ``YYYY-MM-DD`` string in UTC."""
        # Timezone-aware replacement for the deprecated utcfromtimestamp().
        moment = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
        return moment.strftime("%Y-%m-%d")

    @staticmethod
    def nes_params(post_id, all_comments):
        """Extract the needed fields from one page of ``wall.getComments``.

        Args:
            post_id: id of the post the comments belong to; stored as the
                last element of every row.
            all_comments: the ``response`` object of the API reply; its
                ``items`` list holds the comments and ``profiles`` the
                authors' profiles.

        Returns:
            A dict mapping comment id to
            ``[first_name, last_name, date, text, post_id]``. Comments whose
            text is 3 characters or shorter are skipped. The names are
            ``"NONE"`` when no profile matches the author.
        """
        # Index the profiles once instead of rescanning them per comment.
        authors = {
            profile["id"]: [profile["first_name"], profile["last_name"]]
            for profile in all_comments.get("profiles", [])
        }
        nes_dict = {}
        for comment in all_comments["items"]:
            if len(comment["text"]) <= 3:
                continue
            # A fresh placeholder per comment: the author of the previous
            # comment must not leak into one whose profile is missing.
            first_name, last_name = authors.get(
                comment["from_id"], ["NONE", "NONE"]
            )
            nes_dict[comment["id"]] = [
                first_name,
                last_name,
                PostGetter.unix_to_date(comment["date"]),
                comment["text"],
                post_id,
            ]
        return nes_dict

    @staticmethod
    def get_Comments(post_id, owner_id, token):
        """Fetch the comments of one post and return them as a dict of rows.

        Pages of ``COUNT_ITEMS`` comments are requested until the offset
        reaches ``OFFSET_LIMIT`` or a page comes back empty.

        Raises:
            KeyError: if the API reply has no ``response`` key.
        """
        temp_dict = {}
        for offset in range(0, PostGetter.OFFSET_LIMIT, PostGetter.OFFSET_STEP):
            data_comments = PostGetter._call(
                "wall.getComments",
                {
                    "access_token": token,
                    "v": PostGetter.API_VERISON,
                    "owner_id": owner_id,
                    "post_id": post_id,
                    "count": PostGetter.COUNT_ITEMS,
                    "offset": offset,
                    "extended": 1,
                },
            )
            # An empty page means there is nothing left to download, so the
            # remaining requests (and their pauses) are skipped.
            if not data_comments["items"]:
                break
            temp_dict.update(PostGetter.nes_params(post_id, data_comments))
            time.sleep(PostGetter.SLEEP_TIME)
        return temp_dict

    @staticmethod
    def _to_df(nes_dict):
        """Convert a dict of comment rows into a DataFrame.

        The comment ids become the index; the columns are ``name``,
        ``last_name``, ``date``, ``text`` and ``post_id``. Line breaks in
        ``text`` are replaced with spaces. An empty dict gives an empty
        DataFrame with the same columns.
        """
        width = len(PostGetter._COLUMNS)
        # Older rows for unknown authors carried three "NONE" placeholders
        # and were one element too long; keep only the trailing fields.
        rows = {
            comment_id: list(row)[-width:] for comment_id, row in nes_dict.items()
        }
        df = pd.DataFrame.from_dict(
            rows, orient="index", columns=PostGetter._COLUMNS
        )
        df["text"] = df["text"].str.replace("\n", " ", regex=False)
        return df

    @staticmethod
    def run(your_owner_id, your_token, limit_posts=None):
        """Download comments for the wall's posts and return a DataFrame.

        Args:
            your_owner_id: id of the wall owner.
            your_token: VK API access token.
            limit_posts: process only the first ``limit_posts`` post ids;
                ``None`` processes all collected ids.
        """
        nes_dict = {}
        post_ids = PostGetter.get_group_post_ids(your_owner_id, your_token)
        for post_id in tqdm(post_ids[:limit_posts]):
            nes_dict.update(
                PostGetter.get_Comments(post_id, your_owner_id, your_token)
            )
        return PostGetter._to_df(nes_dict)

Получение данных в формате DataFrame

In [None]:
import getpass
import os

post_id = "10796"
owner_id = "-51988192"
# The access token is a secret, so it is not stored in the notebook: it is
# read from the VK_TOKEN environment variable or asked for interactively.
token = os.environ.get("VK_TOKEN") or getpass.getpass("VK access token: ")

# Download the comments of the post and convert them to a DataFrame
comments = PostGetter.get_Comments(post_id, owner_id, token)
result_df = PostGetter._to_df(comments)

result_df

{10799: ['Elena',
  'Goncharenko',
  '2024-01-26',
  'А кто ответственный за организацию концерта?',
  '10796'],
 10801: ['Natalia',
  'Sergeeva',
  '2024-01-26',
  'мою маму блокадницу, 90 лет в этом году будет, никто никуда не приглашал',
  '10796']}