In [76]:
# Load the `dotenv` IPython extension and run its `%dotenv` magic so that variables from a
# local .env file are available through the environment.
# NOTE(review): presumably this is what provides SELSUP_COOKIE, which the scraping functions
# below read with os.getenv — confirm the .env file defines it.
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [75]:
import requests
import math
import pandas as pd
import os
from tqdm import tqdm

In [84]:
# Create the output directory for the CSV exports below. Unlike `!mkdir data`, this does not
# fail when the directory already exists, so the cell can be re-run safely, and it does not
# depend on the shell.
os.makedirs("data", exist_ok=True)

# Scraping intents

In [85]:
def get_intents_df(total_count=3815, limit=500, timeout=30):
    """Download bot intents page by page and return them as a single DataFrame.

    Pages are requested from the ``findIntent`` endpoint, sending the value of
    the ``SELSUP_COOKIE`` environment variable as the ``Cookie`` header.

    Args:
        total_count: Expected number of intents. ``ceil(total_count / limit)``
            pages are always requested. The value is treated as a lower bound:
            if the last planned page comes back full, further pages are
            requested until one returns fewer than ``limit`` rows, so rows
            added after this number was recorded are not silently dropped.
        limit: Page size sent to the API.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A DataFrame whose first columns are ``id``, ``text``, ``pattern``,
        ``intentId``, ``groupId`` and ``answer``, followed by any other keys
        found in the returned rows.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    intents_url = "https://c.selsup-team.ru/api/bot/findIntent"
    headers = {
        "Cookie": os.getenv("SELSUP_COOKIE")
    }

    def fetch_page(page):
        """Request one 1-based page and return its list of row dicts."""
        res = requests.get(
            intents_url,
            params={"limit": limit, "count": True, "page": page},
            headers=headers,
            timeout=timeout,
        )
        # Fail loudly on HTTP error statuses instead of trying to parse an
        # error body as JSON and hitting a confusing KeyError on "rows".
        res.raise_for_status()
        return res.json()["rows"]

    # The empty template keeps the expected columns (and their order) in the
    # result even when a key is missing from every returned row.
    frames = [
        pd.DataFrame(columns=['id', 'text', 'pattern', 'intentId', 'groupId', 'answer'])
    ]

    planned_pages = math.ceil(total_count / limit)
    rows = []
    for page in tqdm(range(1, planned_pages + 1)):
        rows = fetch_page(page)
        frames.append(pd.DataFrame(rows))

    # total_count is a snapshot and may be stale: a full last page means the
    # server may hold more rows, so keep paging until a short page arrives.
    # NOTE(review): assumes the API returns fewer than `limit` rows only on
    # the final page and an empty "rows" list past the end — confirm.
    page = planned_pages
    while len(rows) == limit:
        page += 1
        rows = fetch_page(page)
        frames.append(pd.DataFrame(rows))

    # Concatenate once at the end; re-concatenating inside the loop copies
    # all previously fetched rows on every page.
    return pd.concat(frames, ignore_index=True)

# Run the scraper defined above; this issues the paginated HTTP requests to the findIntent
# endpoint and keeps the combined result in memory for the export and inspection cells below.
intents_df = get_intents_df()

100%|██████████| 8/8 [00:01<00:00,  4.53it/s]


In [86]:
# Persist the scraped intents; index=False leaves the row index out of the CSV.
# The data/ directory must already exist.
intents_df.to_csv("data/intents.csv", index=False)

In [90]:
# Sanity check of the scrape: row count, columns, non-null counts and dtypes.
intents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3815 entries, 0 to 3814
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3815 non-null   object
 1   text      3815 non-null   object
 2   pattern   0 non-null      object
 3   intentId  3815 non-null   object
 4   groupId   0 non-null      object
 5   answer    3815 non-null   object
dtypes: object(6)
memory usage: 179.0+ KB


# Scraping messages

In [87]:
def remove_now_answered_column(df):
    """Return a copy of ``df`` without its ``nowAnswered`` column.

    The input frame is left untouched. A ``KeyError`` is raised when the
    column is not present.
    """
    without_flag = df.drop(columns=['nowAnswered'])
    return without_flag

def get_messages_df(total_count=35574, limit=500, timeout=30):
    """Download bot messages page by page and return them as a single DataFrame.

    Pages are requested from the ``findMessage`` endpoint, sorted by
    ``TIMESTAMP`` in descending order, sending the value of the
    ``SELSUP_COOKIE`` environment variable as the ``Cookie`` header. The
    ``nowAnswered`` column is removed from the result.

    Args:
        total_count: Expected number of messages. ``ceil(total_count / limit)``
            pages are always requested. The value is treated as a lower bound:
            if the last planned page comes back full, further pages are
            requested until one returns fewer than ``limit`` rows, so messages
            added after this number was recorded are not silently dropped.
        limit: Page size sent to the API.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A DataFrame whose first columns are ``answer``, ``answered``,
        ``chatId``, ``clientId``, ``messageId``, ``success`` and ``text``,
        followed by any other keys found in the returned rows (except
        ``nowAnswered``).

    Raises:
        requests.HTTPError: If a page request returns an error status.
        KeyError: If no returned row carries a ``nowAnswered`` key.
    """
    messages_url = "https://c.selsup-team.ru/api/bot/findMessage"
    headers = {
        "Cookie": os.getenv("SELSUP_COOKIE")
    }

    def fetch_page(page):
        """Request one 1-based page and return its list of row dicts."""
        res = requests.get(
            messages_url,
            params={"limit": limit, "count": True, "page": page,
                    "sortBy": "TIMESTAMP", "ascending": False},
            headers=headers,
            timeout=timeout,
        )
        # Fail loudly on HTTP error statuses instead of trying to parse an
        # error body as JSON and hitting a confusing KeyError on "rows".
        res.raise_for_status()
        return res.json()["rows"]

    # The empty template keeps the expected columns (and their order) in the
    # result even when a key is missing from every returned row.
    frames = [
        pd.DataFrame(columns=['answer', 'answered', 'chatId',
                              'clientId', 'messageId', "success",
                              "text"])
    ]

    planned_pages = math.ceil(total_count / limit)
    rows = []
    for page in tqdm(range(1, planned_pages + 1)):
        rows = fetch_page(page)
        frames.append(pd.DataFrame(rows))

    # total_count is a snapshot and may be stale: a full last page means the
    # server may hold more rows, so keep paging until a short page arrives.
    # NOTE(review): assumes the API returns fewer than `limit` rows only on
    # the final page and an empty "rows" list past the end — confirm.
    # NOTE(review): with newest-first ordering, messages that arrive while
    # paging could shift rows between pages and produce duplicates — verify
    # whether de-duplication (e.g. on messageId) is needed.
    page = planned_pages
    while len(rows) == limit:
        page += 1
        rows = fetch_page(page)
        frames.append(pd.DataFrame(rows))

    # Concatenate once at the end; re-concatenating inside the loop copies
    # all previously fetched rows on every page.
    df = pd.concat(frames, ignore_index=True)
    return df.pipe(remove_now_answered_column)

# Run the scraper defined above; this issues the paginated HTTP requests to the findMessage
# endpoint and keeps the combined result (without the nowAnswered column) in memory.
messages_df = get_messages_df()

100%|██████████| 72/72 [00:15<00:00,  4.70it/s]


In [88]:
# Persist the scraped messages; index=False leaves the row index out of the CSV.
# The data/ directory must already exist.
messages_df.to_csv("data/messages.csv", index=False)

In [91]:
# Sanity check of the scrape: row count, columns, non-null counts and dtypes.
messages_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35576 entries, 0 to 35575
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   answer     8414 non-null   object
 1   answered   35576 non-null  object
 2   chatId     35576 non-null  object
 3   clientId   35576 non-null  object
 4   messageId  35576 non-null  object
 5   success    35576 non-null  object
 6   text       35576 non-null  object
dtypes: object(7)
memory usage: 1.9+ MB
