In [1]:
import os
import pandas as pd
import glob
from utils.utils import init_config

In [2]:
# Path of the JSON configuration file, relative to the notebook's working directory.
config_path = "config/config.json"
# Load the settings through the project helper; the cells below index the
# result by string key (e.g. config['merged_data_path']).
config = init_config(config_path)

In [3]:
# Folder locations read from the loaded configuration, bound in one statement.
DIALOGS_DATA_PATH, DIALOGS_META_DATA_PATH, MERGED_DATA_PATH = (
    config["dialogs_data_folder"],  # searched for *.csv files below
    config["dialogs_list_folder"],  # searched for *.json files below
    config["merged_data_path"],     # destination folder of the merged CSVs
)

In [4]:
# List the data directory via an IPython shell escape (Unix `ls`).
# NOTE(review): this path is hard-coded and not derived from `config` — confirm
# it points at the same folders as the configured paths above.
!ls ../data/

dialogs      dialogs_meta gifs_folder  merged_data  mp4_folder


In [5]:
# Collect the per-dialog exports. glob returns matches in an arbitrary,
# filesystem-dependent order, so the lists are sorted to keep the row order of
# the merged outputs reproducible between runs and machines.
dialogs_data_files = sorted(glob.glob(f"{DIALOGS_DATA_PATH}/*.csv"))
dialogs_meta_data_files = sorted(glob.glob(f"{DIALOGS_META_DATA_PATH}/*.json"))

## Merging all dialogs_data (CSV files) into one

In [6]:
# Read every per-dialog CSV and stack the results into a single frame `df`.
# Each row is tagged with a `dialog_id` taken from the file name (the part of
# the base name before the first dot). Files that are empty or cannot be parsed
# are reported and skipped so one bad export does not abort the whole merge.
df_array = []

for d in dialogs_data_files:
    try:
        # low_memory=False lets pandas infer each column's dtype from the whole
        # file instead of chunk by chunk, which avoids the mixed-type
        # DtypeWarning (presumably the warning seen in this cell's recorded
        # output) and inconsistent value types inside one column.
        local_df = pd.read_csv(d, low_memory=False)
        local_df["dialog_id"] = os.path.basename(d).split(".")[0]
        df_array.append(local_df)
    except pd.errors.EmptyDataError:
        print(f"Warning: The file {d} is empty and will be skipped.")
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"Error: Could not process the file {d}. Reason: {e}")

if df_array:
    df = pd.concat(df_array, ignore_index=True)
    print("Dataframe successfully created from non-empty files.")
else:
    # Bind `df` explicitly so the message below is true: otherwise later cells
    # fail with NameError on a fresh kernel, or silently reuse a stale `df`
    # left over from a previous run.
    df = pd.DataFrame()
    print("No valid files to process. The dataframe is empty.")




  local_df = pd.read_csv(d)


Dataframe successfully created from non-empty files.


In [7]:
# (rows, columns) of the merged frame, before the "Unnamed: 0" column is dropped in the save cell below.
df.shape

(8416543, 11)

In [8]:
# Write the merged messages to <MERGED_DATA_PATH>/dialogs_data_all.csv.

# makedirs(exist_ok=True) creates missing parent folders as well and does not
# fail when the folder already exists, so the cell is safe to re-run.
os.makedirs(MERGED_DATA_PATH, exist_ok=True)

# Remove the "Unnamed: 0" column when present (presumably a row index saved by
# the export step — confirm). `columns=` names the axis explicitly instead of
# relying on `axis=True` being treated as 1, and errors="ignore" makes the
# call a no-op when the column is absent.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

df.to_csv(f"{MERGED_DATA_PATH}/dialogs_data_all.csv", index=False)

In [9]:
# (rows, columns) again, now that the "Unnamed: 0" column has been dropped.
df.shape

(8416543, 10)

In [10]:
# Preview the first ten merged rows.
# NOTE(review): the rendered rows contain private message text and user ids —
# clear or redact this cell's output before sharing or committing the notebook.
df.head(10)

Unnamed: 0,id,date,from_id,to_id,fwd_from,message,type,duration,reactions,dialog_id
0,170803,2023-06-16 08:39:46+00:00,,749006059,,думаешь у меня есть желание с тобой говорить? ...,text,,{},611662350
1,170802,2023-06-16 08:06:56+00:00,PeerUser(user_id=749006059),611662350,,хулі ігнориш,text,,{},611662350
2,170786,2023-06-15 18:45:18+00:00,PeerUser(user_id=749006059),611662350,,Ти вже писав нмт?,text,,{},611662350
3,170785,2023-06-15 18:45:13+00:00,PeerUser(user_id=749006059),611662350,,Доров,text,,{},611662350
4,114007,2022-11-19 10:26:19+00:00,PeerUser(user_id=749006059),611662350,,Ясно,text,,{},611662350
5,114004,2022-11-19 10:02:06+00:00,,749006059,,"а ""Іванов"" це одне з найчастіших імен",text,,{},611662350
6,114003,2022-11-19 10:01:48+00:00,,749006059,,"ти сам здається підписався просто як roman, ні?",text,,{},611662350
7,114002,2022-11-19 10:01:33+00:00,,749006059,,,text,,{},611662350
8,114001,2022-11-19 10:01:26+00:00,,749006059,,бо трохи не полюбляю виставляти ім'я в інтернеті,text,,{},611662350
9,114000,2022-11-19 09:57:34+00:00,PeerUser(user_id=749006059),611662350,,"Чому ти підписаний ""Іванов""? Якщо не секрет",text,,{},611662350


In [11]:
# Earliest and latest message timestamp. Series.min()/max() run vectorised and
# skip missing values, whereas the builtin min()/max() loop over every row in
# Python and raise TypeError when a NaN is mixed in with strings.
# NOTE(review): `date` appears to hold timestamp strings in one fixed format
# and offset, so the comparison is lexicographic — confirm, or convert with
# pd.to_datetime first.
df["date"].min(), df["date"].max()

('2017-05-29 04:52:49+00:00', '2024-11-26 10:40:16+00:00')

In [12]:
# Number of messages per message type (non-null `type` values only).
df.groupby("type")["type"].count()

type
photo       776953
sticker     175799
text       7017409
video       401153
voice        45229
Name: type, dtype: int64

In [13]:
# Total of the `duration` column for each message type.
df.groupby("type")["duration"].sum()

type
photo      0.000000e+00
sticker    0.000000e+00
text       0.000000e+00
video      1.294413e+07
voice      2.548173e+09
Name: duration, dtype: float64

## Merging all dialogs_meta_data (JSON files) into one

In [14]:
# Read every dialog metadata JSON file and stack the results into `df_meta`,
# renaming the `id` column to `dialog_id` so it uses the same key name as the
# merged messages frame.
df_array = []

for d in dialogs_meta_data_files:
    local_df = pd.read_json(d)
    local_df = local_df.rename(columns={'id': 'dialog_id'})
    df_array.append(local_df)

if df_array:
    df_meta = pd.concat(df_array, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; mirror the CSV merge cell
    # and fall back to an empty frame with an explicit message instead.
    df_meta = pd.DataFrame()
    print("No valid files to process. The dataframe is empty.")

In [15]:
# Persist the merged dialog metadata next to the merged messages, without the row index.
df_meta.to_csv(
    path_or_buf=f"{MERGED_DATA_PATH}/dialogs_users_all.csv",
    index=False,
)

In [16]:
# (rows, columns) of the merged dialog metadata.
df_meta.shape

(27379, 4)

In [17]:
# Preview the first ten metadata rows.
# NOTE(review): the rendered rows contain dialog names and user details —
# clear or redact this cell's output before sharing or committing the notebook.
df_meta.head(10)

Unnamed: 0,dialog_id,name,type,users
0,1032732069,StikStok Bot,Private dialog,"{'user_id': 1032732069, 'first_name': 'StikSto..."
1,810142558,Masha,Private dialog,"{'user_id': 810142558, 'first_name': 'Masha', ..."
2,-1001419124412,UPML_HUB,Group,"{'user_id': 749006059, 'first_name': 'Roman', ..."
3,-1001419124412,UPML_HUB,Group,"{'user_id': 1008861710, 'first_name': 'Антон',..."
4,942619663,diana,Private dialog,"{'user_id': 942619663, 'first_name': 'diana', ..."
5,-1001592132299,Дупа кабана,Group,"{'user_id': 749006059, 'first_name': 'Roman', ..."
6,-1001592132299,Дупа кабана,Group,"{'user_id': 572990632, 'first_name': 'Лис', 'l..."
7,-1001592132299,Дупа кабана,Group,"{'user_id': 356833338, 'first_name': 'Vladenou..."
8,-1001592132299,Дупа кабана,Group,"{'user_id': 1349326050, 'first_name': 'CaLIJKo..."
9,-1001592132299,Дупа кабана,Group,"{'user_id': 292717879, 'first_name': 'Oksana',..."


In [18]:
# Number of metadata rows per dialog type (non-null `type` values only).
df_meta.groupby("type")["type"].count()

type
Channel             129
Group             26959
Private dialog      291
Name: type, dtype: int64