In [1]:
import pandas as pd
import os

In [2]:
# find all tm_*.csv files in the current working directory.
# os.listdir() returns entries in arbitrary order, so the matches are sorted
# to make the load order (and the concatenated row index) reproducible.
files_list = sorted(
    name for name in os.listdir()
    if name.startswith('tm_') and name.endswith('.csv')
)

In [3]:
# create dataFrame with subsid: read every file in files_list as a
# ';'-separated CSV and stack them into one frame with a fresh 0..n-1 index.
# parse_dates=[-1] is kept from the original; presumably it targets the last
# column (the timestamp) -- TODO confirm against the pandas version in use.
# All frames are read first and concatenated once: concatenating inside the
# loop re-copies every accumulated row on each iteration and starts from an
# empty DataFrame, which newer pandas versions warn about.
frames = [pd.read_csv(csv_name, sep=';', parse_dates=[-1]) for csv_name in files_list]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# files matched (same result the original loop produced in that case).
subsid = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [4]:
# prepare dataFrame to work: rows without a SUBS_ID are discarded
subsid = subsid.dropna(subset=['SUBS_ID'])


def _ensure_id_prefix(subs_id):
    """Return subs_id unchanged if it already starts with 'id', else prepend 'id'."""
    if subs_id.startswith('id'):
        return subs_id
    return 'id' + subs_id


# unify ID so that every identifier carries the 'id' prefix
subsid['SUBS_ID'] = subsid['SUBS_ID'].apply(_ensure_id_prefix)

In [5]:
# show the prepared frame (rows without SUBS_ID dropped, IDs prefixed with 'id')
subsid

Unnamed: 0,SUBS_ID,FILIAL_ID,PROD_ID,ACT_DTTM
0,id4651830,1,1954,2020-03-20 14:59:00
1,id7646509,5,6431,2020-03-19 13:00:00
2,id7412683,4,3313,2020-03-22 17:25:00
3,id5416547,3,1743,2020-03-17 10:17:00
5,id8362218,7,9879,2020-05-03 11:42:00
6,id2185490,2,3210,2020-03-16 16:28:00
7,id5764122,3,1499,2020-03-18 15:44:00
8,id7642700,6,3020,2020-03-15 14:21:00
9,id1374509,2,5677,2020-03-17 11:48:00


In [6]:
# load the activations log, a semicolon-delimited CSV, into a dataFrame
path_to_activations = 'prod_activations_logs.csv'
activations = pd.read_csv(
    filepath_or_buffer=path_to_activations,
    delimiter=';',
)

In [7]:
# change date formats: parse the 'day-month-year hour:minute' text in the
# START_DTTM / END_DTTM columns into datetimes.
# Columns are written with bracket indexing: attribute-style assignment
# (activations.COL = ...) only works for columns that already exist and whose
# names do not collide with a DataFrame attribute or method.
DTTM_FORMAT = '%d-%m-%Y %H:%M'
for dttm_column in ('START_DTTM', 'END_DTTM'):
    activations[dttm_column] = pd.to_datetime(activations[dttm_column], format=DTTM_FORMAT)

In [8]:
# delete short connections: compute each activation's duration and keep only
# those lasting strictly longer than 5 minutes
activations['difference'] = activations['END_DTTM'] - activations['START_DTTM']
is_long_enough = activations['difference'] > pd.Timedelta(minutes=5)
result = activations[is_long_enough]
result

Unnamed: 0,SUBS_ID,PROD_ID,START_DTTM,END_DTTM,difference
1,id4651830,1954,2020-03-20 14:59:00,2020-12-01 00:00:00,255 days 09:01:00
2,id7461794,3310,2020-03-20 17:25:00,2020-12-01 00:00:00,255 days 06:35:00
3,id5416547,1743,2020-03-17 10:17:00,2020-03-25 11:00:00,8 days 00:43:00
3,id8641743,2752,2020-03-20 15:44:00,2020-04-21 15:44:00,32 days 00:00:00
2,id2185490,3210,2020-03-16 16:28:00,2020-12-01 00:00:00,259 days 07:32:00
6,id7642700,3020,2020-03-15 14:21:00,2020-03-15 23:42:00,0 days 09:21:00
2,id8741631,5677,2020-03-19 12:28:00,2020-12-01 00:00:00,256 days 11:32:00


In [9]:
# save correct connections to the file: inner-join the subscribers with the
# duration-filtered activations on SUBS_ID, then export three columns as a
# ';'-separated file without the index
correct_connections = subsid.merge(result, on='SUBS_ID', how='inner')
export_columns = ['ACT_DTTM', 'FILIAL_ID', 'difference']
export_frame = correct_connections[export_columns]
export_frame.to_csv('correct_connections', sep=';', index=False)

In [10]:
# print SUBS_ID for correct connections: sorted ascending, comma-separated on one line
ordered_ids = list(correct_connections['SUBS_ID'].sort_values())
print(', '.join(str(subs_id) for subs_id in ordered_ids))

id2185490, id4651830, id5416547, id7642700
