## Script to fetch disconnected accounts data and export monthly counts to file

In [1]:
# import libraries
import pandas as pd
import os

### Fetch and aggregate data from files

In [20]:
# Build `disconnections_df`: one row per (file, disconnection date) holding the
# number of non-null 'Address' entries recorded for that date.
directory = r'../data/raw/disconnected'
daily_frames = []

# os.listdir() returns entries in arbitrary order; sorting makes runs
# reproducible.
for file in sorted(os.listdir(directory)):
    if not file.endswith('.csv'):
        continue

    # read in file
    df = pd.read_csv(os.path.join(directory, file))

    # The date is the part of 'Date' before the first space. Round-tripping it
    # through datetime normalises the text to a 'YYYY-MM-DD' string.
    date_part = df['Date'].str.split(' ').str[0]
    df['disconnectionDate'] = pd.to_datetime(date_part).astype(str)

    # aggregate data by disconnection date (groupby returns the dates sorted
    # ascending) and name the resulting column 'count'
    df = (
        df.groupby('disconnectionDate')['Address']
        .count()
        .rename('count')
        .reset_index()
    )

    daily_frames.append(df)

# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies every accumulated row on each iteration; concatenate once instead.
if daily_frames:
    disconnections_df = pd.concat(daily_frames, ignore_index=True)
else:
    # no CSV files found: keep an empty frame with the expected columns
    disconnections_df = pd.DataFrame(columns=['disconnectionDate', 'count'])


### Clean data

In [21]:
# Turn `disconnections_df` into a gap-free daily series: exactly one row per
# calendar day, with a count of 0 on days that have no recorded disconnections.

# convert disconnections_df dates from string to datetime
disconnections_df['disconnectionDate'] = pd.to_datetime(disconnections_df['disconnectionDate'])

# Collapse to one value per date first. The same date can arrive from more
# than one input file, and reindexing onto a daily calendar raises on
# duplicate index labels.
daily_counts = disconnections_df.groupby('disconnectionDate')['count'].sum()

# The series is always zero-padded over 2021-01-01 .. 2021-04-26 and widened
# if the data reaches outside that window. Building a single continuous range
# avoids both duplicated days (data starting on or before 04-26) and missing
# days (data starting after 04-27).
pad_start = pd.Timestamp('2021-01-01')
pad_end = pd.Timestamp('2021-04-26')
if daily_counts.empty:
    range_start, range_end = pad_start, pad_end
else:
    range_start = min(pad_start, daily_counts.index.min())
    range_end = max(pad_end, daily_counts.index.max())

all_days = pd.date_range(range_start, range_end, freq='D', name='disconnectionDate')

# fill_value=0 keeps the counts as integers instead of introducing NaN
disconnections_df = (
    daily_counts.reindex(all_days, fill_value=0)
    .astype('int64')
    .reset_index()
)

# convert dates back to string
disconnections_df['disconnectionDate'] = disconnections_df['disconnectionDate'].astype(str)

In [22]:
# Aggregate the daily counts into monthly totals, largest month first.

# Label each day with the first day of its calendar month ('YYYY-MM-01').
# The year is taken from the date itself rather than hard-coded.
month_start = (
    pd.to_datetime(disconnections_df['disconnectionDate'])
    .dt.to_period('M')
    .dt.to_timestamp()
)
disconnections_df['month'] = month_start.astype(str)

# Sum only the 'count' column: a frame-wide sum() would also try to "sum" the
# date strings (pandas >= 2.0 concatenates them into the result).
byMonth = (
    disconnections_df.groupby('month')['count']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

### Export data

In [24]:
# Write the monthly totals to the cleaned-data folder as CSV (the frame's
# index is written as the first column).
output_path = '../data/clean/2021-draft-by-month.csv'
byMonth.to_csv(output_path)
# Alternative JSON export, currently disabled:
# byMonth.to_json(r'../data/clean/disconnections/2021-draft-by-month.json', orient="records", date_format='%Y-%m-%d')