This notebook was developed using the `Python 3 (Data Science)` kernel on an `ml.t3.medium` instance.

In [None]:
!pip install -q awswrangler

In [None]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
import awswrangler as wr

# S3 key prefix used when building the output paths in this notebook.
prefix = "sagemaker-studio-book/chapter03"

# SageMaker session, and the bucket returned by its default_bucket() call.
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [None]:
# Copy churn.txt from the sagemaker-sample-files bucket into the current working directory.
!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/synthetic/churn.txt ./

In [None]:
# Read the downloaded file and add the row index as a `CustomerID` column.
df = pd.read_csv('./churn.txt').assign(CustomerID=lambda frame: frame.index)
df.head()

In [None]:
# Columns in which missing values are injected by the next cell.
columns_with_nan = [
    "Account Length",
    "CustServ Calls",
]

In [None]:
# Inject missing values: each cell of the selected columns is independently
# replaced with NaN with probability `nan_fraction`.
# A seeded generator keeps the NaN positions identical across
# Restart-and-Run-All executions (the global np.random state was unseeded).
nan_fraction = 5e-2
nan_rng = np.random.default_rng(42)

df2 = df.copy()
nan_mask = nan_rng.random(df2[columns_with_nan].shape) < nan_fraction
df2[columns_with_nan] = df2[columns_with_nan].mask(nan_mask)

In [None]:
# Preview the first rows of the frame after NaN injection.
df2.head()

In [None]:
# Column groups used to split the dataset into three tables.
# Every group includes `CustomerID`.
customer_columns = [
    "CustomerID",
    "State",
    "Area Code",
    "Phone",
]

account_columns = [
    "CustomerID",
    "Account Length",
    "Int'l Plan",
    "VMail Plan",
    "Churn?",
]

utility_columns = [
    "CustomerID",
    "VMail Message",
    "Day Mins",
    "Day Calls",
    "Day Charge",
    "Eve Mins",
    "Eve Calls",
    "Eve Charge",
    "Night Mins",
    "Night Calls",
    "Night Charge",
    "Intl Mins",
    "Intl Calls",
    "Intl Charge",
    "CustServ Calls",
]

In [None]:
# Create the Glue Data Catalog database `telco_db` unless it is already listed.
databases = wr.catalog.databases()
print(databases)
db_name = 'telco_db'
# Check the database-name column only. Testing against `databases.values`
# searches every cell of the frame, so a description equal to `db_name`
# would wrongly report the database as existing.
if db_name not in set(databases['Database']):
    wr.catalog.create_database(db_name, description='Sample DB for telco churn dataset')
    print(wr.catalog.databases())
else:
    print(f"Database {db_name} already exists")

In [None]:
# Split the NaN-injected frame into three tables and upload each one to S3
# under s3://{bucket}/{prefix}/data/.
dfs = []
suffix = ['customer_info', 'account_info', 'utility']
for i, columns in enumerate([customer_columns, account_columns, utility_columns]):
    df_tmp = df2[columns]
    print(columns)
    # A bare `df_tmp.head()` is silently discarded inside a loop; call
    # display() (available in notebook kernels) so the preview actually renders.
    display(df_tmp.head())
    dfs.append(df_tmp)
    fname = f'telco_churn_{suffix[i]}'
    outputpath = f's3://{bucket}/{prefix}/data/{fname}'
    print(outputpath)
    if i > 1:
        # Only the last table (utility) is written as a dataset and registered
        # as a table in the `db_name` catalog database.
        wr.s3.to_csv(
            df=df_tmp,
            path=outputpath,
            dataset=True,
            database=db_name,  # Athena/Glue database
            table=fname,  # Athena/Glue table
            index=False,
            mode='overwrite')
    else:
        # The first two tables are uploaded as single plain CSV objects.
        wr.s3.to_csv(
            df=df_tmp,
            path=f'{outputpath}.csv',
            index=False)