# US Churn &mdash; Import

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
# Plot styling: use seaborn's "darkgrid" theme for all figures.
sns.set_style("darkgrid")
# Remove pandas' column display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

import sys, os, yaml

# Name of the dataset; also used to build the Colab storage path below.
DATASET = "US_Churn"

# Colab is detected by the google.colab module already being loaded.
COLAB = 'google.colab' in sys.modules

# Working directory: a per-dataset folder on the mounted Drive when on Colab,
# otherwise the current directory.
ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/" if COLAB else "./"

# Notebook-wide switches.
DEBUG = False
SEED = 1612

In [None]:
# On Colab, make sure Google Drive is mounted and the dataset folder exists.
if COLAB:
    from google.colab import drive

    # Mount only when the mount point is not already present.
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")

    # Create the dataset folder whether or not the mount happened in this run;
    # previously this was skipped when Drive was already mounted.
    # os.makedirs also creates the missing parent "datasets" folder, and
    # exist_ok=True makes re-running the cell harmless.
    os.makedirs(ROOT, exist_ok=True)

def makedirs(d):
    """Create the directory ``ROOT + d`` if it does not already exist.

    Args:
        d: Directory path appended directly to ``ROOT`` (no separator is
            inserted between the two).
    """
    # exist_ok=True makes the call idempotent without a separate isdir() check
    # and behaves the same on Colab and locally, so no platform branch is needed.
    os.makedirs(ROOT + d, exist_ok=True)


# Working folders used by the notebook.
for d in ['orig', 'data', 'output']:
    makedirs(d)

## Datasets

In [None]:
import urllib.request

BASE_URL = "https://SETU-DataMining2.github.io/live/resources/us_churn"

# Fetch each raw file into ROOT/orig unless a local copy is already present.
for filename in ['churn.csv',]:
    source = f"{BASE_URL}/{filename}"
    target = f"{ROOT}/orig/{filename}"

    if os.path.isfile(target):
        print(f"Using local copy of {filename}")
        continue

    print(f"Downloading remote file {filename}")
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that a later run would accept as
    # the local copy.
    partial = f"{target}.part"
    try:
        urllib.request.urlretrieve(source, partial)
        os.replace(partial, target)
    finally:
        # Only present if the download or rename failed part-way.
        if os.path.isfile(partial):
            os.remove(partial)

In [None]:
# Load the raw CSV from ROOT/orig into a DataFrame.
df = pd.read_csv(f"{ROOT}/orig/churn.csv")
# Report (rows, columns) and preview the first rows.
print(df.shape)
df.head()

In [None]:
# Summarise the raw columns: names, non-null counts and dtypes.
df.info()

Issues are:

 * Poor column names - spaces and punctuation
 * Inconsistent labels for boolean columns - convert to categorical also
 * Need to encode target column  - convert to categorical also
 * `Area_Code` should be categorical (3 levels)
 * `State` should be categorical (51 levels) - open question: how to handle this many levels? Decide during EDA
 * Unique identifier column `Phone`

In [None]:
# Poor column names - spaces and punctuation
# Spaces become underscores; apostrophes and question marks are removed.
df.columns = [name.translate(str.maketrans(" ", "_", "'?")) for name in df.columns]

In [None]:
# Inconsistent labels for boolean columns
# Recode every column whose name contains "Plan" from no/yes to 0/1 and store
# it as a categorical.
for c in [c for c in df.columns if "Plan" in c]:
    # Only raw text columns are recoded, so re-running the cell is a no-op once
    # a column is categorical. Comparing the dtype with "object" alone misses
    # pandas' dedicated string dtype, so both kinds of text column are accepted.
    if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c]):
        # NOTE: any label other than "no"/"yes" maps to NaN.
        df[c] = pd.Categorical(df[c].map({"no": 0, "yes": 1}))

In [None]:
# Need to encode target column
# Recode the target from "False."/"True." to 0/1 and store it as a categorical.
# Only a raw text column is recoded, so re-running the cell is a no-op.
# Comparing the dtype with "object" alone misses pandas' dedicated string
# dtype, so both kinds of text column are accepted.
if pd.api.types.is_object_dtype(df["Churn"]) or pd.api.types.is_string_dtype(df["Churn"]):
    # NOTE: any label other than "False."/"True." maps to NaN.
    df["Churn"] = pd.Categorical(df["Churn"].map({"False.": 0, "True.": 1}))

In [None]:
# Encode Area_Code column
# The presence of code 0 marks an already-encoded column, which keeps this
# cell safe to re-run.
if 0 not in df["Area_Code"].unique():
    # Replace the three raw area codes with levels 0-2, stored as a categorical.
    df["Area_Code"] = pd.Categorical(
        df["Area_Code"].map({415: 0, 510: 1, 408: 2})
    )

In [None]:
# Store State as a categorical, keeping its original labels.
# Only a raw text column is converted, so re-running the cell is a no-op.
# Comparing the dtype with "object" alone misses pandas' dedicated string
# dtype, so both kinds of text column are accepted.
if pd.api.types.is_object_dtype(df["State"]) or pd.api.types.is_string_dtype(df["State"]):
    df["State"] = pd.Categorical(df["State"])
# Show the number of distinct states.
df.State.nunique()

In [None]:
# Remove unwanted columns
# errors="ignore" skips names that are already gone, so the cell can be re-run.
df.drop(columns=["Phone"], errors="ignore", inplace=True)

## Save dataset

In [None]:
# Preview the first rows of the frame before saving.
df.head()

In [None]:
# Check the column names and dtypes of the frame before saving.
df.info()

In [None]:
# Write the frame to ROOT/data in Feather format
# (presumably chosen so the categorical dtypes survive a reload - confirm).
df.to_feather(f"{ROOT}/data/churn.feather")