# Churn - Preparing

## Load Libraries and Download Datasets

In [None]:
# Core data-handling and plotting libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
# Apply seaborn's "darkgrid" theme to plots.
sns.set_style("darkgrid")

from IPython.display import display, Markdown
# None lifts the cap on displayed columns, so wide frames are shown in full.
pd.set_option('display.max_columns', None)  
# Flag is set here; it is not read by any cell visible in this section.
DEBUG = False

# os is used below to create the working folders and to check for cached downloads.
import os
# Create the working folders up front; exist_ok avoids an error on re-runs.
for d in ('src', 'data', 'output'):
    os.makedirs(d, exist_ok=True)

In [None]:
import urllib.request  # imported once, not on every loop iteration

# Fetch each raw data file into src/ unless a local copy is already present.
for filename in ['churn.csv','states.csv']:
    source = f"https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/01-Module_Introduction/20-Practical_01_-_Review_of_Pandas_-_Churn/files/{filename}"
    target = f"src/{filename}"

    if os.path.isfile(target):
        print(f"Using local copy of {filename}")
        continue

    print(f"Downloading remote file {filename}")
    # Download under a temporary name and rename only after success, so an
    # interrupted transfer never leaves a truncated file at `target` that a
    # later run would mistake for a valid local copy.
    partial = f"{target}.part"
    try:
        urllib.request.urlretrieve(source, partial)
        os.replace(partial, target)
    finally:
        # Only true when the download failed part-way; the error still propagates.
        if os.path.exists(partial):
            os.remove(partial)

---
## Load and Prepare the Data - churn

In [None]:
# Read the raw churn file, show its shape tuple, then preview the first rows.
df = pd.read_csv(filepath_or_buffer="src/churn.csv")
display(df.shape)
df.head(n=5)

In [None]:
# Build an automated profiling report of the churn data as loaded (before any cleaning).
# NOTE(review): upstream has renamed pandas_profiling to ydata-profiling —
# confirm which package is installed in this environment.
import pandas_profiling
pandas_profiling.ProfileReport(df)

#### Standardise column names

In [None]:
# Snapshot the column labels as a plain list before they are renamed.
names = list(df.columns)
print("Original columns names:\n", names)

In [None]:
# Substring -> replacement pairs used to clean up column names.
# They are applied one after another, in the order written here.
CORRECTIONS = {
    " ": "_",
    "'": "",
    "?": "",
    "CustServ": "Cust_Serv",
}

def fixName(s):
    """Return `s` with every substitution in CORRECTIONS applied in turn."""
    cleaned = s
    for old in CORRECTIONS:
        cleaned = cleaned.replace(old, CORRECTIONS[old])
    return cleaned

# Original column name -> cleaned column name, for every captured column.
mapping = dict(zip(names, map(fixName, names)))
mapping

In [None]:
# Apply the cleaned names to the frame's column axis, modifying df in place.
df.rename(mapper=mapping, axis="columns", inplace=True)

#### Drop user-specific features (Phone)

In [None]:
# Remove the Phone column from df in place.
df.drop(labels=["Phone"], axis="columns", inplace=True)

#### Drop highly correlated features

In [None]:
# Remove the four *_Charge columns from df in place.
df.drop(
    labels=["Day_Charge", "Eve_Charge", "Night_Charge", "Intl_Charge"],
    axis="columns",
    inplace=True,
)

#### Standardise labels in target

In [None]:
# Recode the raw target labels to "no"/"yes".
# replace() leaves values that are not keys of the dict untouched, whereas
# map() turns them into NaN — so running this cell a second time (when the
# column already holds "no"/"yes") no longer wipes the whole target column.
df["Churn"] = df["Churn"].replace({"False.": "no", "True.": "yes"})

### Save

In [None]:
# Preview the first five rows of the cleaned frame before saving.
df.head(n=5)

In [None]:
# Write the cleaned churn data to data/, leaving out the row index.
df.to_csv(path_or_buf="data/churn.csv", index=False)

## Load and Prepare Dataset - states

In [None]:
# Read the raw states file (rebinding df), show its shape tuple, then preview it.
df = pd.read_csv(filepath_or_buffer='src/states.csv')
display(df.shape)
df.head(n=5)

### Data Cleaning

 * Column names should be standardised (use title case)

#### Standardise columns

In [None]:
# Title-case every column label, then preview the result.
df.columns = list(map(str.title, df.columns))
df.head(n=5)

### Save

In [None]:
# Write the cleaned states data to data/, leaving out the row index.
df.to_csv(path_or_buf="data/states.csv", index=False)