### Testing code for processing.py creation

#### Data standardization and encoding

In [2]:
import argparse
import os
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer

In [3]:
bucket = 'ml-ai-competency'
df_train_path = os.path.join(f"s3://{bucket}/sagemaker/processed_data/", "df_train.csv")
# save dataset to s3 bucket 
df_train = pd.read_csv(df_train_path)

In [4]:
cat_cols = list(df_train.select_dtypes(include='object').columns)
num_cols = list(df_train.select_dtypes(include='float64').columns)
int_cols = list(df_train.select_dtypes(include='int64').columns)

In [5]:
print(f"cat_cols:{len(cat_cols)}, num_cols:{len(num_cols)}, int_cols:{len(int_cols)}")

cat_cols:26, num_cols:7, int_cols:9


In [6]:
def print_shape(df):
    negative_examples, positive_examples = np.bincount(df["target"])
    print(
        "Data shape: {}, {} positive examples, {} negative examples".format(
            df.shape, positive_examples, negative_examples
        )
    )

In [7]:
print_shape(df_train)

Data shape: (7043, 42), 1869 positive examples, 5174 negative examples


In [8]:
import argparse
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

In [9]:
parser = argparse.ArgumentParser()
parser.add_argument("--train-test-split-ratio", type=float, default=0.2)
args, _ = parser.parse_known_args()

print("Received arguments {}".format(args))

Received arguments Namespace(train_test_split_ratio=0.2)


In [10]:
negative_examples, positive_examples = np.bincount(df_train["target"])
print(
        "Data after cleaning: {}, {} positive examples, {} negative examples".format(
            df_train.shape, positive_examples, negative_examples
        )
    )

Data after cleaning: (7043, 42), 1869 positive examples, 5174 negative examples


In [11]:
df_train.isna().any(axis=1)

0        True
1        True
2        True
3        True
4        True
        ...  
7038    False
7039     True
7040     True
7041     True
7042     True
Length: 7043, dtype: bool

In [12]:
# Create transformers for categorical and numerical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
numerical_transformer = StandardScaler()

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_cols),
        ('num', numerical_transformer, num_cols)
    ]
)

In [13]:
print("Running preprocessing and feature engineering transformations")
df_train_ = preprocessor.fit_transform(df_train.drop("target", axis=1))
print("Train data shape after preprocessing: {}".format(df_train_.shape))

Running preprocessing and feature engineering transformations
Train data shape after preprocessing: (7043, 85)




In [14]:
feat_names_out = preprocessor.get_feature_names_out()
feat_names_out

array(['cat__Senior Citizen_No', 'cat__Senior Citizen_Yes',
       'cat__Partner_No', 'cat__Partner_Yes', 'cat__Dependents_No',
       'cat__Dependents_Yes', 'cat__Phone Service_No',
       'cat__Phone Service_Yes', 'cat__Multiple Lines_No',
       'cat__Multiple Lines_No phone service', 'cat__Multiple Lines_Yes',
       'cat__Internet Service_DSL', 'cat__Internet Service_Fiber optic',
       'cat__Internet Service_No', 'cat__Online Security_No',
       'cat__Online Security_No internet service',
       'cat__Online Security_Yes', 'cat__Online Backup_No',
       'cat__Online Backup_No internet service', 'cat__Online Backup_Yes',
       'cat__Device Protection_No',
       'cat__Device Protection_No internet service',
       'cat__Device Protection_Yes', 'cat__Tech Support_No',
       'cat__Tech Support_No internet service', 'cat__Tech Support_Yes',
       'cat__Streaming TV_No', 'cat__Streaming TV_No internet service',
       'cat__Streaming TV_Yes', 'cat__Streaming Movies_No',
       '

In [15]:
pd.DataFrame(feat_names_out).to_csv('feat_names_out.csv')

In [16]:
df_train_

array([[ 1.        ,  0.        ,  0.        , ..., -1.48630342,
        -1.04870083, -1.15788894],
       [ 1.        ,  0.        ,  1.        , ..., -0.37994091,
        -0.1967954 , -0.30565771],
       [ 1.        ,  0.        ,  1.        , ..., -0.80850263,
        -1.0140621 , -0.35530461],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -1.48630342,
        -0.94770261, -1.10337435],
       [ 0.        ,  1.        ,  0.        , ..., -0.79943939,
        -0.93030995,  0.39261453],
       [ 1.        ,  0.        ,  1.        , ...,  0.5179671 ,
         2.043099  ,  1.28894288]])

#### Data Splitting

We decided to randomly split data from last step into train/validation/test to predict churn per user. 

In [17]:
split_ratio = args.train_test_split_ratio
print("Splitting data into train and test sets with ratio {}".format(split_ratio))
X_train, X_test, y_train, y_test = train_test_split(df_train_, 
                                                        df_train["target"], 
                                                        test_size=split_ratio, 
                                                        random_state=2023)
    
print("Splitting data into train and validation sets with ratio {}".format(split_ratio))
X_train, X_valid, y_train, y_valid = train_test_split(X_train, 
                                                        y_train, 
                                                        test_size=split_ratio, 
                                                        random_state=2023)

Splitting data into train and test sets with ratio 0.2
Splitting data into train and validation sets with ratio 0.2


In [18]:
y_train.index

Index([7004,  847, 3673,  772, 4642, 5174, 6848, 1238,  445, 3211,
       ...
       6290, 4486,  557, 2332, 3738, 5205, 2139, 6920,  686, 6710],
      dtype='int64', length=4507)

In [19]:
train = pd.concat([pd.Series(y_train, index=y_train.index,
                             name='target', dtype=int), pd.DataFrame(X_train, index=y_train.index)], axis=1)

In [20]:
train

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,75,76,77,78,79,80,81,82,83,84
7004,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.008322,0.621148,-0.248313,-0.583651,-1.051915,0.318255,1.071352
847,0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,-1.050644,-0.811145,-0.248313,-0.884833,-1.486303,-0.903884,-0.918936
3673,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.229341,0.878029,-0.248313,0.047343,-0.269887,0.707941,1.224149
772,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,-1.491021,-0.998584,-0.248313,-0.826352,1.718846,-1.034893,-1.487537
4642,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.333633,1.119226,-0.248313,2.222361,1.133620,1.541473,0.309992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5205,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.878702,0.484865,-0.248313,-0.372182,-0.726933,0.272925,0.881736
2139,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.069407,-0.805256,-0.248313,-0.486628,1.631451,-0.781556,0.021317
6920,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,-1.497668,-0.782667,-0.248313,0.144650,0.603420,-0.577144,-1.521894
686,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,-1.334812,-0.448644,5.814668,1.185047,0.738074,-0.022174,-1.324158


In [21]:
pd.DataFrame(X_train, index=y_train.index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,75,76,77,78,79,80,81,82,83,84
7004,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.008322,0.621148,-0.248313,-0.583651,-1.051915,0.318255,1.071352
847,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,-1.050644,-0.811145,-0.248313,-0.884833,-1.486303,-0.903884,-0.918936
3673,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.229341,0.878029,-0.248313,0.047343,-0.269887,0.707941,1.224149
772,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,-1.491021,-0.998584,-0.248313,-0.826352,1.718846,-1.034893,-1.487537
4642,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.333633,1.119226,-0.248313,2.222361,1.133620,1.541473,0.309992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5205,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.878702,0.484865,-0.248313,-0.372182,-0.726933,0.272925,0.881736
2139,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.069407,-0.805256,-0.248313,-0.486628,1.631451,-0.781556,0.021317
6920,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,-1.497668,-0.782667,-0.248313,0.144650,0.603420,-0.577144,-1.521894
686,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,-1.334812,-0.448644,5.814668,1.185047,0.738074,-0.022174,-1.324158


In [22]:
train = pd.concat([pd.Series(y_train, index=y_train.index,
                             name='target', dtype=int), pd.DataFrame(X_train, index=y_train.index)], axis=1)
validation = pd.concat([pd.Series(y_valid, index=y_valid.index,
                            name='target', dtype=int), pd.DataFrame(X_valid, index=y_valid.index)], axis=1)
test = pd.concat([pd.Series(y_test, index=y_test.index,
                            name='target', dtype=int), pd.DataFrame(X_test, index=y_test.index)], axis=1)

In [23]:
train.shape

(4507, 86)

In [24]:
negative_examples, positive_examples = np.bincount(train["target"])
print(
        "Train data after spliting: {}, {} positive examples, {} negative examples, {} churn rate".format(
            train.shape, positive_examples, negative_examples, round(100*positive_examples/(positive_examples+negative_examples),2) 
        )
    )

negative_examples, positive_examples = np.bincount(validation["target"])
print(
        "Validation data after spliting: {}, {} positive examples, {} negative examples, {} churn rate".format(
            validation.shape, positive_examples, negative_examples, round(100*positive_examples/(positive_examples+negative_examples),2) 
        )
    )
    
negative_examples, positive_examples = np.bincount(test["target"])
print(
        "Test data after spliting: {}, {} positive examples, {} negative examples, {} churn rate".format(
            test.shape, positive_examples, negative_examples, round(100*positive_examples/(positive_examples+negative_examples),2) 
        )
    )

Train data after spliting: (4507, 86), 1165 positive examples, 3342 negative examples, 25.85 churn rate
Validation data after spliting: (1127, 86), 323 positive examples, 804 negative examples, 28.66 churn rate
Test data after spliting: (1409, 86), 381 positive examples, 1028 negative examples, 27.04 churn rate
