# Phase 5 - Improve Performance Using Feature Engineering

In this notebook, I am going to predict churn using feature selection. Selecting features will be an iterative process; I will train an SVM model, list features by importance, and remove the least important feature. Then, I will train the model using n-1 features. This loop ends when we have only one feature remained. Adding features to a FIFO stack, along the way, gives us the list of features in ascending order.

## Import Required Packages

In [21]:
# Standard Python packages
from math import sqrt
import pickle

# Data packages
import pandas as pd
import numpy as np

# Visualization Packages
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

# Data preprocessing packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
roc_auc_score, roc_curve, auc,\
confusion_matrix, classification_report,\
ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay


In [2]:
data = pd.read_csv("Data/Coursera.csv")
data.shape

  data = pd.read_csv("Data/Coursera.csv")


(413955, 37)

In [3]:
data.dtypes.iloc[[6,7,25,31]]

is_professional_certificate                object
is_gateway_certificate                     object
is_subscription_started_with_free_trial    object
is_active_capstone_during_pay_period       object
dtype: object

In [4]:
for i in data.dtypes.iloc[[6,7,25,31]].index:
    print(f"Unique values in column {i}: {data[i].unique()}")


Unique values in column is_professional_certificate: [True False nan]
Unique values in column is_gateway_certificate: [True False nan]
Unique values in column is_subscription_started_with_free_trial: [False True nan]
Unique values in column is_active_capstone_during_pay_period: [False True nan]


In [5]:
# Remove duplicates (if any)
data.duplicated().sum()


0

In [6]:
# Remove nans
data = data.dropna()
# data.isna().sum()
# df0.info()
data.shape

(413953, 37)

In [7]:
# Define preprocessing steps
def preprocess_data(df):    

    # drop null rows
    df = df.dropna()

    # remove highly correlated variables
    df = df.drop([ 'days_til_next_payment_due', # correlated with 'days_since_last_payment'
                   'specialization_id',
                   'subscription_id', 'observation_dt'
                   ], axis=1)

    # map learner country groups by geography
    # create a dictionary to map learner country groups
    country_map = {'Northern Europe': 'Europe',
                    'Australia and New Zealand': 'ANZ',
                    'United States': 'NorthAm',
                    'India': 'Asia',
                    'East Asia': 'Asia',
                        'Eastern Europe': 'Europe',
                        'Southern Europe': 'Europe',
                        'Southeast Asia': 'Asia', 
                        'Middle East': 'MENA',
                        'Africa and developing Middle East': 'MENA',
                        'China': 'Asia', 
                        'Canada': 'NorthAm',
                        'Non-Brazil Latin America': 'LatAm', 
                        'Brazil': 'LatAm',
                        'Russia and neighbors': 'Europe'}

    # map the learner country groups
    df['learner_country_group'] = df['learner_country_group'].map(country_map)

    # group 'other' gender into 'unknown'
    df['learner_gender'] = df['learner_gender'].replace('other', 'unknown')
    # df['learner_gender'] = df['learner_gender'].cat.rename_categories(
    #     {'other': 'unknown','female':'female', 'male':'male', 'unknown':'unknown'})
    

    # -----------------------------------------------------------------------------------------------------------------------------------------------

    # create new feature, 'pct_learner_paid_active'
    try:
        df['pct_learner_paid_active'] = df['learner_cnt_other_courses_paid_active'] / df['learner_cnt_other_courses_active']
    except ZeroDivisionError:
        df['pct_learner_paid_active']=0

    # create new feature, 'pct_learner_paid_items_completed'
    try:
        df['pct_learner_paid_items_completed'] = df['learner_cnt_other_courses_paid_items_completed'] / df['learner_cnt_other_courses_items_completed']
    except ZeroDivisionError:
        df['pct_learner_paid_items_completed']=0

    # create new feature, 'revenue_per_transaction'
    try:
        df['revenue_per_transaction'] = df['learner_other_revenue'] / df['learner_cnt_other_transactions_past']
    except ZeroDivisionError:
        df['revenue_per_transaction']=0

    # -----------------------------------------------------------------------------------------------------------------------------------------------

    # create new feature, 'pct_enrollments_active_before_payment_period'
    try:
        df['pct_enrollments_active_before_payment_period'] = df['cnt_enrollments_active_before_payment_period'] / df['cnt_enrollments_started_before_payment_period']
    except ZeroDivisionError:
        df['pct_enrollments_active_before_payment_period']=0
    
    # create new feature, 'pct_enrollments_completed_before_payment_period'
    try:
        df['pct_enrollments_completed_before_payment_period'] = df['cnt_enrollments_completed_before_payment_period'] / df['cnt_enrollments_started_before_payment_period']
    except ZeroDivisionError:
        df['pct_enrollments_completed_before_payment_period']=0

    # create new feature, 'pct graded items completed before payment period'
    try:
        df['pct_graded_items_completed_before_payment_period'] = df['cnt_graded_items_completed_before_payment_period'] / df['cnt_items_completed_before_payment_period']
    except ZeroDivisionError:
        df['pct_graded_items_completed_before_payment_period']=0

    # -----------------------------------------------------------------------------------------------------------------------------------------------

    # create new feature, 'pct_enrollments_active_during_payment_period'
    try:
        df['pct_enrollments_active_during_payment_period'] = df['cnt_enrollments_active_during_payment_period'] / df['cnt_enrollments_started_during_payment_period']
    except ZeroDivisionError:
        df['pct_enrollments_active_during_payment_period']=0
        
    # create new feature, 'pct_enrollments_completed_during_payment_period'
    try:
        df['pct_enrollments_completed_during_payment_period'] = df['cnt_enrollments_completed_during_payment_period'] / df['cnt_enrollments_started_during_payment_period']
    except ZeroDivisionError:
        df['pct_enrollments_completed_during_payment_period']=0

    # create new feature, 'pct_graded_items_completed_during_payment_period'
    try:
        df['pct_graded_items_completed_during_payment_period'] = df['cnt_graded_items_completed_during_payment_period'] / df['cnt_items_completed_during_payment_period']
    except ZeroDivisionError:
        df['pct_graded_items_completed_during_payment_period']=0

    # -----------------------------------------------------------------------------------------------------------------------------------------------

    # create new feature, 'hrs_per_day_active_before_payment_period'
    try:
        df['hrs_per_day_active_before_payment_period'] = df['sum_hours_learning_before_payment_period'] / df['cnt_days_active_before_payment_period']
    except ZeroDivisionError:
        df['hrs_per_day_active_before_payment_period']=0

    # create new feature, 'hrs_per_day_active_during_payment_period'
    try:
        df['hrs_per_day_active_during_payment_period'] = df['sum_hours_learning_during_payment_period'] / df['cnt_days_active_during_payment_period']   
    except ZeroDivisionError:
        df['hrs_per_day_active_during_payment_period']=0

    # -----------------------------------------------------------------------------------------------------------------------------------------------

    # drop columns that are no longer needed
    df = df.drop(['learner_cnt_other_courses_active',
                  'learner_cnt_other_courses_paid_active',
                  'learner_cnt_other_courses_items_completed',
                  'learner_cnt_other_courses_paid_items_completed',
                  'learner_cnt_other_transactions_past', 
                  'learner_other_revenue',
                 'cnt_enrollments_started_before_payment_period',
                 'cnt_enrollments_completed_before_payment_period',
                 'cnt_enrollments_active_before_payment_period',
                 'cnt_items_completed_before_payment_period',
                 'cnt_graded_items_completed_before_payment_period',
                 'cnt_enrollments_started_during_payment_period',
                 'cnt_enrollments_completed_during_payment_period',
                 'cnt_enrollments_active_during_payment_period',
                 'cnt_items_completed_during_payment_period',
                 'cnt_graded_items_completed_during_payment_period',
                 'sum_hours_learning_before_payment_period',
                 'sum_hours_learning_during_payment_period',
                 'cnt_days_active_before_payment_period',
                 'cnt_days_active_during_payment_period',
    ], axis=1)

    
    # define the columns to convert to boolean
    cols_to_convert = ['is_professional_certificate', 'is_gateway_certificate', 
                       'is_subscription_started_with_free_trial', 'is_active_capstone_during_pay_period']

    # write a function to convert columns into boolean
    def convert_to_boolean(df, col):
        df[col] = df[col].apply(lambda x: 1 if x > 0 else 0)

    # convert columns to boolean
    for col in cols_to_convert:
        convert_to_boolean(df, col)

    # map subscription period order of more than 4 into 4
    df['subscription_period_order'] = df['subscription_period_order'].apply(lambda x: 4 if x > 4 else x)

    # convert subscription period order to categorical
    df['subscription_period_order'] = df['subscription_period_order'].astype('category')

    # get categorical columns
    cat_cols = ['specialization_domain', 'subscription_period_order', 'learner_country_group', 'learner_gender']

    # get dummies for categorical columns
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    # fill null values with 0
    df = df.fillna(0)

    # replace inf with 0
    df = df.replace([np.inf, -np.inf], 0)

    # return the preprocessed dataframe
    return df

In [8]:
df = preprocess_data(data)
df.shape

(413953, 40)

In [9]:
import pickle
with open('Pickles//data.pickle', 'wb') as file:
    pickle.dump(df, file) 
print('Done!')

Done!


## Sample the Data
Since our dataset is relatively large, it takes very long to train a SVM model. So, I sample a smaller chunck of the data to select features.

In [27]:
n = 10000
df = df.sample(n, ignore_index=True)
df.shape

(10000, 40)

In [28]:
X = df.drop(columns=['is_retained'])
y = df['is_retained']

# # Split into train and validate sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
                                                  

In [84]:

clf = Pipeline([
 ("scaler", StandardScaler()),
 ("svm_clf", SVC(kernel="poly", degree=4, coef0=2, C=0.085))
    # ("svm_clf", SVC(kernel="rbf", gamma=0.5, C=20)) 
])

clf.fit(X_train, y_train)

print('Done SVC kernel poly')

yt = clf.predict(X_train)
print("Train Precision:", precision_score(yt, y_train))
y_pred = clf.predict(X_test)
print("Test Precision:", precision_score(y_test, y_pred))


Done SVC kernel poly
Train Precision: 0.8081717451523546
Test Precision: 0.6825817860300619


In [76]:

clf = Pipeline([
 ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", gamma=.0035, C=2)) 
])

clf.fit(X_train, y_train)

print('Done SVC kernel rbf')

yt = clf.predict(X_train)
print("Train Precision:", precision_score(yt, y_train))
y_pred = clf.predict(X_test)
print("Test Precision:", precision_score(y_test, y_pred))


Done SVC kernel rbf
Train Precision: 0.734533702677747
Test Precision: 0.7012522361359571


In [90]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
 DecisionTreeClassifier(), n_estimators=1000,
 max_samples=5000, bootstrap=False, n_jobs=-1)
bag_clf.fit(X_train, y_train)

yt = bag_clf.predict(X_train)
print("Train Precision:", precision_score(yt, y_train))
y_pred = bag_clf.predict(X_test)
print("Test Precision:", precision_score(y_test, y_pred))


Train Precision: 0.9997691597414589
Test Precision: 0.6942909760589319


## Model Building