In [1]:
def telco_churn(df):
    '''Return a model-ready version of the telco dataframe.

    The listed columns are dropped, the selected categorical columns are
    replaced by dummy variables (the first level of each is dropped), and the
    dummy columns are renamed to make them easier to read.

    The dataframe passed in is left untouched, so the function can be called
    again on the same input; a new dataframe is returned.

    Raises KeyError if any of the columns named below is missing from df.'''

    drop_cols = ['Unnamed: 0', 'payment_type_id', 'contract_type_id', 'gender',
                 'senior_citizen', 'partner', 'dependents', 'phone_service', 'multiple_lines',
                 'streaming_tv', 'streaming_movies', 'contract_type', 'payment_type',
                 'internet_service_type_id', 'paperless_billing']

    dummy_cols = ['online_security', 'online_backup', 'device_protection',
                  'tech_support', 'churn', 'internet_service_type']

    readable_names = {'online_security_No internet service': 'online_security_NA',
                      'online_backup_No internet service': 'online_backup_NA',
                      'device_protection_No internet service': 'device_prot_NA',
                      'tech_support_No internet service': 'tech_support_NA',
                      'internet_service_type_Fiber optic': 'has_fiber',
                      'internet_service_type_None': 'internet_service_type_NA'}

    # drop() without inplace builds a new frame instead of editing the caller's
    trimmed = df.drop(columns = drop_cols)

    dummies = pd.get_dummies(trimmed[dummy_cols], drop_first = True)

    # remaining original columns first, dummy columns appended after them
    encoded = pd.concat([trimmed.drop(columns = dummy_cols), dummies], axis = 1)

    return encoded.rename(columns = readable_names)



In [2]:
def cleanup(df):
    '''Return a cleaned copy of df with a numeric total_charges column.

    Whitespace is stripped from total_charges, rows whose value is then an
    empty string (non-values that are not nulls) are removed, the remaining
    values are converted from object to float to make them readable for the
    model, and all column names are made lower case for uniformity.

    The dataframe passed in is not modified. Rows that are kept retain their
    original index labels.

    Raises ValueError if a remaining total_charges value cannot be converted
    to float.'''

    stripped = df.total_charges.str.strip()
    has_value = stripped != ""

    # .copy() makes an independent frame, so the assignments below neither
    # change the caller's df nor trigger SettingWithCopyWarning
    cleaned = df.loc[has_value].copy()

    cleaned['total_charges'] = stripped[has_value].astype(float)

    cleaned.columns = cleaned.columns.str.lower()

    return cleaned

In [3]:
def log_reg(df):
    '''Fit a class-weighted logistic regression on the training split and print
    its coefficients, intercept, training accuracy, confusion matrix and
    classification report.

    NOTE(review): the df argument is not used. The model is fit on a dataframe
    named train that is looked up from the surrounding notebook scope and must
    contain a churn_yes column; confirm whether df was meant to take its place.

    Nothing is returned; all results are printed.'''

    X_train_LR = train.drop(columns=['churn_yes'])
    y_train_LR = train.churn_yes

    # from sklearn.linear_model import LogisticRegression
    # class 1 of churn_yes is weighted 99 to 1 against class 0
    logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')

    logit.fit(X_train_LR, y_train_LR)

    print('Coefficient: \n', logit.coef_)
    print('Intercept: \n', logit.intercept_)

    y_pred = logit.predict(X_train_LR)

    print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train_LR, y_train_LR)))

    print(confusion_matrix(y_train_LR, y_pred))

    # score the predictions against the same labels they were made for
    print(classification_report(y_train_LR, y_pred))


In [4]:
def model_report():
    '''Return a three-row dataframe of fixed Accuracy and Precision figures,
    one row per model (the numbers are hard-coded here, not computed)'''

    # one (Model, Accuracy, Precision) tuple per model
    # NOTE(review): figures look like whole-number percentages - confirm
    rows = [('Decision Tree', 79, 81),
            ('Random Forest', 79, 80),
            ('Logistic Regression', 29, 99)]

    return pd.DataFrame(rows, columns=['Model', 'Accuracy', 'Precision'])

In [5]:
def get_tree(X_train, X_validate, y_train, y_validate):
    '''Fit a decision tree (max_depth=3, random_state=123) on the training data
    and print its score on the train and validate data'''

    # build the classifier and fit it in one step; fit() returns the estimator
    tree = DecisionTreeClassifier(max_depth=3, random_state=123).fit(X_train, y_train)

    # score on the data the tree was fit on
    train_accuracy = tree.score(X_train, y_train)
    print(f"Accuracy of Decision Tree on train data is {train_accuracy}")

    # score on the held-out validate data
    validate_accuracy = tree.score(X_validate, y_validate)
    print(f"Accuracy of Decision Tree on validate data is {validate_accuracy}")

In [None]:
def get_forest(X_train_rf, X_validate_rf, y_train_rf, y_validate_rf):
    '''Fit a random forest (max_depth=3, random_state=123) on the training data
    and print its score on the train and validate data'''

    # configure the forest, then fit it to the training data
    forest = RandomForestClassifier(random_state=123, max_depth=3)
    forest.fit(X_train_rf, y_train_rf)

    # score on the data the forest was fit on
    train_accuracy = forest.score(X_train_rf, y_train_rf)
    print(f"Accuracy of Random Forest on train is {train_accuracy}")

    # score on the held-out validate data
    validate_accuracy = forest.score(X_validate_rf, y_validate_rf)
    print(f"Accuracy of Random Forest on validate is {validate_accuracy}")


In [None]:
def get_reg(X_train_lr, X_validate_lr, y_train_lr, y_validate_lr):
    '''Fit a logistic regression (liblinear solver) on the training data and
    print its score on the train and validate data'''

    # configure the model, then fit it to the training data
    regression = LogisticRegression(solver='liblinear')
    regression.fit(X_train_lr, y_train_lr)

    # score on the data the model was fit on
    train_accuracy = regression.score(X_train_lr, y_train_lr)
    print(f"Accuracy of Logistic Regression on train is {train_accuracy}")

    # score on the held-out validate data
    validate_accuracy = regression.score(X_validate_lr, y_validate_lr)
    print(f"Accuracy of Logistic Regression on validate is {validate_accuracy}")