In [81]:
import numpy as np
import pandas as pd

In [82]:
df = pd.read_csv("udel-churn-train.csv")
# overview of data
df.head()

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,6890,7470-DYNOE,Male,0.0,No,No,53.0,Yes,No,DSL,...,Yes,No,No,Yes,One year,Yes,Electronic check,61.1,3357.9,No
1,353,0219-YTZUE,Male,0.0,Yes,Yes,4.0,Yes,Yes,Fiber optic,...,Yes,No,No,No,Month-to-month,Yes,Bank transfer (automatic),84.8,371.9,Yes
2,2712,5133-VRSAB,Male,0.0,No,No,8.0,No,No phone service,DSL,...,No,No,No,No,Month-to-month,No,Mailed check,29.35,216.45,No
3,6253,6128-DAFVY,Female,0.0,No,No,56.0,No,No phone service,DSL,...,No,No,Yes,Yes,Month-to-month,Yes,Credit card (automatic),44.85,2564.95,No
4,4080,6199-IPCAO,Female,0.0,Yes,Yes,29.0,Yes,Yes,No,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,26.1,692.55,No


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5634 non-null   int64  
 1   customerID        5634 non-null   object 
 2   gender            5589 non-null   object 
 3   SeniorCitizen     5573 non-null   float64
 4   Partner           5594 non-null   object 
 5   Dependents        5594 non-null   object 
 6   tenure            5544 non-null   float64
 7   PhoneService      5550 non-null   object 
 8   MultipleLines     5580 non-null   object 
 9   InternetService   5486 non-null   object 
 10  OnlineSecurity    5564 non-null   object 
 11  OnlineBackup      5528 non-null   object 
 12  DeviceProtection  5564 non-null   object 
 13  TechSupport       5564 non-null   object 
 14  StreamingTV       5529 non-null   object 
 15  StreamingMovies   5578 non-null   object 
 16  Contract          5555 non-null   object 


In [84]:
df['TotalCharges']=pd.to_numeric(df['TotalCharges'], errors='coerce')

In [85]:
df_num = df.select_dtypes(exclude ='object') 
df_cat = df.select_dtypes(include ='object') 

num_features = df_num.columns.tolist()
cat_features = df_cat.columns.tolist()

num_features.remove('SeniorCitizen')
cat_features.append('SeniorCitizen')


num_features.remove('Unnamed: 0')
cat_features.remove('customerID')
cat_features.remove('Churn')

print(num_features)
print(cat_features)

['tenure', 'MonthlyCharges', 'TotalCharges']
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']


In [86]:
def find_outlier_iqr(X):
    # calculate IQR
    q1 = X.quantile(0.25)
    q3 = X.quantile(0.75)
    iqr = q3 - q1
    
    outlier_index = ((X < (q1 - 1.5 * iqr)) | (X > (q3 + 1.5 * iqr))).any(axis=1)
    
    return outlier_index

In [87]:
outlier_index = find_outlier_iqr(df[num_features])
len_df_before = len(df)
df = df[~outlier_index] # note the negation ~
len_df_after = len(df)
print(f'{len_df_before-len_df_after} outliers detected and removed!')

16 outliers detected and removed!


In [88]:
X = df.drop(['Churn'], axis=1)
y = df['Churn']


In [89]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [90]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Create the preprocessing pipeline for numerical features
num_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
        ]
)

# Create the preprocessing pipelines for the categorical features
cat_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder()),
    ]
)

# Combine two pipelines to form the preprocessor
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)

In [91]:
from sklearn.linear_model import LogisticRegression

log_reg_clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('log_reg', LogisticRegression(max_iter=1000)),
    ]
)


In [92]:
from sklearn.tree import DecisionTreeClassifier

tree_clf_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('tree_clf', DecisionTreeClassifier()),
    ]
)


In [93]:
from sklearn.model_selection import GridSearchCV


param_grid = [
    {
      
        'tree_clf__criterion': ['gini', 'entropy'], 
        'tree_clf__max_depth': [2, 3, 4, 5, 6, 7],
    }
]

# set up the grid search 
grid_search = GridSearchCV(tree_clf_model, param_grid, cv=10,
                          scoring='accuracy',
                          return_train_score=True)


In [94]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('num_imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                      

In [95]:
grid_search.best_params_

{'tree_clf__criterion': 'gini', 'tree_clf__max_depth': 7}

In [96]:
tree_clf_best = grid_search.best_estimator_

In [98]:
from sklearn.svm import SVC
svm_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('svm', SVC()),
    ]
    )

In [99]:
from sklearn.model_selection import GridSearchCV


param_grid_2 = [
    {
      
        'svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], 
        'svm__degree': [2, 3, 4, 5],
        'svm__C': [1, 10, 100],
    }
]

# set up the grid search 
grid_search_2 = GridSearchCV(svm_clf, param_grid_2, cv=10,
                          scoring='accuracy',
                          return_train_score=True)

In [100]:
grid_search_2.fit(X_train, y_train)

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 m

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 m

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 m

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 m

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4044x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 m

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.

ValueError: Precomputed matrix must be a square matrix. Input is a 4045x46 matrix.



GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('num_imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                      

In [101]:
grid_search_2.best_params_


{'svm__C': 10, 'svm__degree': 2, 'svm__kernel': 'poly'}

In [102]:
svm_best=grid_search_2.best_estimator_


In [111]:

log_reg_clf.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('num_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                  

In [114]:
# predict
y_pred_tree=tree_clf_best.predict(X_test)
y_pred_log_reg = log_reg_clf.predict(X_test)
y_pred_svm_reg = svm_best.predict(X_test)


from sklearn.metrics import accuracy_score



print(f'Accuracy Score Tree : {accuracy_score(y_test,y_pred_tree)}')
print(f'Accuracy Score Log : {accuracy_score(y_test,y_pred_log_reg)}')
print(f'Accuracy Score SVM : {accuracy_score(y_test,y_pred_svm_reg)}')

Accuracy Score Tree : 0.7864768683274022
Accuracy Score Log : 0.7927046263345195
Accuracy Score SVM : 0.7820284697508897


In [113]:
churn_test = pd.read_csv('udel-churn-test.csv')

y_pred_churn = log_reg_clf.predict(churn_test)

churn_submit = pd.DataFrame({
    'customerID': churn_test['customerID'],
    'Churn': y_pred_churn
})

# generate the csv
churn_submit.to_csv('churn-submit.csv', index=False)