# Market Basket analysis

### Author: Rasmus Davidsen
### Copenhagen, 02-12-2022


### Steps 
1) import and preprocess data 

2) profile data 

4) split data 

3) define and train pipeline


5) 

In [204]:
# for dataframes. options is set so the entire table can be inspected
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 7]

In [205]:
# import semicolon separated .csv file and inspect the shape
df = pd.read_csv('data/case_data_set.csv', delimiter=";")
df.shape

(5374, 21)

In [206]:
# the categorical features to the model
categorical_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
            'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
            'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'Churn']

df = df[categorical_cols]

#df = df[df['Churn']=='Yes']
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,Churn
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,No
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,No
2,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,No
3,Male,0,No,Yes,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,No
4,Female,0,No,No,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,No


In [207]:
#import pandas as pd

df_enc = pd.get_dummies(df)
df_enc.head()

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,Churn_No,Churn_Yes
0,0,1,0,0,1,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0
1,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0
2,0,0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,1,0
3,0,0,1,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0
4,0,1,0,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0


In [208]:
from mlxtend.frequent_patterns import apriori, association_rules

itemsets = apriori(df_enc, min_support=0.05, use_colnames=True, max_len = 2)

In [209]:
rules = association_rules(itemsets, metric='lift', min_threshold = 1).sort_values('lift', ascending=False).reset_index(drop=True)


rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(PhoneService_No),(MultipleLines_No phone service),0.098437,0.098437,0.098437,1.0,10.15879,0.088747,inf
1,(MultipleLines_No phone service),(PhoneService_No),0.098437,0.098437,0.098437,1.0,10.15879,0.088747,inf
2,(InternetService_No),(DeviceProtection_No internet service),0.264421,0.264421,0.264421,1.0,3.781844,0.194503,inf
3,(OnlineSecurity_No internet service),(DeviceProtection_No internet service),0.264421,0.264421,0.264421,1.0,3.781844,0.194503,inf
4,(StreamingTV_No internet service),(TechSupport_No internet service),0.264421,0.264421,0.264421,1.0,3.781844,0.194503,inf


In [210]:
fname = 'Churn'
contain_values = rules[rules['consequents'].astype(str).str.contains(fname)]

In [211]:
contain_values

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
512,(Contract_Two year),(Churn_No),0.307778,0.962784,0.306476,0.995768,1.034259,0.010152,8.793663
520,(StreamingMovies_No internet service),(Churn_No),0.264421,0.962784,0.262933,0.99437,1.032807,0.008352,6.610532
521,(DeviceProtection_No internet service),(Churn_No),0.264421,0.962784,0.262933,0.99437,1.032807,0.008352,6.610532
523,(InternetService_No),(Churn_No),0.264421,0.962784,0.262933,0.99437,1.032807,0.008352,6.610532
525,(TechSupport_No internet service),(Churn_No),0.264421,0.962784,0.262933,0.99437,1.032807,0.008352,6.610532
526,(StreamingTV_No internet service),(Churn_No),0.264421,0.962784,0.262933,0.99437,1.032807,0.008352,6.610532
527,(OnlineBackup_No internet service),(Churn_No),0.264421,0.962784,0.262933,0.99437,1.032807,0.008352,6.610532
529,(OnlineSecurity_No internet service),(Churn_No),0.264421,0.962784,0.262933,0.99437,1.032807,0.008352,6.610532
538,(Contract_One year),(Churn_No),0.24693,0.962784,0.243208,0.984928,1.023001,0.005468,2.469297
550,(Dependents_Yes),(Churn_No),0.338295,0.962784,0.331969,0.981298,1.01923,0.006263,1.989974


In [215]:
df['Churn'].value_counts()

No     5174
Yes     200
Name: Churn, dtype: int64

In [222]:
base_conf = 1-200/5174
base_conf

0.9613451874758407

In [229]:
(0.995768/base_conf)

1.0358069223964617