# Market Basket analysis

### Author: Rasmus Davidsen
### Copenhagen, 02-12-2022


### Steps 
1) import and preprocess data 

2) profile data 

4) split data 

3) define and train pipeline


5) 

In [1]:
# for dataframes. options is set so the entire table can be inspected
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 7]

In [2]:
# import semicolon separated .csv file and inspect the shape
df = pd.read_csv('data/case_data_set.csv', delimiter=";")
df.shape

(5374, 21)

In [5]:
# the categorical features to the model
categorical_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
            'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
            'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'Churn']

df = df[categorical_cols]
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,Churn
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,No
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,No
2,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,No
3,Male,0,No,Yes,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,No
4,Female,0,No,No,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,No


In [6]:
#import pandas as pd

df_enc = pd.get_dummies(df)
df_enc.head()

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,Churn_No,Churn_Yes
0,0,1,0,0,1,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0
1,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0
2,0,0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,1,0
3,0,0,1,1,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0
4,0,1,0,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0


In [115]:
from mlxtend.frequent_patterns import apriori, association_rules

itemsets = apriori(df_enc, min_support=0.4, use_colnames=True, max_len = 10)

In [116]:
rules = association_rules(itemsets, metric='lift', min_threshold = 1).sort_values('lift', ascending=False).reset_index(drop=True)

rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Dependents_No),(Partner_No),0.661705,0.477484,0.422962,0.639201,1.338686,0.107009,1.44822
1,(Partner_No),(Dependents_No),0.477484,0.661705,0.422962,0.885814,1.338686,0.107009,2.962683
2,(Dependents_No),"(Partner_No, Churn_No)",0.661705,0.454224,0.401377,0.60658,1.335421,0.100815,1.387262
3,"(Partner_No, Churn_No)",(Dependents_No),0.454224,0.661705,0.401377,0.883654,1.335421,0.100815,2.907674
4,"(Dependents_No, Churn_No)",(Partner_No),0.630815,0.477484,0.401377,0.636283,1.332574,0.100173,1.436601


In [117]:
fname = 'Churn_No'
contain_values = rules[rules['consequents'].astype(str).str.contains(fname)]


In [118]:
contain_values

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(Dependents_No),"(Partner_No, Churn_No)",0.661705,0.454224,0.401377,0.60658,1.335421,0.100815,1.387262
5,(Partner_No),"(Dependents_No, Churn_No)",0.477484,0.630815,0.401377,0.840608,1.332574,0.100173,2.316207
6,(MultipleLines_No),"(PhoneService_Yes, Churn_No)",0.488277,0.86751,0.472832,0.968369,1.116262,0.049247,4.188592
12,(PhoneService_Yes),"(MultipleLines_No, Churn_No)",0.901563,0.472832,0.472832,0.524458,1.109185,0.046544,1.108563
14,(Partner_Yes),"(PhoneService_Yes, Churn_No)",0.522516,0.86751,0.462039,0.884259,1.019307,0.008752,1.144712
16,"(PhoneService_Yes, PaperlessBilling_No)",(Churn_No),0.411239,0.962784,0.401935,0.977376,1.015156,0.006001,1.644957
19,(PaperlessBilling_No),(Churn_No),0.457574,0.962784,0.447153,0.977227,1.015001,0.006609,1.634191
21,(PaperlessBilling_No),"(PhoneService_Yes, Churn_No)",0.457574,0.86751,0.401935,0.878406,1.01256,0.004986,1.089606
22,"(PhoneService_Yes, Partner_Yes)",(Churn_No),0.474693,0.962784,0.462039,0.973344,1.010968,0.005013,1.396156
25,(Partner_Yes),(Churn_No),0.522516,0.962784,0.50856,0.973291,1.010913,0.00549,1.393376


1.3.4


In [43]:
import pandas as pd

#create sample data
data = {'model': ['Lisa', 'Lisa 2', 'Macintosh 128K', 'Macintosh 512K'],
        'launched': [1983,1984,1984,1984],
        'discontinued': [1986, 1985, 1984, 1986]}

df = pd.DataFrame(data, columns = ['model', 'launched', 'discontinued'])
df.dtypes

model           object
launched         int64
discontinued     int64
dtype: object

In [44]:
df[df['model'].str.contains('Mac')]

Unnamed: 0,model,launched,discontinued
2,Macintosh 128K,1984,1984
3,Macintosh 512K,1984,1986


### on dataframe with only churn yes