In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the credit-standing CSV into a DataFrame.
# NOTE(review): absolute path, presumably a mounted Google Drive folder in
# Colab -- the file must exist at this location for the cell to run.
dataset = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/data_mining/credit_standing.csv"
)
# Bare last expression so the notebook renders a preview of the frame.
dataset

Unnamed: 0,Checking Acct,Credit Hist,Purpose,Savings Acct,Employment,Gender,Marital Status,Housing,Job,Telephone,Foreign,Age,Credit Standing
0,0Balance,Current,Small Appliance,Low,Short,M,Single,Own,Unskilled,Yes,Yes,23,Good
1,0Balance,Current,Furniture,MedLow,Unemployed,M,Divorced,Own,Skilled,Yes,Yes,32,Bad
2,No Acct,Bank Paid,Car New,Low,Long,M,Single,Own,Management,No,Yes,38,Bad
3,Low,Current,Furniture,Low,Short,M,Single,Own,Unskilled,Yes,Yes,36,Bad
4,Low,Delay,Education,MedLow,Medium,M,Single,Rent,Skilled,No,Yes,31,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,No Acct,Critical,Small Appliance,No Acct,Long,M,Single,Other,Skilled,Yes,Yes,35,Good
421,0Balance,Current,Furniture,No Acct,Long,M,Single,Own,Skilled,Yes,Yes,30,Bad
422,0Balance,Current,Car New,No Acct,Long,F,Divorced,Own,Skilled,Yes,Yes,28,Bad
423,0Balance,Current,Car New,Low,Short,F,Divorced,Own,Skilled,No,Yes,28,Bad


In [3]:
# Data cleaning: give every column a lowercase snake_case name.
# rename() returns a new frame, so `dataset` keeps its original headers.
df = dataset.rename(
    columns={
        "Checking Acct": "checking_acc",
        "Credit Hist": "credit_hist",
        "Purpose": "purpose",
        "Savings Acct": "savings_acc",
        "Employment": "employment",
        "Gender": "gender",
        "Marital Status": "marital_status",
        "Housing": "housing",
        "Job": "job",
        "Telephone": "telephone",
        "Foreign": "foreign",
        "Age": "age",
        "Credit Standing": "credit_standing",
    }
)

In [4]:
# Display the column labels of `df` to check the renamed headers.
df.columns

Index(['checking_acc', 'credit_hist', 'purpose', 'savings_acc', 'employment',
       'gender', 'marital_status', 'housing', 'job', 'telephone', 'foreign',
       'age', 'credit_standing'],
      dtype='object')

In [58]:
import random

#seed for the random rule draw so a fresh "Restart & Run All" reproduces the
#same rule and tables; set to None to draw a different rule on every run
RULE_SEED = 42

#get total number of instances in whole data
#(count() only counts rows whose credit_standing is not null)
all_total = df['credit_standing'].count()

#create rule: two randomly chosen antecedent columns -> credit_standing
#candidates are every column except the last one
lists = df.columns.tolist()[:-1]
#a private Random instance keeps the draw reproducible without touching the
#global random state used elsewhere
rule = random.Random(RULE_SEED).sample(lists, 2)
rule.append("credit_standing")

#generate combinations of unique value base from rules
#totals: number of rows for each observed combination of antecedent values
totals = df.groupby(rule[:-1]).size()
combinations = totals.index
#res: number of rows per (antecedent combination, credit_standing) pair;
#unstack(fill_value=0).stack() inserts an explicit 0 for pairs never observed
res = df.groupby(rule).size().unstack(fill_value=0).stack()
res

housing  marital_status  credit_standing
Other    Divorced        Bad                 8
                         Good                3
         Single          Bad                23
                         Good               18
Own      Divorced        Bad                52
                         Good               46
         Married         Bad                13
                         Good               17
         Single          Bad                66
                         Good               98
Rent     Divorced        Bad                31
                         Good               16
         Married         Bad                 4
                         Good                2
         Single          Bad                14
                         Good               14
dtype: int64

In [59]:
#create pandas sheet for values: one row per antecedent itemset
#tolist() reads the index/values in order without integer-key lookups, which
#would be treated as labels (not positions) when the first grouped column
#holds integers
data = pd.DataFrame({"Itemset": combinations.tolist(),
                     "Count": totals.tolist()
                     })

#solve for support and confidence
#column: the distinct consequent values (last element of the rule)
column = res.index.get_level_values(rule[-1]).unique()
#one column of counts per consequent value, rows aligned to the itemset order
#by label instead of relying on the positional interleaving of `res`
class_counts = res.unstack(rule[-1]).reindex(totals.index)
for label in column:
    data[label] = class_counts[label].to_numpy()
    #support: share of all instances that match itemset and consequent
    data["Support "+ label] = data[label]/all_total
    #confidence: share of the itemset's instances that have this consequent
    data["Confidence "+ label] = data[label]/data["Count"]

data

Unnamed: 0,Itemset,Count,Bad,Support Bad,Confidence Bad,Good,Support Good,Confidence Good
0,"(Other, Divorced)",11,8,0.018824,0.727273,3,0.007059,0.272727
1,"(Other, Single)",41,23,0.054118,0.560976,18,0.042353,0.439024
2,"(Own, Divorced)",98,52,0.122353,0.530612,46,0.108235,0.469388
3,"(Own, Married)",30,13,0.030588,0.433333,17,0.04,0.566667
4,"(Own, Single)",164,66,0.155294,0.402439,98,0.230588,0.597561
5,"(Rent, Divorced)",47,31,0.072941,0.659574,16,0.037647,0.340426
6,"(Rent, Married)",6,4,0.009412,0.666667,2,0.004706,0.333333
7,"(Rent, Single)",28,14,0.032941,0.5,14,0.032941,0.5


In [40]:
#generate findings
#uses `column` and `data` built by the support/confidence cell, so that cell
#must have been run (for the current rule) before this one
for label in column:
    item_y = str(label)
    supp_col = "Support " + label
    conf_col = "Confidence " + label

    #idxmax() returns an index *label*, so the row is fetched with .loc
    best_supp = data.loc[data[supp_col].idxmax()]
    best_conf = data.loc[data[conf_col].idxmax()]

    value_supp = best_supp[supp_col]
    itemset_supp = best_supp["Itemset"]
    value_conf = best_conf[conf_col]
    itemset_conf = best_conf["Itemset"]

    print(f"Highest support {item_y}: {value_supp}, Itemset: {itemset_supp}")
    #report which itemset achieves the top confidence, not only the value
    print(f"Highest confidence {item_y}: {value_conf}, Itemset: {itemset_conf}\n")


Highest support Divorced: 0.13176470588235295, Itemset: Short
Highest confidence Divorced: 0.6266666666666667

Highest support Married: 0.03294117647058824, Itemset: Short
Highest confidence Married: 0.14666666666666667

Highest support Single: 0.18588235294117647, Itemset: Long
Highest confidence Single: 0.7523809523809524

