In [1]:
import ast
import os
import sys
import time

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
project_home = os.path.join("../")
sys.path.append(project_home)
data_path = os.path.join(project_home, "data/merged_preprocessed.csv")

In [3]:
from lib.apriori import runApriori

In [4]:
df = pd.read_csv(data_path, encoding="latin1")

In [5]:
len(df)

375197

In [6]:
df.columns

Index(['college_name', 'levell', 'programme', 'discipline_group', 'discipline',
       'type', 'year', 'total_general_total', 'total_general_females',
       'total_backward_castes_total', 'total_backward_castes_females',
       'total_total_persons', 'total_total_females', 'pwd_general_total',
       'pwd_general_females', 'pwd_backward_castes_total',
       'pwd_backward_castes_females', 'pwd_total_persons', 'pwd_total_females',
       'muslim_minority_general_total', 'muslim_minority_general_females',
       'muslim_minority_backward_castes_total',
       'muslim_minority_backward_castes_females',
       'muslim_minority_total_persons', 'muslim_minority_total_females',
       'other_minority_general_total', 'other_minority_general_females',
       'other_minority_backward_castes_total',
       'other_minority_backward_castes_females',
       'other_minority_total_persons', 'other_minority_total_females', 'state',
       'city', 'speciality', 'girl_exclusive', 'student_hostel_availab

### Converting to required dataset format

In [7]:
columns_set_1 = ["levell", "programme", "total_general_total", "total_backward_castes_total", "pwd_total_persons", 
                "muslim_minority_total_persons", "other_minority_total_persons"
                ]
columns_set_2 = ["programme", "total_general_females", "total_backward_castes_females", "pwd_total_females",
                 "muslim_minority_total_females", "other_minority_total_females"
                ]

In [8]:
def binning(columns, df, bins=[(0,0.2), (0.2,0.5), (0.5,1.01)]):
    for column in columns:
        for bin_interval in bins:
            name = column + " [{}-{}]".format(bin_interval[0], bin_interval[1])
            df[name] = (df[column] >= bin_interval[0]) & (df[column] < bin_interval[1])
        df.drop(column, axis=1, inplace=True)
    
    return df

In [9]:
def one_hot_enc(column_name, df):
    d = pd.get_dummies(df[column_name])
    df = pd.concat([d, df], axis=1)
    df.drop(column_name, axis=1, inplace=True)
    return df

In [10]:
def convert_to_apriori_dataset(apriori_df, non_bool_cols = ["level", "programme"], 
                               bool_col_names=["general", "BC", "PWD", "muslim", "other"]):
    new_df = apriori_df[non_bool_cols]
    apriori_df.drop(columns=non_bool_cols, axis=1, inplace=True)
    new_col = apriori_df.apply(lambda x: apriori_df.columns[x].tolist(), axis=1)
    new_col = pd.DataFrame(new_col.tolist(), columns = bool_col_names)
    new_df = pd.concat([new_df, new_col], axis=1)
    
    return new_df

In [11]:
apriori_df = df[columns_set_1].copy()
apriori_df.columns = ["level", "programme", "general", "BC", "PWD", "muslim", "other"]

In [12]:
cols = ["general", "BC", "PWD", "muslim", "other"]
bins = [(0, 0.2), (0.2,0.5), (0.5,0.7), (0.7,1.01)]
apriori_df = binning(cols, apriori_df, bins)

In [13]:
apriori_df.head()

Unnamed: 0,level,programme,general [0-0.2],general [0.2-0.5],general [0.5-0.7],general [0.7-1.01],BC [0-0.2],BC [0.2-0.5],BC [0.5-0.7],BC [0.7-1.01],...,PWD [0.5-0.7],PWD [0.7-1.01],muslim [0-0.2],muslim [0.2-0.5],muslim [0.5-0.7],muslim [0.7-1.01],other [0-0.2],other [0.2-0.5],other [0.5-0.7],other [0.7-1.01]
0,Under Graduate,B.Sc.-Bachelor of Science,False,True,False,False,False,False,True,False,...,False,False,True,False,False,False,True,False,False,False
1,Under Graduate,B.Tech.-Bachelor of Technology,False,False,False,True,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
2,Under Graduate,B.C.A.-Bachelor of Computer Applications,False,False,False,True,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
3,Under Graduate,B.H.M.-Bachelor of Hotel Management,False,False,False,True,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
4,Under Graduate,B.P.T.-Bachelor of Physiotherapy,False,False,True,False,False,True,False,False,...,False,False,True,False,False,False,True,False,False,False


In [14]:
non_bool_cols = ["level", "programme", "state", "speciality"]
bool_col_names = ["general", "BC", "PWD", "muslim", "other"]

In [15]:
%%time
apriori_df = convert_to_apriori_dataset(apriori_df)

CPU times: user 29.5 s, sys: 24.9 ms, total: 29.5 s
Wall time: 29.6 s


In [16]:
extra_df = df[["state", "speciality", "girl_exclusive", "student_hostel_available"]].copy()

In [17]:
extra_df["girl_exclusive"] = extra_df["girl_exclusive"].apply(lambda x: "Girl_exclusive" if x else "Co-ed")
extra_df["student_hostel_available"] = extra_df["student_hostel_available"].apply(
    lambda x: "Hostel_available" if x else "No_hostel"
)
extra_df["speciality"] = extra_df["speciality"].replace({'0': "No speciality"})

In [18]:
apriori_df = pd.concat([apriori_df, extra_df], axis=1)
del extra_df

In [19]:
apriori_df

Unnamed: 0,level,programme,general,BC,PWD,muslim,other,state,speciality,girl_exclusive,student_hostel_available
0,Under Graduate,B.Sc.-Bachelor of Science,general [0.2-0.5],BC [0.5-0.7],PWD [0-0.2],muslim [0-0.2],other [0-0.2],Uttar Pradesh,No speciality,Co-ed,No_hostel
1,Under Graduate,B.Tech.-Bachelor of Technology,general [0.7-1.01],BC [0-0.2],PWD [0-0.2],muslim [0-0.2],other [0-0.2],West Bengal,Engineering & Technology,Co-ed,No_hostel
2,Under Graduate,B.C.A.-Bachelor of Computer Applications,general [0.7-1.01],BC [0-0.2],PWD [0-0.2],muslim [0-0.2],other [0-0.2],Gujarat,Computer Application,Co-ed,Hostel_available
3,Under Graduate,B.H.M.-Bachelor of Hotel Management,general [0.7-1.01],BC [0-0.2],PWD [0-0.2],muslim [0-0.2],other [0-0.2],West Bengal,No speciality,Co-ed,No_hostel
4,Under Graduate,B.P.T.-Bachelor of Physiotherapy,general [0.5-0.7],BC [0.2-0.5],PWD [0-0.2],muslim [0-0.2],other [0-0.2],Gujarat,Physiotherapy,Co-ed,Hostel_available
...,...,...,...,...,...,...,...,...,...,...,...
375192,Under Graduate,B.Sc.-Bachelor of Science,general [0-0.2],BC [0.7-1.01],PWD [0-0.2],muslim [0-0.2],other [0-0.2],Uttar Pradesh,No speciality,Co-ed,No_hostel
375193,Under Graduate,M.B.B.S.-Bachelor of Medicine and Bachelor of ...,general [0.5-0.7],BC [0.2-0.5],PWD [0-0.2],muslim [0-0.2],other [0-0.2],Haryana,Medical-Allopathy,Co-ed,Hostel_available
375194,Post Graduate,M.D.-Doctor of Medicine,general [0-0.2],BC [0-0.2],PWD [0-0.2],muslim [0-0.2],other [0-0.2],Haryana,Medical-Allopathy,Co-ed,Hostel_available
375195,Under Graduate,B.A.-Bachelor of Arts,general [0.7-1.01],BC [0-0.2],PWD [0-0.2],muslim [0-0.2],other [0-0.2],Uttar Pradesh,No speciality,Co-ed,No_hostel


### Frequent pattern and rule gen

In [None]:
def data_generator(df):
    for idx in range(len(df)):
        record = frozenset(df.iloc[idx])
        yield record


In [None]:
data = data_generator(apriori_df)

In [None]:
data

In [None]:
%%time
items, rules = runApriori(data, minSupport=0.15, minConfidence=0.6)

In [None]:
sorted(items, key=lambda x: x[1], reverse=True)

In [None]:
sorted(rules, key=lambda x: x[1], reverse=True)

In [None]:
rules_list = []
for rule in rules:
    rules_list.append({
        "precedent": rule[0][0],
        "antecendent": rule[0][1],
        "confidence": rule[1]
    })
rules_df = pd.DataFrame(rules_list)

In [None]:
rules_df.to_csv("../results/apriori_latest.csv", index=False)

### Lift calculation

In [20]:
full_set = apriori_df.values

In [21]:
super_set = []
for row in full_set:
    super_set.append(set(row))

In [22]:
del full_set

In [23]:
super_set

[{'B.Sc.-Bachelor of Science',
  'BC [0.5-0.7]',
  'Co-ed',
  'No speciality',
  'No_hostel',
  'PWD [0-0.2]',
  'Under Graduate',
  'Uttar Pradesh',
  'general [0.2-0.5]',
  'muslim [0-0.2]',
  'other [0-0.2]'},
 {'B.Tech.-Bachelor of Technology',
  'BC [0-0.2]',
  'Co-ed',
  'Engineering & Technology',
  'No_hostel',
  'PWD [0-0.2]',
  'Under Graduate',
  'West Bengal',
  'general [0.7-1.01]',
  'muslim [0-0.2]',
  'other [0-0.2]'},
 {'B.C.A.-Bachelor of Computer Applications',
  'BC [0-0.2]',
  'Co-ed',
  'Computer Application',
  'Gujarat',
  'Hostel_available',
  'PWD [0-0.2]',
  'Under Graduate',
  'general [0.7-1.01]',
  'muslim [0-0.2]',
  'other [0-0.2]'},
 {'B.H.M.-Bachelor of Hotel Management',
  'BC [0-0.2]',
  'Co-ed',
  'No speciality',
  'No_hostel',
  'PWD [0-0.2]',
  'Under Graduate',
  'West Bengal',
  'general [0.7-1.01]',
  'muslim [0-0.2]',
  'other [0-0.2]'},
 {'B.P.T.-Bachelor of Physiotherapy',
  'BC [0.2-0.5]',
  'Co-ed',
  'Gujarat',
  'Hostel_available',
  'P

In [None]:
## Run this is running only for lift calc on stored results
results = pd.read_csv("../results/apriori_latest.csv")

In [None]:
## If results exist as a variable in notebook
results = rules_df

In [None]:
results.columns = ["antecedent", "consequent", "confidence"]

In [None]:
set(list(ast.literal_eval(results["antecedent"][0]))).issubset(super_set[1])

In [None]:
def count_subsets(super_set, ant):
    c=0
    for s in super_set:
        if ant.issubset(s):
            c += 1
    return c

In [None]:
results["lift"] = None

In [None]:
def calc_lift(results, super_set):
    lift = []
    for row in tqdm(results.itertuples()):
        ant = set(list(ast.literal_eval(row.antecedent)))
        c = count_subsets(super_set, ant)
        lift.append(row.confidence / c)

In [None]:
lift = []
for row in tqdm(results.itertuples()):
    ant = set(list(ast.literal_eval(row.antecedent)))
    c = count_subsets(super_set, ant)
    lift.append(row.confidence / c)

In [None]:
results["lift"] = lift

In [None]:
results["lift"] *= len(df)

In [None]:
results.to_csv("../results/apriori_with_lift.csv", index=False)

### Verify results using mlxtend library

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
columns_set_1 = ["levell", "programme", "total_general_total", "total_backward_castes_total", "pwd_total_persons", 
                "muslim_minority_total_persons", "other_minority_total_persons",
                 "state", "speciality", "girl_exclusive", "student_hostel_available"
                ]

In [None]:
apriori_df = df[columns_set_1].copy()
apriori_df.columns = ["level", "programme", "general", "BC", "PWD", "muslim", "other",
                     "state", "speciality", "girl_exclusive", "student_hostel_available"]

In [None]:
# Only run this when using mlxtend library :
apriori_df = one_hot_enc("programme", apriori_df)
apriori_df = one_hot_enc("level", apriori_df)
apriori_df = one_hot_enc("state", apriori_df)
apriori_df = one_hot_enc("speciality", apriori_df)
apriori_df = one_hot_enc("girl_exclusive", apriori_df)
apriori_df = one_hot_enc("student_hostel_available", apriori_df)

In [None]:
cols = ["general", "BC", "PWD", "muslim", "other"]
bins = [(0, 0.2), (0.2,0.5), (0.5,0.7), (0.7,1.01)]
apriori_df = binning(cols, apriori_df, bins)

In [None]:
%%time
frequent_patterns = apriori(apriori_df, min_support=0.15, use_colnames=True)

In [None]:
frequent_patterns.sort_values(by="support", ascending=False)

In [None]:
%%time
assoc_rules = association_rules(frequent_patterns, metric="confidence", min_threshold=0.6)

In [None]:
assoc_rules

In [None]:
rules.to_csv("../association_rules_sup_0.1_conf_0.6.csv", index=False)