In [None]:
!pip install pymysql
!pip install apyori
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install python-dotenv
!pip install pydot
!pip install kmodes

In [None]:
# Imported Libraries
import pandas as pd
import pymysql
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from apyori import apriori
# Imports for the neural network
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.linear_model import LogisticRegression
import pickle

#Imports for Clustering
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score


# Connection to the client database.
# The settings are read from environment variables so that the password is never stored
# in the notebook. Host, port, database and user fall back to the project defaults;
# the password has no fallback and must be supplied through DB_PASSWORD.
# NOTE(review): a password was previously committed in this cell - it should be rotated.
host = os.environ.get("DB_HOST", "qut-ds.c2m09j1oykve.ap-southeast-2.rds.amazonaws.com")
port = int(os.environ.get("DB_PORT", "4005"))
dbname = os.environ.get("DB_NAME", "qut_ds1")
user = os.environ.get("DB_USER", "qut_ds1")
password = os.environ.get("DB_PASSWORD")
if not password:
    raise RuntimeError("Set the DB_PASSWORD environment variable before running this notebook.")
conn = pymysql.connect(host=host, user=user, port=port, password=password, db=dbname)


print(conn)

### Getting customer (user) details - build the DataFrame

In [None]:
### Customer personal details
# Builds df_user from the consumer table, restricted to rows with u_stage = 'li_managed'
# and a non-null NDIS number. It is LEFT-joined to HH_member (NDIS number = membership number)
# and to the member-region table (SA1-SA4 codes), so member and region columns may be null.
# NOTE(review): a consumer may appear on several rows if a joined table holds more than one match - TODO confirm.

# member_key is selected as well; the plan data is merged on it later in the notebook.
df_user= pd.read_sql("select m.id as member_id, m.membership_number, m.status, m.price_zone_code, m.member_key, s.u_ndis_number, s.u_disabilities, s.u_gender, s.u_date_of_birth, r.SA1, r.SA2, r.SA3,r.SA4  from SNOW_csm_consumer_user s left join  HH_member m  on s.u_ndis_number = m.membership_number left join libe_leapinprod_memberregion r on r.MemberId = s.u_leapin_id where s.u_stage = 'li_managed' and s.u_ndis_number is not null;", con=conn)

df_user.info()
# NOTE(review): this preview shows personal data (NDIS number, gender, date of birth); clear the output before sharing.
df_user.head(50)


In [None]:
# Frequency of each raw disability label in the user table.
df_user['u_disabilities'].value_counts()

### Getting claim and invoice details - build the DataFrames

In [None]:
# Load every claim: id (renamed claim_id), the invoice it belongs to, state, risk level and start date.

df_claims= pd.read_sql("SELECT c.id as claim_id, c.invoice_id, c.state, c.risk_level, c.start_date FROM HH_claim c;", con=conn)
df_claims.info()

df_claims.head(10)


In [None]:
# Load every invoice: id (renamed invoice_id), owning member, invoiced total, funded total and funded date.

df_invoices= pd.read_sql("SELECT i.id as invoice_id, i.member_id, i.invoice_total, i.funded_total, i.funded_date FROM HH_invoice i;", con=conn)
df_invoices.info()
df_invoices.head(10)

In [None]:
# Attach invoice details to every claim. The left join keeps all claims, so the result
# has one row per claim and the invoice columns repeat for each claim on the same invoice.
df_invoice_claim = df_claims.merge(df_invoices, how="left", on="invoice_id")

df_invoice_claim.info()
df_invoice_claim.head()

In [None]:

# Attach the member attributes (disability type, gender, region codes, ...) to every claim row.
df = df_invoice_claim.merge(df_user, how="left", on="member_id")
# Independent snapshot of this claim-level frame, kept for the clustering section.
df_for_cluster = df.copy()
df

In [None]:
# Total invoiced and funded amounts per member (keyed by membership_number).
# `df` holds one row per claim and every claim row repeats the totals of its invoice,
# so each invoice is first reduced to a single row per member. Without that step an
# invoice with several claims would be added to the member's totals several times.
df1 = (
    df.drop_duplicates(subset=["membership_number", "invoice_id"])
    .groupby("membership_number")
    .agg({'invoice_total': 'sum', 'funded_total': 'sum'})
    .reset_index()
)

df1.info()
df1


In [None]:
# Amount left unreimbursed per member: invoiced total minus funded total.
df1['subtraction'] = df1['invoice_total'].sub(df1['funded_total'])
df1

In [None]:
# Bring the member attributes back onto the per-member totals.
# NOTE(review): df_user is not de-duplicated here, so a membership number that occurs
# on several df_user rows yields several rows in df2 - confirm.
df2 = df1.merge(df_user, how="left", on="membership_number")
df2.info()


In [None]:
# Preview the merged per-member frame.
df2.head(20)

In [None]:
# Keep only the columns used by the rest of the analysis.
# A list is used instead of a set: recent pandas versions reject a set as an indexer,
# and a list also gives a deterministic column order.
cols_of_interest = ['membership_number', 'invoice_total', 'funded_total', 'subtraction', 'status', 'price_zone_code',
                    'u_disabilities', 'u_gender']
df2 = df2[cols_of_interest]
df2.info()

In [None]:
# Put the columns into a fixed, readable order: identifier, amounts, then member attributes.
df2 = df2.loc[:, ['membership_number', 'invoice_total', 'funded_total', 'subtraction',
                  'u_disabilities', 'u_gender', 'price_zone_code', 'status']]

df2.head(20)

In [None]:
# Check the distribution of disability labels before cleaning them.

df2['u_disabilities'].value_counts()

In [None]:
# Empty-string disability labels become the explicit category 'others'.
df2['u_disabilities'] = df2['u_disabilities'].replace('', 'others')

df2['u_disabilities'].value_counts()

In [None]:
# Collapse every category other than 'others' into the single label "Intelectual".
# The result is assigned back to the column: `mask(..., inplace=True)` called on
# `df2['u_disabilities']` acts on a selected Series and is not guaranteed to update df2
# (it does not under pandas Copy-on-Write).
# The spelling "Intelectual" is kept because the clustering section maps this exact label.
# NOTE(review): null labels also compare unequal to 'others', so they are relabelled too - confirm this is intended.
df2['u_disabilities'] = df2['u_disabilities'].mask(df2['u_disabilities'] != 'others', "Intelectual")

In [None]:
# Label distribution after the collapse above.
df2['u_disabilities'].value_counts()

In [None]:
# Column dtypes and non-null counts of df2.
df2.info()

In [None]:
# Explore the size of the unreimbursed amounts: members whose invoiced-minus-funded
# gap is above 1,000 (sub_0) and above 10,000 (sub_1). Only sub_1 is displayed.
sub_0 = df2.loc[df2['subtraction'].gt(1000)]
sub_1 = df2.loc[df2['subtraction'].gt(10000)]
sub_1


In [None]:
# Modifications to df2.
# Work on an independent copy so the cleaning steps below do not silently modify df2
# (a plain `cleaned_df2 = df2` would only create a second name for the same frame).
cleaned_df2 = df2.copy()

# gender: blank AND null values become "Unknown" --> Male/Female/Other/Unknown
cleaned_df2["u_gender"] = cleaned_df2["u_gender"].replace("", "Unknown").fillna("Unknown")

In [None]:
# status: not used in the analysis, remove it.
cleaned_df2 = cleaned_df2.drop("status", axis=1)

In [None]:
# price_zone_code: removed here; location is represented through SA4 instead.
cleaned_df2 = cleaned_df2.drop("price_zone_code", axis=1)
# Minimal lookup of membership number -> SA4 code.
df_user_min = df_user.loc[:, ["membership_number", "SA4"]]

In [None]:
# Remove the rows whose SA4 code is 0.0 (only SA4 is checked here).
blankIndices = df_user_min.index[df_user_min["SA4"].eq(0.0)]
df_user_min = df_user_min.drop(index=blankIndices)

In [None]:
# Drop rows with a null in either column, then drop rows that are exact duplicates
# (same membership_number and same SA4). A membership number with two different SA4
# codes therefore still keeps both rows.
df_user_min = df_user_min.dropna().drop_duplicates()

In [None]:
# Load the SA4 reference table used to turn SA4 codes into region names.
# The file is read with a tab separator; the next cell splits a single comma-joined
# column, which suggests each line arrives here as one column - TODO confirm.
df_sa4 = pd.read_csv("./TableDump/SA4_2016.csv",sep='\t')
df_sa4

In [None]:
# Split the single comma-joined column into its seven named columns.
# The original combined column is kept alongside the new ones.

df_sa4[['SA4_CODE_2016','SA4_NAME_2016','GCCSA_CODE_2016','GCCSA_NAME_2016','STATE_CODE_2016','STATE_NAME_2016','AREA_ALBERS_SQKM']] = df_sa4['SA4_CODE_2016,SA4_NAME_2016,GCCSA_CODE_2016,GCCSA_NAME_2016,STATE_CODE_2016,STATE_NAME_2016,AREA_ALBERS_SQKM'].str.split(',',expand=True)
df_sa4

In [None]:
# Keep the SA4 code/name plus the GCCSA and state names, renamed to the short column
# names used when joining onto the user data (the code column becomes "SA4").
name_mapping = {
    "SA4_CODE_2016": "SA4",
    "SA4_NAME_2016": "SA4_NAME",
    "GCCSA_NAME_2016": "GCCSA_NAME",
    "STATE_NAME_2016": "STATE_NAME",
}
df_sa4 = df_sa4[list(name_mapping)].rename(columns=name_mapping)
df_sa4

In [None]:
# Convert the SA4 code to float before it is used as a merge key in the next cell;
# a value that cannot be converted raises.
df_sa4['SA4'] = df_sa4['SA4'].astype(float)


In [None]:
# Add the region names to the membership -> SA4 lookup ...
df_user_min = df_user_min.merge(df_sa4, how="left", on="SA4")
# ... and attach that lookup to the cleaned per-member frame.
cleaned_df2 = cleaned_df2.merge(df_user_min, how="left", on="membership_number")


In [None]:
# Add member_key (needed to join the plan data), then remove the exact duplicate rows
# that arise when a membership number occurs more than once in df_user.
cleaned_df2 = (
    cleaned_df2
    .merge(df_user[["membership_number", "member_key"]], how="left", on="membership_number")
    .drop_duplicates()
)

In [None]:
# Summary (dtypes, non-null counts) and a preview of the first 100 rows of cleaned_df2.
cleaned_df2.info()
cleaned_df2.head(100)

In [None]:
# Load plans with status 'COMPLETED', inner-joined to their budget rows on plan_key.
# NOTE(review): presumably one row per plan budget category, so a plan spans several rows - confirm.
df_plan = pd.read_sql("select p.plan_key, p.member_key, p.status, p.start_date, p.end_date, pb.item_category_level2_key, pb.allocation, pb.remaining from HH_plan p join HH_plan_budget pb on p.plan_key = pb.plan_key where p.status = 'COMPLETED'", con=conn)
# Convert start_date and end_date to datetime (year-month-day format)
df_plan["start_date"] = pd.to_datetime(df_plan["start_date"], format="%Y-%m-%d")
df_plan["end_date"] = pd.to_datetime(df_plan["end_date"], format="%Y-%m-%d")
# Summary info and preview for df_plan
df_plan.info()
df_plan.head(100)


In [None]:
# df_plan2 is a second name for df_plan (no copy is made), so the assignments below
# also apply to df_plan.
df_plan2 = df_plan

# Parse both plan dates as datetimes (year-month-day format).
for date_col in ("start_date", "end_date"):
    df_plan2[date_col] = pd.to_datetime(df_plan2[date_col], format="%Y-%m-%d")


In [None]:
# Reduce the plan rows to one row per member:
#   1. sum allocation and remaining per plan_key (member and dates taken from the first row),
#   2. order the plans by start and end date, newest first,
#   3. keep the first row per member_key, i.e. the most recent plan.
# NOTE(review): the groupby "first" aggregation takes the first non-null value per column,
# so a null in the newest plan would be filled from an older one - confirm this is acceptable.
df_grouped_plan2 = (
    df_plan2.groupby("plan_key")
    .agg({"allocation": "sum", "remaining": "sum", "member_key": "first", "start_date": "first", "end_date": "first"})
    .reset_index()
    .sort_values(["start_date", "end_date"], ascending=False)
    .groupby("member_key")
    .agg({"start_date": "first", "end_date": "first", "plan_key": "first", "allocation": "first", "remaining": "first"})
    .reset_index()
)


In [None]:
# Summary and preview of df_grouped_plan2 (one row per member_key after the grouping above).

df_grouped_plan2.info()
df_grouped_plan2.head(100)

In [None]:
# Attach each member's most recent completed plan to the cleaned member frame.
temp_df2 = cleaned_df2.merge(df_grouped_plan2, how="left", on="member_key")

# Members without a completed plan have no plan_key after the left join; drop them.
temp_df2 = temp_df2[temp_df2["plan_key"].notna()]


In [None]:
# Share of the plan allocation that was spent, and the spent amount itself.
# NOTE(review): an allocation of 0 makes the ratio inf/NaN - confirm such plans cannot occur.
temp_df2["spending_ratio"] = temp_df2["allocation"].sub(temp_df2["remaining"]).div(temp_df2["allocation"])
temp_df2["spent"] = temp_df2["allocation"].sub(temp_df2["remaining"])

# Thresholds for classifying the spending ratio.
# TODO: CHECK AND MODIFY VALUES HERE
under_spend_thres = 0.75
par_spend_thres = 1.0

# under_spent: ratio <= 0.75; over_spent: ratio > 1.0; par_spent: ratio exactly 1.0.
# Ratios strictly between the two thresholds get False on all three flags.
temp_df2["under_spent"] = temp_df2["spending_ratio"].le(under_spend_thres)
temp_df2["over_spent"] = temp_df2["spending_ratio"].gt(par_spend_thres)
temp_df2["par_spent"] = temp_df2["spending_ratio"].eq(par_spend_thres)

In [None]:
# The raw invoice totals are no longer needed; 'subtraction' carries their difference.
temp_df2 = temp_df2.drop(["invoice_total", "funded_total"], axis=1)

In [None]:
# Summary of temp_df2 and the class balance of the under_spent flag.
# Only the last expression (the value counts) is rendered as cell output;
# the head(100) result is computed but not shown.

temp_df2.info()
temp_df2.head(100)
temp_df2["under_spent"].value_counts()


In [None]:
# One-hot encoding of temp_df2, indexed by membership_number.
one_hot_df2 = temp_df2.set_index("membership_number")

# One-hot encode gender into gender_* indicator columns.
one_hot_gender = pd.get_dummies(one_hot_df2["u_gender"], prefix="gender")

# Append the indicator columns row for row. The dummies share one_hot_df2's index, so a
# column-wise concat yields exactly one output row per input row. A key-based merge on
# membership_number would multiply rows whenever a membership number occurs more than once.
one_hot_df2 = pd.concat([one_hot_df2, one_hot_gender], axis=1)


In [None]:
# One-hot encoding of GCCSA

# GCCSA names present in the data, grouped by the keyword they contain.
greaterRegions = one_hot_df2.loc[one_hot_df2["GCCSA_NAME"].str.contains("Greater", na=False), "GCCSA_NAME"].value_counts().index.to_list()
restOfRegions = one_hot_df2.loc[one_hot_df2["GCCSA_NAME"].str.contains("Rest of", na=False), "GCCSA_NAME"].value_counts().index.to_list()
capitalRegions = one_hot_df2.loc[one_hot_df2["GCCSA_NAME"].str.contains("Capital", na=False), "GCCSA_NAME"].value_counts().index.to_list()

# "Greater ..." and "... Capital ..." regions become Urban, "Rest of ..." regions become Rural.
# The replacement is limited to the GCCSA_NAME column; replacing across the whole frame
# would also rewrite identical strings in other columns (e.g. a state with the same name).
for regions, label in ((greaterRegions, "Urban"), (restOfRegions, "Rural"), (capitalRegions, "Urban")):
    if regions:
        one_hot_df2["GCCSA_NAME"] = one_hot_df2["GCCSA_NAME"].replace(regions, label)

# Members without a region are labelled "Unknown".
one_hot_df2["GCCSA_NAME"] = one_hot_df2["GCCSA_NAME"].fillna("Unknown")

# One-hot encode GCCSA_NAME and append the indicator columns row for row. The dummies
# share one_hot_df2's index, so concat keeps one output row per input row, whereas a merge
# on membership_number would multiply rows for repeated membership numbers.
one_hot_region = pd.get_dummies(one_hot_df2["GCCSA_NAME"], prefix="GCCSA")
one_hot_df2 = pd.concat([one_hot_df2, one_hot_region], axis=1)

In [None]:
# Indicator columns stored as uint8 are converted to bool; other columns are unchanged.
column_names = one_hot_df2.select_dtypes(include=[np.uint8]).columns
one_hot_df2 = one_hot_df2.astype(dict.fromkeys(column_names, bool))


In [None]:
# Remove the raw location columns and the gender column now that they are encoded.
one_hot_df2 = one_hot_df2.drop(["SA4", "SA4_NAME", "GCCSA_NAME", "STATE_NAME", "u_gender"], axis=1)

In [None]:
# Summary and preview of the one-hot encoded frame.

one_hot_df2.info()
one_hot_df2.head(100)

In [None]:
# Turn 'subtraction' into two complementary flags around a threshold of 500.
# `df` is a second name for one_hot_df2 (no copy), so one_hot_df2 gains the columns too.
df = one_hot_df2
# Whole-number amounts: the cast to int discards the fractional part.
df['subtraction'] = df['subtraction'].astype(int)
df['not_fully_funded'] = df['subtraction'] >= 500
df['acceptable_funded'] = df['subtraction'] < 500
df


In [None]:
# Replace u_disabilities with one indicator column per label, prefixed "u_dis".
df = pd.get_dummies(df, columns=['u_disabilities'], prefix=['u_dis'])

# MODELING: CLUSTERING

In [None]:
# Pre-processing for clustering.
# df_og is a second name for df_for_cluster, the claim-level copy taken before aggregation
# (no new copy is made here).
df_og = df_for_cluster
df_og.info()

In [None]:
# Reminder of the columns available in the per-member frame that is merged in below.
temp_df2.info()

In [None]:
# Days between a claim's start date and the date its invoice was funded.
for date_col in ('start_date', 'funded_date'):
    df_og[date_col] = pd.to_datetime(df_og[date_col])
df_og['days_between_start_funded'] = df_og['funded_date'].sub(df_og['start_date']).dt.days


In [None]:
# Average start-to-funded delay per member; members with no computable delay are dropped.
df_clus = (
    df_og.groupby("membership_number")["days_between_start_funded"]
    .mean()
    .reset_index()
)

df_clus = df_clus[df_clus["days_between_start_funded"].notna()]
df_clus.info()
df_clus


In [None]:
# Add the average funding delay to the per-member frame; members without one get NaN.
final_df1 = temp_df2.merge(df_clus, how="left", on="membership_number")
final_df1.info()

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, including deprecation notices.
warnings.filterwarnings('ignore')

# Mapping regions: GCCSA names present in the data, grouped by the keyword they contain.
greaterRegions = final_df1.loc[final_df1["GCCSA_NAME"].str.contains("Greater", na=False), "GCCSA_NAME"].value_counts().index.to_list()
restOfRegions = final_df1.loc[final_df1["GCCSA_NAME"].str.contains("Rest of", na=False), "GCCSA_NAME"].value_counts().index.to_list()
capitalRegions = final_df1.loc[final_df1["GCCSA_NAME"].str.contains("Capital", na=False), "GCCSA_NAME"].value_counts().index.to_list()

# "Greater ..." and "... Capital ..." regions become Urban, "Rest of ..." regions become Rural.
# The replacement is limited to the GCCSA_NAME column; replacing across the whole frame
# would also rewrite identical strings in other columns (e.g. a state with the same name).
for regions, label in ((greaterRegions, "Urban"), (restOfRegions, "Rural"), (capitalRegions, "Urban")):
    if regions:
        final_df1["GCCSA_NAME"] = final_df1["GCCSA_NAME"].replace(regions, label)

# Members without a region are labelled "Unknown".
final_df1["GCCSA_NAME"] = final_df1["GCCSA_NAME"].fillna("Unknown")

final_df1.info()

In [None]:
# Members without a funding delay receive the mean delay of all other members.
final_df1['days_between_start_funded'] = final_df1['days_between_start_funded'].fillna(final_df1['days_between_start_funded'].mean())
final_df1.info()

In [None]:
# Keep only the clustering features: drop identifiers, location detail, plan dates and
# the spending ratio/flags.
final_df1 = final_df1.drop(
    ["membership_number", "SA4", "SA4_NAME", "STATE_NAME", "member_key", "start_date", "end_date",
     "plan_key", "spending_ratio", "spent", "under_spent", "over_spent", "par_spent"],
    axis=1,
)
final_df1.info()
# Snapshot taken before the categorical labels are replaced by numeric codes.
final_df = final_df1.copy()

In [None]:
# (The result of this first expression is not displayed; only the last line of the cell is.)
final_df1['GCCSA_NAME'].unique()
# Map the region labels to numeric codes: Urban=1, Rural=2, Unknown=3.
# NOTE(review): any other label still present in the column would map to NaN - confirm none remain.
# Re-running this cell maps the numeric codes again and would turn the column into NaN.
GCCSA_NAME_map = {"Unknown":3, "Urban": 1, "Rural": 2}
final_df1['GCCSA_NAME'] = final_df1['GCCSA_NAME'].map(GCCSA_NAME_map)
#final_df1.info()
final_df1['GCCSA_NAME'].value_counts()

In [None]:
# (The result of this first expression is not displayed; only the last line of the cell is.)
final_df1['u_gender'].unique()
# Map the gender labels to numeric codes: Male=1, Female=2, Other=3, Unknown=4.
# NOTE(review): labels outside this mapping (including nulls) would become NaN - confirm none remain.
u_gender_map = {"Male":1, "Female": 2, "Other": 3, "Unknown":4 }
final_df1['u_gender'] = final_df1['u_gender'].map(u_gender_map)
#final_df1.info()
final_df1['u_gender'].value_counts()

In [None]:
# (The result of this first expression is not displayed; only the last line of the cell is.)
final_df1['u_disabilities'].unique()
# Map the two disability labels to numeric codes: others=1, Intelectual=2.
# The key must match the label spelling written by the earlier cleaning cell.
u_disabilities_map = {"others":1, "Intelectual": 2 }
final_df1['u_disabilities'] = final_df1['u_disabilities'].map(u_disabilities_map)
#final_df1.info()
final_df1['u_disabilities'].value_counts()

In [None]:
# Final check of the clustering input: dtypes, non-null counts and the first rows.
final_df1.info()
final_df1.head(10)

In [None]:
# Standardise every column of the clustering frame, the numeric category codes included.
# NOTE(review): a later cell adds Cluster_ID to final_df1, so re-running this cell after
# that would include the cluster labels as a feature.
scaler = StandardScaler()
X = scaler.fit_transform(final_df1.to_numpy())

In [None]:
# Fit K-prototypes for K = 2, 4, 6, 8 and record each fitted model and its cost.
# Columns 1, 2 and 3 of X are treated as categorical.
rs = 42
clusters, cost_vals = [], []

for k in range(2, 10, 2):
    model_clus = KPrototypes(n_jobs=10, random_state=rs, n_clusters=k)
    model_clus.fit_predict(X, categorical=[1, 2, 3])
    # clusters[i] holds the model for K = 2 * (i + 1)
    clusters += [model_clus]
    cost_vals += [model_clus.cost_]

In [None]:
# Elbow plot: clustering cost for each K that was fitted above.
plt.plot(list(range(2, 10, 2)), cost_vals, marker='*')
plt.show()

In [None]:
# Split X into its numeric and categorical parts for the silhouette calculation.
X_num = [list(row[[0, 4, 5, 6]]) for row in X]  # numeric columns of X
X_cat = [list(row[[1, 2, 3]]) for row in X]  # categorical columns of X

In [None]:
# Silhouette score for each fitted K-prototypes model (K = 2, 4, 6, 8), for reference.
# Numeric and categorical features are scored separately (euclidean / hamming distance)
# and the two scores are averaged.
# Each model produces its labels once and both scores reuse them; previously every
# score called fit_predict again, eight times in total.
silhouette_messages = (
    "The avg Silhouette score for k=2: ",
    "The avg silhouette score for k=4: ",
    "The avg Silhouette score for k=6: ",
    "The avg Silhouette score for k=8: ",
)

# clusters[i] holds the model for K = 2 * (i + 1), in the same order as the messages.
for model, message in zip(clusters, silhouette_messages):
    labels = model.fit_predict(X, categorical=[1,2,3])
    silScoreNums = silhouette_score(X_num, labels, metric='euclidean')
    silScoreCats = silhouette_score(X_cat, labels, metric='hamming')  # hamming for categorical codes
    # Average the silhouette scores
    silScore = (silScoreNums + silScoreCats) / 2
    print(message + str(silScore))

# Clustering Visualisation

In [None]:
# Pairplot of the K=2 clustering. Another model can be chosen (K=4/6/8) by changing the index.
model = clusters[0] # clusters[0] is the K=2 model
# fit_predict is called again here to obtain the labels for this model.
y=model.fit_predict(X, categorical=[1,2,3]) 
# Adds (or overwrites) the Cluster_ID column on final_df1.
final_df1['Cluster_ID'] = y
# NOTE(review): the return value of this call is discarded; the plot below uses palette='Dark2'.
sns.color_palette("vlag", as_cmap=True)
# how many records are in each cluster
print("Cluster membership")
print(final_df1['Cluster_ID'].value_counts())
# pairplot the cluster distribution.
cluster_g = sns.pairplot(final_df1, hue='Cluster_ID',diag_kind='hist',palette='Dark2')
plt.show()


In [None]:
# Pairplot of the K=4 clustering. Another model can be chosen (K=2/6/8) by changing the index.
model = clusters[1] # clusters[1] is the K=4 model
# fit_predict is called again here to obtain the labels for this model.
y=model.fit_predict(X, categorical=[1,2,3]) 
# Overwrites Cluster_ID, so the cells below describe the K=4 clusters.
final_df1['Cluster_ID'] = y
# NOTE(review): the return value of this call is discarded; the plot below uses palette='Dark2'.
sns.color_palette("vlag", as_cmap=True)
# how many records are in each cluster
print("Cluster membership")
print(final_df1['Cluster_ID'].value_counts())
# pairplot the cluster distribution.
cluster_g = sns.pairplot(final_df1, hue='Cluster_ID',diag_kind='hist',palette='Dark2')
plt.show()

In [None]:
# Distribution of every feature inside each cluster, drawn over the overall distribution
# (black line) for comparison.
cols = ['subtraction', 'allocation', 'remaining', 'u_gender', 'GCCSA_NAME', 'u_disabilities', 'days_between_start_funded']
n_bins = 20

# Cluster ids to inspect; these match the K=4 labelling assigned in the previous cell.
clusters_to_inspect = [0,1,2,3]

for cluster in clusters_to_inspect:
    print("Distribution for cluster {}".format(cluster))
    # One subplot row per feature.
    fig, ax = plt.subplots(nrows=len(cols), figsize=(15,15))
    ax[0].set_title("Cluster {}".format(cluster))
    cluster_rows = final_df1[final_df1['Cluster_ID'] == cluster]

    for j, col in enumerate(cols):
        # Shared bin edges across clusters; Series.min()/max() skip missing values,
        # unlike the builtin min()/max().
        bins = np.linspace(final_df1[col].min(), final_df1[col].max(), n_bins)
        # NOTE(review): sns.distplot and the 'bw' keyword are deprecated in recent seaborn releases.
        sns.distplot(cluster_rows[col], bins=bins, ax=ax[j], norm_hist=True, kde_kws={'bw':1.5})
        sns.distplot(final_df1[col], bins=bins, ax=ax[j], hist=False, color="k")

    plt.subplots_adjust(bottom=0.1, right=1.8, top=4.5, wspace=0.8, hspace=0.8)
    plt.tight_layout()
    plt.show()

# Association Mining

In [None]:
#using the dataframe of clustering before the mapping

#final_df.info()
#final_df.head(10)


In [None]:
### Claims with their service registration group and owning member

# All claims with their invoice id and level-3 item category.
df_all_claims = pd.read_sql("select c.id as claim_id, invoice_id as invoiceId, c.item_category_level3_id from HH_claim c", con=conn)

# Reference table: level-3 item category -> NDIS registration group.
df_all_ndis_service_cat = pd.read_sql("select item_category_level3_id, ndis.registration_group from hedgehog_ndis_service_item_ref ndis", con=conn)

# Keep only claims whose category has a registration group (inner join); the category id
# is not needed afterwards.
df_claimsWithProvider_details = (
    df_all_claims
    .merge(df_all_ndis_service_cat, how='inner', on="item_category_level3_id")
    .drop(columns=['item_category_level3_id'])
)

# Invoice id -> member id lookup (this query took about a minute when it was written).
df_providersWithInvoice_details = pd.read_sql("select i.id as invoiceId, i.member_id from HH_invoice i;", con=conn)

# Attach the member id to every claim through its invoice.
df_claims_provider_details = df_claimsWithProvider_details.merge(df_providersWithInvoice_details, how="left", on="invoiceId")

##################################################

# Final claim and provider frame
print(df_claims_provider_details.info())
print(df_claims_provider_details)


In [None]:

# Lookup of member id -> membership number, used to key the association data by membership number.
df_u = pd.read_sql("select id as member_id, membership_number from HH_member m;", con=conn)
df_u.info()
df_u

In [None]:
# Add the membership number to every claim, then drop the technical id columns so that
# only membership_number and registration_group remain.
final_df_a = (
    df_claims_provider_details
    .merge(df_u, how="left", on="member_id")
    .drop(columns=['claim_id', 'invoiceId', 'member_id'])
)
final_df_a.info()
final_df_a

In [None]:
# Prepare the association-mining input: one "transaction" per member, holding the
# registration groups of all of that member's claims.
services = final_df_a.groupby('membership_number')['registration_group'].agg(list)

print(services.head(5))

In [None]:
 '''(1) If you are interested in generating associations that involve fairly rare services,
 you should consider reducing min_support. (2) If the items present in the dataset do not show high support, 
 the 'min_support' threshold should be set to a small value, and vice versa. (3) If you obtain too many rules to be practically 
 useful, you should consider increasing min_support and min_confidence as a possible solution.'''

In [None]:

# Convert the per-member service lists to a plain Python list and mine the item sets
# that reach a support of at least 5%.
services_list = services.tolist()
results = list(apriori(services_list, min_support=0.05))

# print first 5 rules
print(results[:5])

In [None]:
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori output into one DataFrame row per association rule.

    Parameters
    ----------
    results : iterable
        Rule sets as produced by apriori. Each rule set exposes ``support`` and
        ``ordered_statistics``; each ordered statistic exposes ``items_base``,
        ``items_add``, ``confidence`` and ``lift``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Left_side``, ``Right_side``, ``Support``, ``Confidence`` and ``Lift``.
        The frame is empty (with these columns) when ``results`` is empty.
    """
    rules = []

    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of the rule, items_add = right side.
            # The item sets are unordered, so they are sorted before joining; this keeps
            # the rule text identical from one run to the next.
            rules.append([','.join(sorted(rule.items_base)), ','.join(sorted(rule.items_add)),
                          rule_set.support, rule.confidence, rule.lift])

    # typecast it to pandas df
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support',
                                        'Confidence', 'Lift'])

# Convert the mined rule sets to a DataFrame and preview the first 20 rules.
result_df = convert_apriori_results_to_pandas_df(results)

print(result_df.head(20))

In [None]:
# Rank the rules by lift, strongest first.
result_df = result_df.sort_values('Lift', ascending=False)
print(result_df.head(20))

### MODELING: NEURAL NETWORK


In [None]:

# Remove identifiers, plan dates and the raw amounts/ratio from the model frame; the flags
# derived from them stay in.
# NOTE(review): over_spent and par_spent remain in df and are computed from the same
# spending_ratio as under_spent (the target used in the training code) - check for leakage.
# This cell is not re-runnable: a second run fails because the columns are already gone.
df = df.drop(
    ['spending_ratio', "plan_key", "member_key", "start_date", "end_date",
     'subtraction', 'allocation', 'remaining', 'spent'],
    axis=1,
)
# Summary information for df
df.info()
df.head(100)


In [None]:
# Train/test split, feature scaling and the baseline neural network.
# This code has to run: the evaluation, grid-search and persistence cells below use
# random_state, input_df_train/input_df_test, target_df_train/target_df_test, model_1 and
# target_prediction. While it was disabled inside a string, those cells failed with
# NameError on a fresh kernel.

# Specify the target column for analysis
target_column = "under_spent"
# Create the input df
input_df = df.drop([target_column], axis=1)
# Create a target df
target_df = df[target_column]
# Set random state
random_state = 10
# Set test size
test_size = 0.3
# Convert input_df to a NumPy array
input_df_mat = input_df.to_numpy()
# Split training and test data (stratified on the target)
input_df_train, input_df_test, target_df_train, target_df_test = train_test_split(input_df_mat, target_df, test_size=test_size, stratify=target_df, random_state=random_state)
# Get standard scaler
scaler = StandardScaler()
# Fit the scaler on the training data only, then apply it to both sets
input_df_train = scaler.fit_transform(input_df_train, target_df_train)
input_df_test = scaler.transform(input_df_test)
# Fit the default MLP and generate predictions for the test set
model_1 = MLPClassifier(random_state=random_state)
model_1.fit(input_df_train, target_df_train)
target_prediction = model_1.predict(input_df_test)

In [None]:
# Summary of the baseline model: per-class report on the test set, train and test accuracy,
# and the estimator's settings.
# These names (target_df_test, target_prediction, model_1, input_df_train, ...) are created
# by the training code in the preceding cell.
print("Classification Report: \n", classification_report(target_df_test, target_prediction))
print("Train Accuracy: ", model_1.score(input_df_train, target_df_train))
print("Test Accuracy: ", model_1.score(input_df_test, target_df_test))
print("Default Model Characteristics: ", model_1)

In [None]:
# Improved model: MLP tuned with a 10-fold cross-validated grid search over the size of a
# single hidden layer (2 to 7 units) and the alpha parameter.
hiddenLayerSizes = [(units,) for units in range(2, 8)]
alpha = [0.01, 0.001, 0.0001, 0.00001]
params = {'hidden_layer_sizes': hiddenLayerSizes, 'alpha': alpha}

model_2 = GridSearchCV(
    estimator=MLPClassifier(random_state=random_state),
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
model_2.fit(input_df_train, target_df_train)

target_prediction = model_2.predict(input_df_test)

# Summary Information
print("Classification Report: \n", classification_report(target_df_test, target_prediction))
print("Train Accuracy: ", model_2.score(input_df_train, target_df_train))
print("Test Accuracy: ", model_2.score(input_df_test, target_df_test))
print("Tuned Model Characteristics: ", model_2)
print("Best Parameters: \n", model_2.best_params_)

### MODEL PERSISTENCE

In [None]:
import pickle

# Persist the trained baseline model (model_1) to disk.
# NOTE(review): the grid-search model is model_2; confirm model_1 is the one meant to be "finalized".
filename = 'finalized_model.sav'
# The context manager closes the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model_1, model_file)


In [None]:
# some time later...

# Load the model back from disk and score it on the held-out test set.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook produced itself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(input_df_test, target_df_test)
print(result)

In [None]:
# Disabled sample code, kept inside a string so it does not execute: it would draw a
# heatmap of a hard-coded 11x11 matrix labelled A-K.
# NOTE(review): the matrix is not computed from this notebook's models - remove or replace with real results.
'''
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
array = [[33,2,0,0,0,0,0,0,0,1,3], 
        [3,31,0,0,0,0,0,0,0,0,0], 
        [0,4,41,0,0,0,0,0,0,0,1], 
        [0,1,0,30,0,6,0,0,0,0,1], 
        [0,0,0,0,38,10,0,0,0,0,0], 
        [0,0,0,3,1,39,0,0,0,0,4], 
        [0,2,2,0,4,1,31,0,0,0,2],
        [0,1,0,0,0,0,0,36,0,2,0], 
        [0,0,0,0,0,0,1,5,37,5,1], 
        [3,0,0,0,0,0,0,0,0,39,0], 
        [0,0,0,0,0,0,0,0,0,0,38]]
df_cm = pd.DataFrame(array, index = [i for i in "ABCDEFGHIJK"],
                  columns = [i for i in "ABCDEFGHIJK"])
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True)

'''