# IFN711 - Budget Analysis for Leap in!

## Install the dependencies listed in *requirements.txt*.

In [None]:
!pip install -r requirements.txt

## Import statements

In [None]:
# Standard library
import getpass
import os
import pickle

# Third-party: data handling, database access and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import seaborn as sns
from apyori import apriori
# Load the .env file for database credentials
from dotenv import load_dotenv
# Imports for the neural network
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

## Sign into the Database using credentials either stored in .env file, or **manual input**.

In [None]:
# Load database credentials and open the connection used by every query below.
#
# Values are read from the process environment after attempting to load a
# local .env file. Anything that is still missing is requested interactively,
# so a partially filled .env no longer crashes on int(None).
load_dotenv()

host = os.getenv("host")
port = os.getenv("port")
db = os.getenv("dbname")
user = os.getenv("user")
password = os.getenv("password")

if None not in (host, port, db, user, password):
    # Confirm loading of credentials
    print("Database Credentials Loaded Successfully.")
else:
    # Confirm failure of loading of credentials
    print("Unable to detect Database Credentials. Please enter credentials manually.\n")
    # Request manual entry of whichever values were not found
    if host is None:
        host = input("\nPlease enter host address: ")
    if user is None:
        user = input("\nPlease enter username: ")
    if password is None:
        # getpass keeps the password out of the notebook output
        password = getpass.getpass("\nPlease enter password: ")
    if port is None:
        port = input("\nPlease enter port number: ")
    if db is None:
        db = input("\nPlease enter database name: ")

# pymysql is given the port as an integer; fail with a clear message otherwise
try:
    port = int(port)
except ValueError as exc:
    raise ValueError(f"Database port must be an integer, got {port!r}.") from exc

####### Connection to Client Database #######
conn = pymysql.connect(host=host, user=user, port=port, password=password, db=db)
# Print connection confirmation
print(conn)

## Make a dataframe of customer/user information.

In [None]:
# Load one row per managed consumer ('li_managed' stage, NDIS number present),
# left-joined to the member record and to the member's SA1-SA4 region codes.
df_user = pd.read_sql(
    "select m.id as member_id, m.membership_number, m.status, m.price_zone_code, m.member_key, "
    "s.u_ndis_number, s.u_disabilities, s.u_gender, s.u_date_of_birth, "
    "r.SA1, r.SA2, r.SA3,r.SA4  "
    "from SNOW_csm_consumer_user s "
    "left join  HH_member m  on s.u_ndis_number = m.membership_number "
    "left join libe_leapinprod_memberregion r on r.MemberId = s.u_leapin_id "
    "where s.u_stage = 'li_managed' and s.u_ndis_number is not null;",
    con=conn,
)

# Print the column summary of df_user, then preview its first 100 rows
df_user.info()
df_user.head(100)

## Make a dataframe of claims information.

In [None]:
# Load every claim with its invoice reference, state, risk level and start date.
df_claims = pd.read_sql(
    "SELECT c.id as claim_id, c.invoice_id, c.state, c.risk_level, c.start_date "
    "FROM HH_claim c;",
    con=conn,
)

# Print the column summary of df_claims, then preview its first 100 rows
df_claims.info()
df_claims.head(100)

## Make a dataframe of invoice information.

In [None]:
# Load every invoice with its member, invoiced total, funded total and funded date.
df_invoices = pd.read_sql(
    "SELECT i.id as invoice_id, i.member_id, i.invoice_total, i.funded_total, i.funded_date "
    "FROM HH_invoice i;",
    con=conn,
)

# Print the column summary of df_invoices, then preview its first 100 rows
df_invoices.info()
df_invoices.head(100)

## Merge df_invoices and df_claims.

In [None]:
# Left join: every claim is kept and gains the columns of its invoice
# (matched on invoice_id); claims without a matching invoice get nulls.
df_invoice_claim = df_claims.merge(df_invoices, how="left", on="invoice_id")

# Print the column summary of df_invoice_claim, then preview its first 100 rows
df_invoice_claim.info()
df_invoice_claim.head(100)

## Merge df_invoice_claim with df_user on member_id.

In [None]:
# Left join: attach the user attributes to each claim/invoice row via member_id.
df = df_invoice_claim.merge(df_user, how="left", on="member_id")

# Preview the first 100 rows of the combined frame.
df.head(100)

## Make a copy of df for the clustering model.

In [None]:
# Independent copy of the merged claim/invoice/user frame, kept for the
# clustering section because the name `df` is reassigned further down.
df_for_cluster = df.copy()

## Aggregate invoice_total and funded_total values by summing them. 

In [None]:
# One row per membership_number holding the summed invoice and funded totals.
df1 = (
    df.groupby("membership_number")
    .agg(dict.fromkeys(["invoice_total", "funded_total"], "sum"))
    .reset_index()
)

## Subtract the funded_total from invoice_total to see the extent to which each customer gets their reimbursements.

In [None]:
# "subtraction" = amount invoiced minus amount funded, per member
df1['subtraction'] = df1['invoice_total'].sub(df1['funded_total'])

In [None]:
# Print the column summary of df1, then preview its first 100 rows
df1.info()
df1.head(100)

## Merge the modified invoice-claim df with the user df.

In [None]:
# Left join: add the user attributes to each member's aggregated totals.
df2 = df1.merge(df_user, how="left", on="membership_number")

In [None]:
# Print the column summary of df2, then preview its first 20 rows
df2.info()
df2.head(20)

## Create a subset of df2 containing specific columns of interest.

In [None]:
# Columns kept for the analysis. A list is used instead of a set because
# pandas does not support a set as a column indexer and a set has no stable
# order, so the resulting column order would vary between runs.
cols_of_interest = ['membership_number', 'invoice_total', 'funded_total', 'subtraction', 'u_disabilities', 'u_gender', 'price_zone_code', 'status']

# Take an explicit copy so that later column assignments on df2 act on an
# independent frame rather than on a selection of the merged frame.
df2 = df2[cols_of_interest].copy()

In [None]:
# Print the column summary of the reduced df2, then preview its first 100 rows
df2.info()
df2.head(100)

# Data Cleaning of df2.

## *u_disabilities*

## 1. Replace blank values with "others".

In [None]:
# Empty-string disability entries become the literal category 'others'.
df2['u_disabilities'] = df2['u_disabilities'].replace({'': 'others'})

## 2. Aggregate all values other than "others" into "Intellectual".

In [None]:
# Relabel every value that is not 'others' as "Intellectual".
# The result is assigned back to the column: calling mask(..., inplace=True)
# on the Series returned by df2['u_disabilities'] is chained assignment and is
# not guaranteed to update df2 (it never does under pandas Copy-on-Write).
# NOTE(review): missing (NaN) values also compare unequal to 'others' and are
# therefore labelled "Intellectual" - confirm this is intended.
df2['u_disabilities'] = df2['u_disabilities'].mask(df2['u_disabilities'] != 'others', "Intellectual")

In [None]:
# Count how many rows fall into each disability category.
df2['u_disabilities'].value_counts()

## REMOVE: Create two subsets of data based on the amounts reimbursed.

In [None]:
# Lower bounds on "subtraction" (invoiced minus funded) for the two subsets.
# TODO: confirm whether these subsets are still needed.
sub_0_threshold = 1000
sub_1_threshold = 10000

# Rows whose unfunded amount is strictly above each threshold.
sub_0 = df2.loc[df2['subtraction'].gt(sub_0_threshold)]

sub_1 = df2.loc[df2['subtraction'].gt(sub_1_threshold)]

## *u_gender* - Replace blank values with "Unknown".

In [None]:
# Start the cleaned frame from an independent copy. A plain assignment would
# only create a second name for df2, so the replacement below would also
# change df2 after earlier cells have already displayed it.
cleaned_df2 = df2.copy()

# Empty-string gender entries become "Unknown".
cleaned_df2["u_gender"] = cleaned_df2["u_gender"].replace([""],"Unknown")

## *status* - Drop column.

In [None]:

# Remove the member status column from the cleaned frame.
cleaned_df2 = cleaned_df2.drop("status", axis=1)

## *price_zone_code* - Drop column.

In [None]:
# Remove price_zone_code; the SA4-based location columns are merged in below.
cleaned_df2 = cleaned_df2.drop("price_zone_code", axis=1)

## Replace *price_zone_code* with SA4 information.

In [None]:
# Reduce the user table to the member identifier and its SA4 code.
df_user_min = df_user[["membership_number", "SA4"]]
# Find the rows whose SA4 code is exactly 0.0 and remove them by index label
blankIndices = df_user_min.index[df_user_min["SA4"].eq(0.0)]
df_user_min = df_user_min.drop(index=blankIndices)

In [None]:
# Remove rows with a null membership_number or SA4, then remove rows that are
# exact duplicates of an earlier (membership_number, SA4) pair.
df_user_min = df_user_min.dropna().drop_duplicates()

In [None]:
# Read the SA4 table dump using a tab delimiter.
# NOTE(review): the next cell splits a single comma-joined column, which
# suggests the file is comma-separated; reading it with sep=',' would be more
# direct, but this cell and the split cell would have to change together.
df_sa4 = pd.read_csv("./TableDump/SA4_2016.csv", delimiter='\t')

In [None]:
# The dump was read as one column whose name is the comma-joined header.
# Split that column on ',' into one column per header field.
sa4_columns = [
    'SA4_CODE_2016',
    'SA4_NAME_2016',
    'GCCSA_CODE_2016',
    'GCCSA_NAME_2016',
    'STATE_CODE_2016',
    'STATE_NAME_2016',
    'AREA_ALBERS_SQKM',
]
combined_column = ','.join(sa4_columns)
df_sa4[sa4_columns] = df_sa4[combined_column].str.split(',', expand=True)

In [None]:
# Source column -> name used in the rest of the notebook (SA4 is the join key).
name_mapping = dict(
    SA4_CODE_2016="SA4",
    SA4_NAME_2016="SA4_NAME",
    GCCSA_NAME_2016="GCCSA_NAME",
    STATE_NAME_2016="STATE_NAME",
)
# Keep only the mapped columns (in mapping order) and apply the new names.
df_sa4 = df_sa4.loc[:, list(name_mapping)].rename(columns=name_mapping)
df_sa4

In [None]:
# Convert the SA4 code to float so it can be merged with the float SA4 column
# of the user table; a value that cannot be converted raises an error.
df_sa4['SA4'] = df_sa4['SA4'].astype('float64', errors='raise')

In [None]:
# Print the column summary of df_sa4, then display the frame
df_sa4.info()
df_sa4

In [None]:
# Attach the SA4/GCCSA/state names to each member's SA4 code.
df_user_min = df_user_min.merge(df_sa4, how="left", on="SA4")
# Attach that location information to the cleaned per-member frame.
cleaned_df2 = cleaned_df2.merge(df_user_min, how="left", on="membership_number")

## Add the *member_key* column to facilitate invoice-related calculations.  

In [None]:
# Add member_key (looked up by membership_number), then remove rows that are
# exact duplicates across all columns.
cleaned_df2 = (
    cleaned_df2
    .merge(df_user[["membership_number", "member_key"]], how="left", on="membership_number")
    .drop_duplicates()
)

In [None]:
# Print the column summary of cleaned_df2, then preview its first 100 rows
cleaned_df2.info()
cleaned_df2.head(100)

## Get information on Plans with status of "completed".

In [None]:
# Load the budget lines of every plan whose status is 'COMPLETED'
# (one row per plan/budget pair from the inner join).
df_plan = pd.read_sql(
    "select p.plan_key, p.member_key, p.status, p.start_date, p.end_date, "
    "pb.item_category_level2_key, pb.allocation, pb.remaining "
    "from HH_plan p "
    "join HH_plan_budget pb on p.plan_key = pb.plan_key "
    "where p.status = 'COMPLETED'",
    con=conn,
)

In [None]:
# Parse both plan date columns as datetimes using the YYYY-MM-DD format
for date_col in ("start_date", "end_date"):
    df_plan[date_col] = pd.to_datetime(df_plan[date_col], format="%Y-%m-%d")

In [None]:
# Print the column summary of df_plan, then preview its first 100 rows
df_plan.info()
df_plan.head(100)

In [None]:
# df_plan2 is a second name for the same DataFrame object as df_plan
# (no copy is made here).
df_plan2 = df_plan
# TODO(review): the commented-out conversion below repeats the to_datetime
# calls already applied to df_plan in an earlier cell; confirm it can be deleted.
# # Convert start and end_date to datetime
# df_plan2["start_date"] = pd.to_datetime(df_plan2["start_date"], format="%Y-%m-%d")
# df_plan2["end_date"] = pd.to_datetime(df_plan2["end_date"], format="%Y-%m-%d")


## Group plans by *plan_key*, and then aggregate values as necessary.

In [None]:
# Collapse the budget lines into one row per plan: allocation and remaining
# are summed, while member_key and the plan dates are taken from the plan's rows.
df_grouped_plan2 = df_plan2.groupby(["plan_key"]).agg({"allocation": "sum", "remaining": "sum", "member_key": "first", "start_date": "first", "end_date": "first"}).reset_index()

# Sort grouped df by start and end dates in descending order
df_grouped_plan2 = df_grouped_plan2.sort_values(["start_date", "end_date"], ascending=[False, False])

# Keep only the most recent plan of each member.
# drop_duplicates keeps the whole first row per member. GroupBy "first" instead
# picks the first non-null value of each column separately, so a missing date
# on the latest plan would silently be filled in from an older plan.
df_grouped_plan2 = (
    df_grouped_plan2
    .dropna(subset=["member_key"])  # plans without a member cannot be joined later
    .drop_duplicates(subset="member_key", keep="first")
    .sort_values("member_key")
    .reset_index(drop=True)
    .loc[:, ["member_key", "start_date", "end_date", "plan_key", "allocation", "remaining"]]
)

In [None]:
# Print the column summary of df_grouped_plan2, then preview its first 100 rows
df_grouped_plan2.info()
df_grouped_plan2.head(100)

## Merge plan information with cleaned_df2

In [None]:
# Attach each member's plan row (left join on member_key), then keep only the
# members for whom a plan was found, i.e. rows with a non-null plan_key.
temp_df2 = (
    cleaned_df2
    .merge(df_grouped_plan2, how="left", on="member_key")
    .dropna(subset=["plan_key"])
)


## Calculate the ratio of money spent to money allocated.

In [None]:
# spending_ratio = (allocation - remaining) / allocation, i.e. the share of
# the allocated budget that was used.
# NOTE(review): an allocation of 0 leaves the ratio undefined (inf/NaN for
# float data) - confirm how such plans should be treated.
temp_df2["spending_ratio"] = temp_df2["allocation"].sub(temp_df2["remaining"]).div(temp_df2["allocation"])

# spent = absolute amount used (allocation minus what remains).
temp_df2["spent"] = temp_df2["allocation"].sub(temp_df2["remaining"])

## Define columns *under_spent*, *over_spent* and *par_spent* in accordance with the defined thresholds. 

In [None]:
# Spending-ratio thresholds: at or below 0.75 counts as under-spent, exactly
# 1.0 as par, above 1.0 as over-spent.
# NOTE(review): ratios strictly between 0.75 and 1.0 set none of the three
# flags - confirm that band is meant to be unclassified.
under_spend_thres = 0.75
par_spend_thres = 1.0

temp_df2["under_spent"] = temp_df2["spending_ratio"].le(under_spend_thres)
temp_df2["over_spent"] = temp_df2["spending_ratio"].gt(par_spend_thres)
temp_df2["par_spent"] = temp_df2["spending_ratio"].eq(par_spend_thres)

In [None]:
# Remove the two invoice-level totals from the frame
temp_df2 = temp_df2.drop(["invoice_total", "funded_total"], axis=1)

In [None]:
# Column summary of temp_df2 (printed by info()).
temp_df2.info()
# display() is required for the preview: a bare expression is rendered only
# when it is the last line of a cell, so head(100) was previously discarded.
display(temp_df2.head(100))
# Number of members flagged / not flagged as under-spent.
temp_df2["under_spent"].value_counts()

# One-Hot Encoding of DF2

In [None]:
# New frame indexed by membership_number; the encoded columns produced below
# are combined with it through that index.
one_hot_df2 = temp_df2.set_index("membership_number")

## One-hot encoding of *u_gender*.

In [None]:
# One indicator column per distinct u_gender value, named "gender_<value>".
one_hot_gender = pd.get_dummies(one_hot_df2["u_gender"], prefix="gender")

In [None]:
# Append the gender indicator columns to the frame.
# one_hot_gender was built from one_hot_df2, so both share the same index and
# can be placed side by side. A key-based merge on membership_number would
# multiply rows whenever a membership_number occurs more than once.
one_hot_df2 = pd.concat([one_hot_df2, one_hot_gender], axis=1)

## One-hot encoding of *GCCSA_NAME*.

In [None]:
# Classify each GCCSA name as Urban / Rural / Unknown.
# Only the GCCSA_NAME column is rewritten. A frame-wide replace() would also
# rewrite identical text that happens to appear in other columns.
gccsa_name = one_hot_df2["GCCSA_NAME"]
is_greater = gccsa_name.str.contains("Greater", na=False)
is_rest_of = gccsa_name.str.contains("Rest of", na=False)
is_capital = gccsa_name.str.contains("Capital", na=False)

# Later masks override earlier ones, so the precedence is
# "Greater" > "Rest of" > "Capital". Names matching none of the patterns are
# left unchanged, and missing names become "Unknown".
one_hot_df2 = one_hot_df2.assign(
    GCCSA_NAME=(
        gccsa_name
        .mask(is_capital, "Urban")
        .mask(is_rest_of, "Rural")
        .mask(is_greater, "Urban")
        .fillna("Unknown")
    )
)

# Perform One-hot Encoding on GCCSA_NAME
one_hot_region = pd.get_dummies(one_hot_df2["GCCSA_NAME"], prefix="GCCSA")
# Both frames share the same index, so the indicator columns are appended side
# by side; a key-based merge would multiply rows for repeated membership numbers.
one_hot_df2 = pd.concat([one_hot_df2, one_hot_region], axis=1)

## Convert any *UINT8*-formatted columns to *Bool*.

In [None]:
# Find every uint8 column and cast those columns to bool.
column_names = one_hot_df2.select_dtypes(include=["uint8"]).columns
one_hot_df2[column_names] = one_hot_df2[column_names].astype("bool")

In [None]:
# Remove the raw location and gender columns now that their encoded
# counterparts have been added.
one_hot_df2 = one_hot_df2.drop(
    ["SA4", "SA4_NAME", "GCCSA_NAME", "STATE_NAME", "u_gender"],
    axis=1,
)

In [None]:
# Print the column summary of one_hot_df2, then preview its first 100 rows
one_hot_df2.info()
one_hot_df2.head(100)

## One-Hot Encoding of *subtraction*


In [None]:
# Assumed threshold: an unfunded amount (invoiced minus funded) at or above
# this value marks the member as not fully funded.
funded_threshold = 500

# Copy instead of aliasing, so the columns added below do not also appear on
# one_hot_df2. Note that this rebinds `df`, which previously held the merged
# claim/invoice/user frame.
df = one_hot_df2.copy()

# Integer conversion truncates the fractional part of the amount.
df['subtraction'] = df['subtraction'].astype(int)
df['not_fully_funded'] = df['subtraction'].ge(funded_threshold)
df['acceptable_funded'] = df['subtraction'].lt(funded_threshold)

## One-Hot Encoding of *u_disabilities*


In [None]:
# Replace u_disabilities with one indicator column per category ("u_dis_<value>").
df = pd.get_dummies(
    df,
    columns=['u_disabilities'],
    prefix=['u_dis'],
)

# Model - Clustering

In [None]:
# Pre-processing for clustering.
# df_og is another name for the df_for_cluster object (no copy is made), so
# the columns added to df_og below are added to df_for_cluster as well.
df_og = df_for_cluster

## Get the average number of days from start_date to funded_date

In [None]:
# Parse both date columns, then compute the whole-day gap between funding
# and claim start for every row.
for date_col in ('start_date', 'funded_date'):
    df_og[date_col] = pd.to_datetime(df_og[date_col])
df_og['days_between_start_funded'] = df_og['funded_date'].sub(df_og['start_date']).dt.days

In [None]:
# Mean start-to-funded gap per membership_number; members whose mean is null
# are removed.
df_clus = (
    df_og.groupby("membership_number")
    .agg({'days_between_start_funded': 'mean'})
    .reset_index()
    .dropna(subset=['days_between_start_funded'])
)

In [None]:
# Print the column summary of df_clus, then display the frame
df_clus.info()
df_clus

In [None]:
# Some Stuff here.

In [None]:
# Add each member's mean start-to-funded gap to the plan-level frame
# (left join on membership_number), then print the column summary.
final_df1 = temp_df2.merge(df_clus, how="left", on="membership_number")
final_df1.info()

# Model - Association Mining

# Model - Neural Network


## Drop columns with unique values.


In [None]:
# Remove the identifier, date and raw-amount columns, leaving the remaining
# columns as the data used by the neural network.
df = df.drop(
    columns=[
        'spending_ratio',
        "plan_key",
        "member_key",
        "start_date",
        "end_date",
        'subtraction',
        'allocation',
        'remaining',
        'spent',
    ]
)

In [None]:
# Print the column summary of df, then preview its first 100 rows
df.info()
df.head(100)

## Code for generating the Neural Network Model.

### *Uncomment this code if you wish to train the model again or if there are any changes in the dataset.*

In [None]:
# Train the default neural network.
# This code used to be disabled inside a string literal, which left model_1,
# the train/test splits and target_prediction undefined for the cells below
# on a fresh kernel.

# Specify the target column for analysis
target_column = "under_spent"
# Inputs are every column except the target.
# NOTE(review): over_spent and par_spent are derived from the same
# spending_ratio as the target and remain among the inputs - confirm this is
# intended.
input_df = df.drop([target_column], axis=1)
# Create a target df
target_df = df[target_column]
# Set random state (reused by the split and by the models)
random_state = 10
# Fraction of rows held out for testing
test_size = 0.3
# Convert the inputs to a NumPy array
input_df_mat = input_df.to_numpy()
# Stratified split keeps the target class proportions in both partitions
input_df_train, input_df_test, target_df_train, target_df_test = train_test_split(input_df_mat, target_df, test_size=test_size, stratify=target_df, random_state=random_state)
# Fit the scaler on the training data only, then apply it to both partitions
scaler = StandardScaler()
input_df_train = scaler.fit_transform(input_df_train, target_df_train)
input_df_test = scaler.transform(input_df_test)
# Fit the default MLP and predict on the held-out data
model_1 = MLPClassifier(random_state=random_state)
model_1.fit(input_df_train, target_df_train)
target_prediction = model_1.predict(input_df_test)


In [None]:
# Report for the default model (model_1): per-class metrics on the test
# predictions, accuracy on the training and test partitions, and the
# estimator's settings. Requires the variables created by the training cell.
print("Classification Report: \n", classification_report(target_df_test, target_prediction))
print("Train Accuracy: ", model_1.score(input_df_train, target_df_train))
print("Test Accuracy: ", model_1.score(input_df_test, target_df_test))
print("Default Model Characteristics: ", model_1)

## OPTIONAL - Neural Network tuned with GridSearchCV

### *Uncomment this code if you wish to train the model again or if there are any changes in the dataset.*

In [None]:
# Optional hyper-parameter search. Set the flag to True to run it.
# The grid has 6 layer sizes x 4 alpha values = 24 candidates, each fitted
# with 10-fold cross-validation.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Single hidden layer with 2 to 7 units
    hiddenLayerSizes = [(2,), (3, ), (4, ), (5, ), (6, ), (7, )]

    # Regularisation strengths to try
    alpha = [0.01, 0.001, 0.0001, 0.00001]

    params = {'hidden_layer_sizes': hiddenLayerSizes, 'alpha': alpha}

    model_2 = GridSearchCV(param_grid=params, estimator=MLPClassifier(random_state=random_state), cv=10, n_jobs=-1)

    model_2.fit(input_df_train, target_df_train)

    # Overwrites the predictions of the default model with those of the tuned one
    target_prediction = model_2.predict(input_df_test)

In [None]:
# Report for the tuned model. The grid search is optional, so the report is
# skipped when model_2 has not been trained instead of failing with NameError.
if "model_2" in globals():
    print("Classification Report: \n", classification_report(target_df_test, target_prediction))
    print("Train Accuracy: ", model_2.score(input_df_train, target_df_train))
    print("Test Accuracy: ", model_2.score(input_df_test, target_df_test))
    print("Tuned Model Characteristics: ", model_2)
    print("Best Parameters: \n", model_2.best_params_)
else:
    print("Tuned model (model_2) has not been trained; skipping its report.")

## Export the Neural Network Model as a *.sav* file

In [None]:
# Serialise the default model to disk.
filename = './FinalisedModels/nn_model.sav'
# Create the output folder if it does not exist yet
os.makedirs(os.path.dirname(filename), exist_ok=True)
# The context manager closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(model_1, model_file)

## Tests for .sav file

In [None]:
# Load the model from disk and check that it scores the test data.
# pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(input_df_test, target_df_test)
print(result)

In [None]:
# NOTE(review): everything below is a string literal, so none of it executes.
# It is a heatmap example built from a hard-coded 11x11 matrix labelled A-K,
# not from the predictions of the models trained above. Replace it with a
# confusion matrix of the actual model or delete the cell.
'''
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
array = [[33,2,0,0,0,0,0,0,0,1,3], 
        [3,31,0,0,0,0,0,0,0,0,0], 
        [0,4,41,0,0,0,0,0,0,0,1], 
        [0,1,0,30,0,6,0,0,0,0,1], 
        [0,0,0,0,38,10,0,0,0,0,0], 
        [0,0,0,3,1,39,0,0,0,0,4], 
        [0,2,2,0,4,1,31,0,0,0,2],
        [0,1,0,0,0,0,0,36,0,2,0], 
        [0,0,0,0,0,0,1,5,37,5,1], 
        [3,0,0,0,0,0,0,0,0,39,0], 
        [0,0,0,0,0,0,0,0,0,0,38]]
df_cm = pd.DataFrame(array, index = [i for i in "ABCDEFGHIJK"], columns = [i for i in "ABCDEFGHIJK"])
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True)

'''