In [1]:
# -*- coding: utf-8 -*-
"""
@Created on Tue November 30 17:22:23 2021
@Data Warehouse & Mining Visualization
@Author: D. Perry
"""

# Aim: Which attributes of the dataset determine the Funding Agency for the contracts that were awarded?
# Approach: Two (2) models were built for this purpose: a Decision Tree classifier and a Neural Network classifier.

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
sns.set(color_codes=True)

from matplotlib import pyplot as plt
# These datasets are useful for getting a handle on a given machine learning algorithm.
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder # This line of codes below can encoding the data set individually.
from sklearn.model_selection import train_test_split

from IPython.display import clear_output

# Import Decision Tree Classifier library.
from sklearn import tree

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
import graphviz
# graphviz provides a simple pure-Python interface for the Graphviz graph-drawing software.


from IPython.display import Image, display, SVG
# Save the model as png file
from keras.utils.vis_utils import plot_model 

from sklearn.neural_network import MLPClassifier
import scipy.stats as stats
import datetime
import time 

from yellowbrick.features import Rank2D
from yellowbrick.features import JointPlotVisualizer

import pydotplus
from IPython.display import clear_output

# Standard plotly imports
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Render matplotlib plots inline in the notebook rather than in a separate window.
%matplotlib inline


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os # Read input/output from directory.

In [2]:
# Function to import the dataset.
def import_data_source(csv_path="ncc_consolidated.csv"):
    """Load the consolidated contracts CSV and walk the user through a preview.

    The function is interactive: after each printed section it waits for the
    user to press Enter and then clears the cell output.

    Parameters
    ----------
    csv_path : str, optional
        Path of the CSV file to read. Defaults to ``"ncc_consolidated.csv"``
        (the file the notebook originally hard-coded).

    Returns
    -------
    pandas.DataFrame
        The loaded data with ``Date`` rewritten as ``YYYY-MM-DD`` strings and
        ``Jamaican Equivalent`` cast to float and rounded to two decimals.

    Notes
    -----
    Sets the global pandas option ``display.max_columns`` to ``None`` so that
    every column is shown in subsequent output.
    """
    ncc_consolidated_data = pd.read_csv(csv_path)

    # Print the dataset size.
    print ("\nDATASET LENGHT : ", len(ncc_consolidated_data))
    print ("\nDATASET SHAPE : ", ncc_consolidated_data.shape)

    input("\n\t\tPress Enter to continue...")
    clear_output()

    # Parse the day-first date strings and normalise them to ISO format.
    ncc_consolidated_data['Date'] = pd.to_datetime(
        ncc_consolidated_data['Date'], dayfirst=True
    ).dt.strftime('%Y-%m-%d')

    # Cast the amount to float and round to two decimals. Series.round returns
    # a new Series, so the result has to be assigned back (the previous code
    # discarded it and the rounding never took effect).
    ncc_consolidated_data['Jamaican Equivalent'] = (
        ncc_consolidated_data['Jamaican Equivalent'].astype(float).round(decimals=2)
    )

    # Settings to display all columns.
    pd.set_option('display.max_columns', None)
    # Display the dataframe head.
    print ("\n\n---- DATA FRAME DATASET OBSERVATION ----")
    print ("........................................\n\n", ncc_consolidated_data.head())

    input("\n\t\tPress Enter to continue...")
    clear_output()

    print ("\n --- DATASET VARIABLE TYPES ---\n\n", ncc_consolidated_data.dtypes)
    input("\n\t\tPress Enter to continue...")
    clear_output()

    return ncc_consolidated_data

# Only some of the columns are of interest.
# Function to drop the unnecessary columns.
def drop_columns(ncc_consolidated):
    """Return a copy of the frame without the columns the analysis ignores.

    The columns ``ID``, ``Currency Unit``, ``Contract Description``,
    ``Dollar Amount``, ``Comments``, ``Additional Comments`` and ``Column 13``
    are removed when present; every other column is kept in its original
    order. The input frame is left untouched.
    """
    unwanted = [
        'ID',
        'Currency Unit',
        'Contract Description',
        'Dollar Amount',
        'Comments',
        'Additional Comments',
        'Column 13',
    ]
    keep_mask = ~ncc_consolidated.columns.isin(unwanted)
    return ncc_consolidated.loc[:, keep_mask].copy()


# Fill in missing values with the most frequent occurrences.
def fill_missing_data(ncc_data_sample):
    """Return a copy of the frame with missing categorical values filled.

    Each listed column gets a fixed replacement for its missing entries
    (intended to be that column's most frequent value):

    * ``Procurement Method`` -> ``"ST"``
    * ``Government Agency``  -> ``"National Works Agency"``
    * ``Contractor``         -> ``"SURREY PAVING & AGGREGATE CO. LTD"``
    * ``Fund``               -> ``"GOJ"``

    The previous version had the ``Procurement Method`` and ``Contractor``
    fill values swapped, which wrote a company name into the procurement
    method column and a procurement code into the contractor column.
    Columns that are absent from the frame are ignored by ``fillna``.
    """
    fill_values = {
        "Procurement Method": "ST",
        "Government Agency": "National Works Agency",
        "Contractor": "SURREY PAVING & AGGREGATE CO. LTD",
        "Fund": "GOJ",
    }
    return ncc_data_sample.fillna(fill_values)

# Displays out all result.
def display_all(df):
    """Display ``df`` with temporarily widened pandas display limits.

    For the duration of the call up to 6091 rows and 11 columns are rendered;
    the previous pandas display options are restored when the context exits.
    """
    widened_limits = ("display.max_rows", 6091, "display.max_columns", 11)
    with pd.option_context(*widened_limits):
        display(df)

# Function for encoding the data set.
def create_label_encoder_dict(input_df):
    """Fit one ``LabelEncoder`` per non-numeric column of ``input_df``.

    Returns a dict mapping each non-numeric column name to the encoder fitted
    on that column's values. Numeric columns are skipped, so an all-numeric
    frame yields an empty dict.
    """
    return {
        name: LabelEncoder().fit(input_df[name])
        for name in input_df.columns
        # Only categorical (non-numeric) columns need an encoder.
        if not np.issubdtype(input_df[name].dtype, np.number)
    }


# Function to show encoding of categorical values.
def print_label_endoder(ncc_data_encoded):
    """Print, one column at a time, how each category maps to its integer code.

    ``ncc_data_encoded`` maps column names to fitted encoders exposing a
    ``classes_`` attribute. For every entry the classes are printed, followed
    by a table pairing each class with its position; the function then waits
    for Enter and clears the cell output before moving to the next column.
    """
    summary_label = '[******************* ENCODED VALUES SUMMARY *******************]'
    star_rule = "*"*85

    print("\t\t\t --- ENCODED VALUES FOR EACH LABEL ---\n")
    print("="*85)

    for column in ncc_data_encoded:
        classes = ncc_data_encoded[column].classes_

        print(star_rule)
        print('Encoder(%s) = %s' % (column, classes))
        print("\n")

        # A single row of codes 0..n-1 labelled by class, transposed so each
        # class becomes a row with its code beside it.
        code_table = pd.DataFrame(
            [range(0, len(classes))],
            columns=classes,
            index=[summary_label],
        )
        print(code_table.T)

        input("\n\t\tPress Enter to continue here...")
        clear_output()
        
    
    
# Function to transform the dataset into encoded numeric values.
def trans_data(ncc_data_model, ncc_data_encoded):
    """Replace categorical columns with their encoded values, in place.

    Every column of ``ncc_data_model`` that has an entry in
    ``ncc_data_encoded`` is overwritten with that encoder's ``transform``
    output. A preview of the result is printed and the same (mutated) frame
    object is returned.
    """
    encodable = [name for name in ncc_data_model.columns if name in ncc_data_encoded]
    for name in encodable:
        encoder = ncc_data_encoded[name]
        ncc_data_model[name] = encoder.transform(ncc_data_model[name])

    print("\t--- TRANSFORMED DATASET | WITH UNIQUE ENCODED VALUES ---")
    for _ in range(2):
        print("*"*70)
    preview = pd.DataFrame(ncc_data_model).head()
    print(preview)

    return ncc_data_model


def dataColumns(ncc_data_model):
    """Select the six modelling columns (features plus target) from the frame.

    Returns ``Date``, ``Government Agency``, ``Funding Agency``,
    ``Contractor``, ``Procurement Method`` and ``Jamaican Equivalent`` in that
    order.
    """
    model_columns = [
        'Date',
        'Government Agency',
        'Funding Agency',
        'Contractor',
        'Procurement Method',
        'Jamaican Equivalent',
    ]
    return ncc_data_model[model_columns]


def axis(ncc_data_model):
    """Split the model frame into independent (X) and dependent (Y) variables.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is a DataFrame holding ``Date``,
        ``Government Agency``, ``Contractor``, ``Procurement Method`` and
        ``Jamaican Equivalent``, and ``Y`` is the ``Funding Agency`` Series.
    """
    X = ncc_data_model[['Date', 'Government Agency', 'Contractor', 'Procurement Method', 'Jamaican Equivalent']]
    Y = ncc_data_model['Funding Agency']

    # The previous version returned the undefined lowercase names ``x, y``,
    # which raised NameError on every call.
    return X, Y
    
# Function to split the dataset 
def splitdataset(ncc_data_model, test_size=0.30, random_state=None):
    """Separate features from the target and split them into train/test sets.

    Parameters
    ----------
    ncc_data_model : pandas.DataFrame
        Frame containing the feature columns and ``Funding Agency``.
    test_size : float, optional
        Share of rows held out for testing. Defaults to ``0.30`` (70% training
        and 30% test), the value that was previously hard-coded.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an integer for a reproducible split.

    Returns
    -------
    tuple
        ``(X, Y, X_train, X_test, y_train, y_test)``.
    """
    # Independent variables (X) and the dependent variable (Y).
    X = ncc_data_model[['Date', 'Government Agency', 'Contractor', 'Procurement Method', 'Jamaican Equivalent']]
    Y = ncc_data_model['Funding Agency']

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    return X, Y, X_train, X_test, y_train, y_test


# Function to show the X and Y data axes.
def print_axis(X_data, Y_data):
    """Print the first rows of the feature data and of the target data."""
    labelled = (
        ("SHOW X : \n", X_data),
        ("\nSHOW Y :\n", Y_data),
    )
    for heading, values in labelled:
        print(heading, values.head())

# Function to validate the percentage split.
def percentage_spit(X_train, total_rows=6091, train_fraction=0.7):
    """Print the training-set shape next to the expected training row count.

    Parameters
    ----------
    X_train : object with a ``shape`` attribute
        The training features whose shape is reported.
    total_rows : int, optional
        Number of rows in the full dataset. Defaults to ``6091``, the figure
        that was previously hard-coded.
    train_fraction : float, optional
        Share of rows expected in the training set. Defaults to ``0.7``.
    """
    print("DATA MODEL SPLIT : ", X_train.shape)
    print("CALCULATED FIGURE : ", total_rows * train_fraction)
    
    
# Function to perform training with entropy. 
def tarin_using_entropy(X_train, y_train, random_state=None):
    """Fit a depth-3 decision tree that uses the entropy split criterion.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed straight to ``fit``.
    random_state : int or None, optional
        Seed for the classifier. Because ``splitter='random'`` is used, the
        fitted tree varies between runs unless a seed is given. The default
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    clf_entropy = DecisionTreeClassifier(
        criterion='entropy',
        splitter='random',
        max_depth=3,
        min_samples_leaf=2,
        min_samples_split=2,
        random_state=random_state,
    )
    clf_entropy.fit(X_train, y_train)

    return clf_entropy


# Function to perform training with giniIndex. 
def train_using_gini(X_train, y_train, random_state=None):
    """Fit a depth-3 decision tree that uses the Gini impurity criterion.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed straight to ``fit``.
    random_state : int or None, optional
        Seed for the classifier. Because ``splitter='random'`` is used, the
        fitted tree varies between runs unless a seed is given. The default
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    clf_gini = DecisionTreeClassifier(
        criterion='gini',
        splitter='random',
        max_depth=3,
        min_samples_split=2,
        random_state=random_state,
    )
    clf_gini.fit(X_train, y_train)

    return clf_gini



# Function to show the important features in the data model.
def sig_features(clf, x_data):
    """Print each feature's importance in the fitted tree as a percentage.

    ``clf.feature_importances_`` is scaled by 100 and formatted with two
    decimals; rows are labelled with the column names of ``x_data``.
    """
    percentages = clf.feature_importances_ * 100
    formatted = ["%.2f%%" % value for value in percentages]
    importance_table = pd.DataFrame(
        formatted,
        index=x_data.columns,
        columns=['Feature Significance in Decision Tree'],
    )
    print(importance_table)

 
 
# Function to visualize Decision Tree Classification.
def tree_visualization(ncc_data_encoded, clf_obj, X_data, Y_data):
    """Export the fitted tree via ``tree.export_graphviz`` and return the result.

    Feature names come from the columns of ``X_data``; class names come from
    the encoder stored in ``ncc_data_encoded`` under the name of ``Y_data``.
    ``out_file=None`` is passed to the exporter, so nothing is written to a
    file by this call.
    """
    target_encoder = ncc_data_encoded[Y_data.name]
    export_options = dict(
        out_file=None,
        feature_names=X_data.columns,
        class_names=target_encoder.classes_,
        filled=True,
        rounded=True,
        proportion=True,
        node_ids=True,
        special_characters=True,
    )
    return tree.export_graphviz(clf_obj, **export_options)
    
    

# Function to make predictions 
def prediction(X_test, clf_object, y_test):
    """Predict labels for ``X_test`` and print a hit/miss tally against ``y_test``.

    Parameters
    ----------
    X_test
        Features to predict on.
    clf_object
        Any fitted classifier exposing ``predict``.
    y_test : pandas.Series
        True labels; compared element-wise with the predictions.

    Returns
    -------
    The array of predicted labels returned by ``clf_object.predict``.
    """
    # Predict once and reuse the result (the previous code ran predict twice).
    y_pred = clf_object.predict(X_test)
    y_pred_cnt = (y_pred == y_test)

    print("Predicted values : ", y_pred)
    print("\nPredicted values count : ", y_pred_cnt.value_counts())

    return y_pred


# Function to calculate accuracy.
def cal_accuracy(y_test, y_pred):
    """Print the confusion matrix, accuracy percentage and classification report.

    The accuracy is ``accuracy_score`` multiplied by 100 and rounded to two
    decimals for display. Nothing is returned.
    """
    accuracy_pct = accuracy_score(y_test, y_pred) * 100

    matrix = confusion_matrix(y_test, y_pred)
    print("\nCONFUSION MATRIX: \n", matrix)

    print("\n\nACCURACY USING DECISION TREE : ", round(accuracy_pct, 2), "%")

    report = classification_report(y_test, y_pred)
    print("\n\nCLASSIFICATION REPORT : \n", report)
    
    
# Function use to plot confusion matrix.   
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix.

    Normalization can be applied by setting `normalize=True`; each row is then
    divided by its row sum before printing and plotting.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix (rows are true labels, columns predicted).
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, optional
        Whether to row-normalize ``cm``.
    title : str, optional
        Plot title.
    cmap : matplotlib colormap, optional
        Colormap used by ``imshow``.
    """
    # The docstring must be the first statement of the function; it used to
    # follow this import and was therefore not attached as ``__doc__``.
    import itertools

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix :\n\n")
    else:
        print('Confusion matrix, without normalization :\n\n')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell; use white text on dark cells for readability.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# Transforms features by scaling each feature to a given range.
def create_min_max_scaler_dict(df):
    """Fit one ``MinMaxScaler`` per numeric column of ``df``.

    Returns a dict mapping each numeric column name to the scaler fitted on
    that single column. Non-numeric columns are skipped.
    """
    from sklearn.preprocessing import MinMaxScaler

    return {
        name: MinMaxScaler().fit(pd.DataFrame(df[name]))
        for name in df.columns
        # Scaling only applies to numeric columns.
        if np.issubdtype(df[name].dtype, np.number)
    }


# Function for pandas dataframe columns scaling.
def scalers_dataframe(dataset):
    """Summarise fitted min-max scalers as a table of column, min, max and range.

    Parameters
    ----------
    dataset : dict or other
        When a dict, it is taken as the ``{column: fitted scaler}`` mapping to
        summarise (for example the result of ``create_min_max_scaler_dict``).
        For any other argument the function falls back to the module-level
        ``min_max_scalers`` mapping, as the original code did.
        NOTE(review): ``min_max_scalers`` is not defined in the visible part
        of the notebook - confirm it exists before relying on the fallback.

    Returns
    -------
    pandas.DataFrame
        One row per scaler with columns ``column``, ``min``, ``max`` and
        ``range``. The previous version built the frame but never returned it,
        and read ``dataset_min_``/``dataset_max_``/``dataset_range_``, which
        ``MinMaxScaler`` does not define (the fitted attributes are
        ``data_min_``, ``data_max_`` and ``data_range_``).
    """
    scalers = dataset if isinstance(dataset, dict) else min_max_scalers

    scalers_df = pd.DataFrame(
        [
            {
                'column': col,
                'min': scaler.data_min_[0],
                'max': scaler.data_max_[0],
                'range': scaler.data_range_[0],
            }
            for col, scaler in scalers.items()
        ],
        # Explicit columns keep the schema stable when there are no scalers.
        columns=['column', 'min', 'max', 'range'],
    )

    return scalers_df
#-------------------------------------- Function ends. ------------------------------------------#

In [3]:
# Menu display options.

def main_option_menu():
    """Print the boxed main menu (options 0: exit, 1: initialize dataset)."""
    menu_rows = (
        "\n\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
        "\n\t\t~~                                                        ~~",
        "\n\t\t~~      DATA WAREHOUSE & DATA MINING VISUALIZATION        ~~",
        "\n\t\t~~                      (DWDM)                            ~~",
        "\n\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
        "\n\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
        "\n\t\t~~                                                        ~~",
        "\n\t\t~~          ->0. EXIT                                     ~~",
        "\n\t\t~~          ->1. INITIALIZE DATASET                       ~~",
        "\n\t\t~~                                                        ~~",
        "\n\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
    )
    # Joining with a newline reproduces the line break that each separate
    # print call would have emitted after its row.
    print("\n".join(menu_rows))
#----------------------------------------------------------------------------------------------------------------------#
   
    
def sub_menu():
    """Print the boxed model-building sub-menu (options 0 through 3)."""
    menu_rows = (
        "\n\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
        "\n\t\t~~                                                        ~~",
        "\n\t\t~~      DATA WAREHOUSE & DATA MINING VISUALIZATION        ~~",
        "\n\t\t~~               (BUILD DATA MODEL)                       ~~",
        "\n\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
        "\n\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
        "\n\t\t~~                                                        ~~",
        "\n\t\t~~          ->0. RETURN TO MAIN                           ~~",
        "\n\t\t~~          ->1. DECISION TREE CLASSIFICATION             ~~",
        "\n\t\t~~          ->2. NEURAL NETWORK CLASSIFICATION            ~~",
        "\n\t\t~~          ->3. DATASET VISUALIZATIONS                   ~~",
        "\n\t\t~~                                                        ~~",
        "\n\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~",
    )
    # Joining with a newline reproduces the line break that each separate
    # print call would have emitted after its row.
    print("\n".join(menu_rows))
#----------------------------------------------------------------------------------------------------------------------#

In [4]:
# Function main application menu.
def app_menu():      
    try:
        clear_output()
        loop = True  # Condition Initialis    
        
        while loop:  # While loop which will keep going until loop condition is False
            main_option_menu()
            
            print('\n\n')
            # Accept user input for option here.
            choice = str(input("\t\tEnter your choice [0 & 1]: "))
            clear_output()
            
           #................................................................................................................#
            if choice=='0':  
                clear_output()
                print ("\n\n\t\t[--- APPLICATION CLOSED ---]")
                time.sleep(3)
                loop=False # This will make the while loop to end as not value of loop is set to False
           #................................................................................................................#
            elif choice=='1':  
                print ("\n\n[---  INITIALIZE DATASET   ---]")
                # Load and store dataset to variable.
                ncc_consolidated = import_data_source()

                # Strip leading and trailing space from dataset.
                ncc_consolidated['Fund'] = ncc_consolidated['Fund'].str.strip()
                ncc_consolidated['Government Agency'] = ncc_consolidated['Government Agency'].str.strip()
                ncc_consolidated['Contractor'] = ncc_consolidated['Contractor'].str.strip()
                ncc_consolidated['Procurement Method'] = ncc_consolidated['Procurement Method'].str.strip()

                # Remove multiple spaces between two strings
                ncc_consolidated = ncc_consolidated.replace('\s+', ' ', regex=True)
                ncc_consolidated['Procurement Method'] = ncc_consolidated['Procurement Method'].replace('\s+', ' ', regex=True)
                ncc_consolidated['Procurement Method'] = ncc_consolidated['Procurement Method'].str.replace(" ","")

                # Rename Procurement Method duplicate names data column values.
                ncc_consolidated['Procurement Method'].replace(['*', 'DC*','', ' '], 'DC')
                ncc_consolidated['Procurement Method'].replace(['*', 'DC*'], 'DC')

                # Rename Government Agency duplicate names data column values.
                ncc_consolidated.rename(index={'University Hospital of  the West indies': 'University Hospital of the West Indies, Mona'})
                ncc_consolidated.rename(index={'University Hospital of the West Indies': 'University Hospital of the West Indies, Mona'})
                ncc_consolidated.rename(index={'University Hospital of  the West indies (UHWI)': 'University Hospital of the West Indies, Mona'})

                # Rename Government Agency duplicate names data column values.
                ncc_consolidated['Government Agency'].replace('University of Technology, Jamaica,  Jamaica', 'University of Technology, Jamaica (UTECH)') 
                ncc_consolidated['Government Agency'].replace('University of Technology, Jamaica	', 'University of Technology, Jamaica (UTECH)') 
                ncc_consolidated['Government Agency'].replace('University of Technology,Jamaica', 'University of Technology, Jamaica (UTECH)') 
                ncc_consolidated['Government Agency'].replace('University of Technology, Jamaica', 'University of Technology, Jamaica (UTECH)') 


                # Drop unnecessary columns in dataset.
                ncc_data = drop_columns(ncc_consolidated)

                ncc_data_sample = ncc_data.copy()

                print("\n")
                ncc_data_sample.describe()

                ncc_data_sample.dtypes

                ncc_data_sample.head(6)

                # Look for missing dataset column values.
                print ("--- COUNT MISSING DATA --- \n")
                print(ncc_data_sample.isnull().sum())
                input("\n\t\tPress Enter to continue...")
                clear_output()
                
                # This method prints information about the DataFrame.
                print ("\n[----  DATA SET INFORMATION  ----]\n")
                ncc_data_sample.info()
                input("\n\t\tPress Enter to continue...")
                clear_output()

                # Fill in missing values with the most frequent occurrences.
                ncc_clean_data = fill_missing_data(ncc_data_sample)
                
                print (" --- VERIFY MISSING DATA FILLED IN --- \n")
                print(ncc_clean_data.isnull().sum())

                input("\n\t\tPress Enter to continue...")
                clear_output()
                
                # Returns the column headings to indicate the dataset that is working on.
                print ("\n\n[---- DISPLAY DATASET AFTER DROPPING UNECESSARY COLUMNS ----]")
                # display the dataframe head
                print ("[...........................................................]\n\n", ncc_clean_data.head())
                input("\n\t\tPress Enter to continue...")
                clear_output()

                # Create target variable criterion.
                ncc_clean_data["Funding Agency"]  = ncc_clean_data["Fund"].apply(lambda col_val: 'GOJ Funding' if col_val == 'GOJ' else 'Non-GOJ Funding')
                ncc_clean_data["Funding Agency"].head()

                # -------------------------------------------------#
                # Save clean dataset results to a file for future access.
                # -------------------------------------------------#
                ncc_clean_data.to_csv('ncc_consolidated_clean_file.csv', encoding='utf-8')


                # To get whole EDA (Exploratory Data Analysis) using pandas_profiling.
                #ProfileReport(ncc_clean_data)

                ncc_data_df = ncc_clean_data.copy()

                # Make copy of the sanitized data to build model.
                data_sample = ncc_clean_data.copy()
                dataset = ncc_clean_data.copy()


                # The ncc_data_sample is categorial so I convert it with LabelEncoder to transfer to ordinal.
                ncc_data_encoded = create_label_encoder_dict(ncc_clean_data)
                print('\n\n')


                # Used to convert categorical data, or text data, into numbers, which our predictive models can better understand.
                print_label_endoder(ncc_data_encoded)
                

                # Transform dataset with encoding.
                ncc_model = trans_data(data_sample, ncc_data_encoded)
                    
                # Make a copy of encoded data for Nearal Network model.
                model_data = ncc_model.copy()
                
                input("\n\t\tPress Enter to continue...")
                clear_output()
                
                # Give summary of the most frequent occurrence in the dataset.
                print('\t[-------- SUMMARY OF DATASET FREQUENCY --------]')
                top = display_all(ncc_clean_data.describe(include='all').T)
                print('\n')

                # Coontacts awarded on UTech behalf.
                filter_data = ncc_clean_data[["Date", "Government Agency", "Jamaican Equivalent", 'Funding Agency']]
                filter = filter_data["Government Agency"] == "University of Technology, Jamaica"

                # Print only filter columns
                # filter_data.where(filter).dropna()

                #................................................. Visualizations ......................................#
                # Contacts awarded on UWI behalf.
                filter_df = ncc_clean_data[["Date", "Government Agency", "Jamaican Equivalent", 'Funding Agency', 'Procurement Method']]
                filter = filter_df["Government Agency"] == "University of the West Indies"

                # Print only filter columns
                # filter_data.where(filter).dropna()

                avg = filter_data['Jamaican Equivalent'].mean() 
                # print('\n\n')
                
                # View above average contract costs awarded in 2013.
                above_avg_contracts = filter_data[(filter_data['Jamaican Equivalent'] > avg)] 
                # print("View above average contract costs awarded in 2013",above_avg_contracts)
                input("\n\t\tPress Enter to continue...")
                clear_output()
            
                # Compare UTECH VS UWI Funding.
                school = []
                for name in filter_data["Government Agency"]:
                    if name == "University of the West Indies":
                        school.append("UWI")
                    elif name == "University of Technology, Jamaica" or name == "University of Technology, Jamaica (UTECH)":
                        school.append("UTECH")
                    else:
                        school.append("Non-University")

                filter_data["University"] = school
                filter_data
                
                
                # .................................................................................................................. #
                # .................................................................................................................. #
                print("\t\t\t\t---------- DATA VISUALIZATIONS ------------")
                print("\n\t\t\t\t=========================================")
                
                # You can set bins with nbinsx and nbinsy
                # Multiple plots
                #px.violin(ncc_clean_data, y="Procurement Method", x="Jamaican Equivalent", color="Fund", box=True, points="all",
                          #hover_data=ncc_clean_data.columns)
                    
                # Now .groupby method is used to aggregate Jamaican Equivalent by date as well as sum Jamaican Equivalent per day.
                cost_by_date = ncc_clean_data.groupby('Date')['Jamaican Equivalent'].sum()
                cost_by_date.iplot(kind='scatter', title='Jamaican Equivalent JMD($) per Month')
                input("\n\t\tPress Enter to continue...")
                clear_output()

                
                # Distribution of University Funding Compare to Non-University Funding
                school_funding = filter_data["University"].value_counts()
                school_funding.iplot(kind='bar', xTitle='University vs Non-University Funding',
                                  yTitle='Frequency', title='University Funding Compares to Non-University Funding')
                input("\n\t\tPress Enter to continue...")
                clear_output()
                
                # plotting the figure
                fig = px.scatter_3d(ncc_clean_data, x="Procurement Method", y="Funding Agency", z="Jamaican Equivalent", color='Fund',
                                   title='3D Correlation between Jamaican Equivalent JMD($), Procurement Method & Funding Agency')

                fig.show()
                
                input("\n\t\tPress Enter to continue...")
                clear_output()
                
                
                # Graph showing the 10 most costly contracts that were awarded. 
                ncc_clean_data.iplot(kind="line", theme="white", x="Date", y="Jamaican Equivalent", xTitle='Year', 
                                     yTitle='Jamaican Equivalent', categories="Funding Agency", title='Jamaican Equivalent JMD($) per Day Group by Agency')
                
                input("\n\t\tPress Enter to continue...")
                clear_output()
                
                # Contracts Awarded per Quarter.
                ncc_clean_data["Date"].iplot(kind='hist', xTitle='Date',
                                  yTitle='No. of Contracts', title='No. of Contracts Awarded per Quarter')
                input("\n\t\tPress Enter to continue...")
                clear_output()

                # Procurement Method used for tendering for the contracts
                ncc_clean_data["Procurement Method"].iplot(kind='hist', xTitle='Procurement Method',
                                  yTitle='No. of Contracts', title='Procurement Method Distribution for Contracts')
                print('\n\n')

                proc_count = ncc_clean_data["Procurement Method"].value_counts()
                
                input("\n\t\tPress Enter to continue...")
                clear_output()
                
                """
                plt.figure(figsize=(16,8))
                sns.set(style="darkgrid")
                sns.barplot(proc_count.index, proc_count.values, alpha=0.5)
                plt.title("Frequency Distibution of Procurement Method\n")
                plt.xlabel("Procurement Method", fontsize=12)
                plt.ylabel("Number of Occurrences", fontsize=12)
                input("\n\t\tPress Enter to continue...")
                clear_output()
                """

                ncc_clean_data["Fund"].iplot(kind='hist', xTitle='Fund',
                                  yTitle='Contracts', title='No. of Contracts Funded by Agency')

                jmd = ncc_data_sample['Jamaican Equivalent'].astype('float')
                jmd.plot.hist(subplots=False, layout=(2,2), figsize=(16,14))

                # Top (5) cost figures awarded Government contract values.
                fig_jmd = pd.value_counts(jmd.values, sort=True).nlargest(5).plot(kind="bar",figsize=(12,7))
                plt.ylabel("Frequency")
                plt.xlabel("Jamaican Equivalent JMD($)")
                plt.title("\nTop (5) Highest for Awarded Government Contracts\n") 
                
                input("\n\t\tPress Enter to continue...")
                clear_output()


                print("\n--------- DATA VISUALIZATION CONTINUES ----------")
                print("-------------------------------------------------")

                g_agency = ncc_clean_data["Government Agency"].str.strip()

                fig1 = pd.value_counts(g_agency.values, sort=True).nlargest(10).plot(kind="bar",figsize=(12,8))
                plt.ylabel("Contracts")
                plt.xlabel("Government Agency Contract Awarded")
                plt.title("\nTen (10) Most Awarded Government Agency\n") 
                print('\n\n')

                # Top eight (8) most used contractors for Goverment contract.
                contractor = pd.value_counts(ncc_clean_data["Contractor"].values, sort=True).nlargest(8)

                labels = contractor.index
                values = contractor

                # pull is given as a fraction of the pie radius
                fig = go.Figure(data=[go.Pie(title_text="Eight (8) Most used Contractors\n",
                                             labels=labels, values=values, pull=[0.25, 0.25, 0.2, 0.15, 0.15, 0.1, 0.0, 0.0])])
                fig.show()
                print('\n\n')

                # Six (6) Means Cost and Gov Agency.
                group_data = ncc_clean_data 
                group_data["Cost"] = jmd
                group_data.groupby('Government Agency').Cost.mean().sort_values(ascending=True)[:6].plot.bar()
                plt.ylabel("JMD ($)")
                plt.title("\n\nSix (6) Means Cost & Gov Agency\n") 
                print('\n\n')

                # Six (6) Means Contract Cost & Funding Agency.
                group_data = ncc_clean_data 
                group_data["Cost"] = jmd
                group_data.groupby('Fund').Cost.mean().sort_values(ascending=True)[:6].plot.bar()
                plt.ylabel("JMD ($)")
                plt.title("\n\nEight (8) Means Contract Cost & Funding Agency\n") 

                # Ten (10) Miminum Contract Cost & Procurement Method used.
                group_data = ncc_clean_data 
                group_data["Cost"] = jmd
                group_data.groupby('Procurement Method').Cost.min().sort_values(ascending=True)[:10].plot.bar()
                plt.ylabel("JMD ($)")
                plt.title("\n\nTen Minimum  Contract Cost & Procurement Method used\n") 

                # Fifteen (15) Maximum Contract Cost & Dates Awarded.
                group_data = ncc_clean_data 
                group_data["Cost"] = jmd
                group_data.groupby('Date').Cost.max().sort_values(ascending=True)[:15].plot.bar()
                plt.ylabel("JMD ($)")
                plt.title("\n\nFifteen (15) Maximum Contract Cost & Dates Awarded") 
                input("\n\t\tPress Enter to return to main menu...")
                clear_output()
                
                # Visualization to identify features that have a linear relationship with each other.
                ncc_clean_data .pivot(columns='Procurement Method', values='Jamaican Equivalent').iplot(
                kind='box',
                yTitle='Jamaican Equivalent',
                title='Procurement Method used Jamaican Equivalent JMD($) Contract Cost')
                
                input("\n\t\tPress Enter to return to main menu...")
                clear_output()
                
                # -------------------------- Data Building ------------------------------------------------------------------ #
                # ----------------------------------------------------------------------------------------------------------- #
                inner_loop=True  
                # Ensure data model is before data visualization.
                flag = False
        
                while inner_loop:
                    sub_menu()
                    print('\n\n')
                   
                    # Accept user input for option here.
                    opt = str(input("\t\tEnter your choice [0 - 3]: "))
                    #................................................................................................................#
                    if opt=='0':  
                        clear_output()
                        print("\n\t\tReturning to main menu...")
                    
                        time.sleep(2)
                        clear_output()
                        inner_loop=False # This will make the while loop to end as not value of loop is set to False
                    #..........................................................................................................# 
                    #................................................................................................................#
                    elif opt=='1': 
                        
                        clear_output()
                        print("\n\t\tDECISION TREE CLASSIFICATION")
                        print("\t\t----------------\n")
                        
                        # Building model phase. 
                        # Splitting dataset into test and train data.
                        X_data, Y_data, X_train, X_test, y_train, y_test = splitdataset(ncc_model) 

                        
                        # Show Y and X data axises.
                        print_axis(X_data, Y_data)
                        print("\n")
                        
                        # Validates percetage split.
                        percentage_spit(X_train)
                        
                        input("\n\t\tPress Enter to continue...")
                        clear_output()
                        
                        # Create the classifier with a maximum depth of 2 using entropy as the criterion for choosing most 
                        #significant nodes to build the tree.
                        # Ensure the model is not overfitted "min_samples_split"
                        clf_entropy = tarin_using_entropy(X_train, y_train)
                        
                        clf_gini = train_using_gini(X_train, y_train)
    
                        print("\n SIGNIFIGCANT RESULTS USING ENTROPY") 
                        sig_features(clf_entropy, X_data)
                        print("\n")
                        
                        print("\n SIGNIFIGCANT RESULTS USING GINI") 
                        sig_features(clf_gini, X_data)
                        print("\n")
                        
                        input("\n\t\tPress Enter to continue...")
                        clear_output()
                        
                        
                        print("\nRESULTS USING ENTROPY") 
                        print("-----------------------") 
                        print("-----------------------\n") 
                        # Prediction using entropy. 
                        # Determine how many were predicted correctly.
                        y_pred_entropy = prediction(X_test, clf_entropy, y_test)  
                        cal_accuracy(y_test, y_pred_entropy) 
                        
                        input("\n\t\tPress Enter to continue...")
                        clear_output()
                        
                        print("\nRESULTS USING GINI")
                        print("------------------") 
                        print("------------------\n") 
                        # Prediction using gini. 
                        # Determine how many were predicted correctly.
                        y_pred_gini = prediction(X_test, clf_gini, y_test)
                        cal_accuracy(y_test, y_pred_gini) 
                        
                        input("\n\t\tPress Enter to continue...")
                        clear_output()
                        
                        # Visualize Decision tree using entropy.
                        entropy_graph = tree_visualization(ncc_data_encoded, clf_entropy, X_data, Y_data)  
                        graphviz.Source(entropy_graph) 
                        
                        # The Confusion matrix compares the actual target values with those predicted by the machine learning model.
                        entropy_cm = confusion_matrix(y_test, clf_entropy.predict(X_test), labels=y_test.unique())
                        entropy_cm
                        
                        plt.figure(figsize=(25,20))
                        plot_confusion_matrix(entropy_cm, ncc_clean_data['Funding Agency'].unique())
                        input("\n\t\tPress Enter to continue...")
                        clear_output()
                        
                        #Print text representation of Plot Tree with plot_tree.
                        print ("\n\n\t\t[---  Print text representation of Plot Tree with plot_tree ---]\n\n")
                        text_representation = tree.export_text(clf_entropy)
                        print(text_representation)
                        print("\n\n")
                        
                        # Visualize Decision using Plot Tree with plot_tree.
                        fig = plt.figure(figsize=(25,20))
                        _ = tree.plot_tree(clf_entropy,
                            feature_names=X_data.columns, 
                            class_names=ncc_data_encoded[Y_data.name].classes_,  
                            filled=True, rounded=True,  proportion=True,
                            node_ids=True)
                        
                        # Used save the figure to the .png file.
                        fig.savefig("decistion_tree.png")
                        
                        # Visualize Decision tree using gini.
                        gini_graph = tree_visualization(ncc_data_encoded, clf_gini, X_data, Y_data)  
                        graphviz.Source(gini_graph)
                        
                        gini_cm = confusion_matrix(y_test, clf_gini.predict(X_test), labels=y_test.unique())
                        pd.DataFrame(gini_cm)  
                        input("\n\t\tPress Enter to continue...")
                        clear_output()
                        
                        # The matrix compares the actual target values with those predicted by the machine learning model.
                        plt.figure(figsize=(8,8))
                        plot_confusion_matrix(gini_cm, ncc_clean_data['Funding Agency'].unique())
                        input("\n\t\tPress Enter to continue...")
                        clear_output()
                        
                        # Validates data model has been built for visualization.
                        flag = True
                        
                    #..........................................................................................................#
                    #................................................................................................................#
                    elif opt=='2': 
                        clear_output()   
                        
                        print("\n\t\tBUILD NEURAL NETWORK CLASSIFICATION MODEL")
                        print("\n\t\t=========================================")
                        print("\n")

                        dataset= ncc_data_df.copy()
                        encoded_model = model_data.copy()

                        # Building model phase. 
                        # Splitting dataset into test and train data.
                        X_features, Y_target, X_train_1, X_test_1, y_train_1, y_test_1 = splitdataset(encoded_model) 


                        # Show Y and X data axises.
                        print_axis(X_features, Y_target)
                        print("\n")


                        # Validates percetage split.
                        print("Validates percetage split : \n")
                        percentage_spit(X_train_1)

                        print("\n\n")

                        # Create an instance of linear regression.
                        reg = MLPClassifier()


                        # Fits a linear model
                        reg.fit(X_train_1, y_train_1)


                        # Number of layers utilized by the model.
                        print("Number of layers in model : ", reg.n_layers_)


                        # Predicting the Test set results
                        test_predicted = reg.predict(X_test_1)
                        test_predicted
                        input("\n\t\tPress Enter to continue...")
                        clear_output()


                        # Determine how many were predicted correctly.
                        print("Determine how many were predicted correctly :")
                        k = (reg.predict(X_test_1) == y_test_1) 
                        pd.DataFrame(k.value_counts())
                        

                        # Count how many were predicted correctly.
                        k.value_counts().iplot(kind='bar', xTitle='Prediction',
                                          yTitle='Count', title='Funding Agency Prediction Distribution')
                        input("\n\t\tPress Enter to continue...")
                        clear_output()


                        # with one slide exploded out
                        explode=(0.1, 0.0)
                        colors = ['gold', 'red']

                        k_count = k.value_counts()
                        k_df = pd.DataFrame({'labels': k_count.index, 'values': k_count.values})

                        k_df.iplot(kind='pie',labels='labels',values='values', colors=colors, title='Neural Network Funding Agency Prediction', hole = 0.5) 
                        input("\n\t\tPress Enter to continue...")
                        clear_output()

                        cm_network=confusion_matrix(y_test_1, reg.predict(X_test_1), labels=y_test_1.unique())
                        cm_network

                        # Confusion Matrix for Funding Agency. 
                        plt.figure(figsize=(6,6))
                        plot_confusion_matrix(cm_network, dataset['Funding Agency'].unique())
                        input("\n\t\tPress Enter to continue...")
                        clear_output()


                        #Evaluation for Neural Network Classification. 
                        cal_accuracy(y_test_1, test_predicted) 
                        print('\n\n')


                        network_accuracy = round((accuracy_score(y_test_1, test_predicted)*100), 1)
                        
                        # Validates data has been processed for visualization.
                        if flag == False:
                            print ("\n\t\t[....  PLEASE BUILD DECISION TREE TO INITIALIZE Y_TEST  ....]")
                            time.sleep(3)
                            clear_output()
                        elif flag == True:
                
                            entropy_accuracy = round((accuracy_score(y_test, y_pred_entropy)*100), 1)
                            gini_accuracy = round((accuracy_score(y_test, y_pred_gini)*100), 1)

                            entropy_accuracy
                            gini_accuracy
                            network_accuracy

                            input("\n\t\tPress Enter to continue...")
                            clear_output()


                            #Result for prediction accuracy of the model.
                            accuracy_results = pd.DataFrame({'Entropy_accuracy':[entropy_accuracy], 
                                                             'Gini_accuracy':[gini_accuracy], 'Network_accuracy':[network_accuracy]}) 

                            accuracy_results.iplot(kind='bar', xTitle='[Entrop] \t\t\t\t\t\t [Gini] \t\t\t\t [Nueral Network]',
                                                   yTitle='Percentage (%)', title='Entrop, Gini & Network Prediction Accuracy Score')

                            input("\n\t\tPress Enter to continue...")
                            clear_output()
                        
                    #..........................................................................................................#  
                    #..........................................................................................................#  
                    elif opt=='3': 
                        # Validates data has been processed for visualization.
                        if flag == False:
                            print ("\n\t\t[....  PLEASE BUILD DECISION TREE MODELS BEFORE VISUALIZTAIONS  ....]")
                            time.sleep(2)
                            clear_output()
                        elif flag == True:
                            clear_output()
                            print("\n\t\tDATASET VISUALIZATIONS ")
                            print("\n\t\t-----------------------")
                            
                            ncc_clean_data.iplot(kind="scatter", theme="white", x="Procurement Method", y="Jamaican Equivalent",
                            xTitle='Procurement Method', yTitle='Jamaican Equivalent', categories="Funding Agency")
                            
                            
                            input("\n\t\tPress Enter to continue...")
                            clear_output()

                            # Visualization to identify a correlation between Government Agency and Funding Agency.
                            #sns.histplot(ncc_clean_data['Fund'], ncc_clean_data['Government Agency'])
                            #sns.jointplot(ncc_clean_data['Fund'], ncc_clean_data['Government Agency'])

                            ncc_clean_data.pivot(columns='Jamaican Equivalent', values='Government Agency').iplot(
                            kind='box',
                            yTitle='Government Agency',
                            title='Jamaican Equivalent Distribution by Government Agency')


                            input("\n\t\tPress Enter to continue...")
                            clear_output()

                            # Visualization to identify a correlation between Government Agency and Funding Agency.
                            contractor_gov = JointPlotVisualizer(feature='Contractor', target='Government Agency')
                            contractor_gov.fit(X_data['Contractor'], X_data['Government Agency'])
                            contractor_gov.fig.suptitle('Correlation between Contractor and Government Agency') 
                            contractor_gov.poof()


                            input("\n\t\tPress Enter to continue...")
                            clear_output()

                            # Graph showing cost distribution of awarded c\ontracts. 
                            plt.hist(ncc_clean_data["Jamaican Equivalent"])
                            plt.title('Cost Distribution of Awarded Contracts')
                            plt.xlabel('JMD ($)')
                            plt.ylabel('Number');

                            input("\n\t\tPress Enter to continue...")
                            clear_output()


                            # Find the most common Procurement Method used by the Goverment Agencies.
                            p_method = ncc_model['Procurement Method']
                            p_method_values = ncc_clean_data['Procurement Method']

                            print("MOST FREQUENT Procurement Method used : ", stats.mode(p_method))
                            print("\n\n")
                            print("MOST FREQUENT Procurement Method used : ", stats.mode(p_method_values))

                            input("\n\t\tPress Enter to continue...")
                            clear_output()

                            # Graph showing the 10 most costly contracts that were awarded. 
                            contract_cost = ncc_model[['Date', 'Government Agency', 'Contractor', 'Jamaican Equivalent']]
                            contract_cost.nlargest(10, "Jamaican Equivalent").plot(kind="line", 
                            x='Contractor', y='Jamaican Equivalent', title="TEN MOST COSTLY CONTRACTS AWARDED", figsize=(10,8))
                            plt.title("Dates from %d to %d" % (contract_cost['Date'].min(), contract_cost['Date'].max()),size=8)
                            plt.suptitle("Contractor per Cost",size=12)
                            plt.ylabel("Jamaican Equivalent")

                            input("\n\t\tPress Enter to continue...")
                            clear_output()

                            # Data for customizinig pie chart.
                            sizes = [225, 130, 245, 210]
                            explode = (0.1, 0, 0.1, 0, 0.1,0.1, 0, 0.1, 0, 0.1)  # explode 1st slice

                            # Graph showing the 10 most costly contracts that were awarded. 
                            contract_cost = ncc_model[['Date', 'Government Agency', 'Contractor', 'Jamaican Equivalent']]
                            contract_cost.nlargest(10, "Jamaican Equivalent").plot.pie(autopct='%2.1f%%', explode=explode,
                            x='Contractor', y='Jamaican Equivalent', title="TEN MOST COSTLY CONTRACTS AWARDED", figsize=(10,8))
                            plt.title("Dates from %d to %d" % (contract_cost['Date'].min(), contract_cost['Date'].max()),size=8)
                            plt.suptitle("Ten(10) Most Costly Contractors per Cost",size=12)
                            plt.ylabel("Jamaican Equivalent")


                            input("\n\t\tPress Enter to continue...")
                            clear_output()


                            # Graph showing the 10 most costly contracts that were awarded group by Procurement Method.
                            gov_agency = ncc_data.nlargest(10, "Jamaican Equivalent")
                            print(gov_agency.boxplot(column='Jamaican Equivalent', by = 'Procurement Method', figsize=(10,8)))


                            input("\n\t\tPress Enter to continue...")
                            clear_output()


                            # Visualization to identify features that have a linear relationship with each other.
                            #correlation = Rank2D(algorithm="pearson")
                            #correlation.fit_transform(X_data)
                            #correlation.poof()


                            input("\n\t\tPress Enter to continue...")
                            clear_output()

                            # Visualization to identify a correlation between Government Agency and Funding Agency.
                            contractor_gov = JointPlotVisualizer(feature='Fund', target='Government Agency')
                            contractor_gov.fit(X_data['Fund'], X_data['Government Agency'])
                            contractor_gov.fig.suptitle('Correlation between Contractor and Government Agency') 
                            contractor_gov.poof()

                            input("\n\t\tPress Enter to continue...")
                            clear_output()
                                                     
                    #..........................................................................................................#               
                    #..........................................................................................................#
                    else:
                        # Any integer inputs other than values 0 - 2 we print an error message
                        print("\n\t\t\tWrong option selection! Please try again...")
                        time.sleep(2)
                        clear_output()
            #..........................................................................................................#
            #..........................................................................................................#   
            else:
                # Any integer inputs other th1an values 0 & 1 we print an error message
                print("\t\t\tWrong option selection! Please try again...")
                time.sleep(2)
                clear_output()
    #----------------------------------------------------------------------------------------------------------------#          
    except IOError:
        print ("*ERROR LOADING APPLICATION!")  
        clear_output()
    #----------------------------------------------------------------------------------------------------------------#          

In [None]:
# Driver code for the main program.
def main():
    """Launch the application by delegating to ``app_menu``."""
    app_menu()

# Invoke the driver only when this code runs as the top-level module.
if __name__ == "__main__":
    main()
## Code ends here.
#----------------------------------------------------------------------------------------------------------------#          



		~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

		~~                                                        ~~

		~~      DATA WAREHOUSE & DATA MINING VISUALIZATION        ~~

		~~               (BUILD DATA MODEL)                       ~~

		~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

		~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

		~~                                                        ~~

		~~          ->0. RETURN TO MAIN                           ~~

		~~          ->1. DECISION TREE CLASSIFICATION             ~~

		~~          ->2. NEURAL NETWORK CLASSIFICATION            ~~

		~~          ->3. DATASET VISUALIZATIONS                   ~~

		~~                                                        ~~

		~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



