# Segmentation Pipeline

In [None]:
# import libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib


In [None]:
# get the input dataset
# Reads the input CSV from a hard-coded absolute path into a DataFrame.
input_data = pd.read_csv('E:/Research/WebApp/input.csv')

In [None]:
# load the model
# Deserialises the persisted model from a hard-coded absolute path.
# NOTE(review): joblib files are pickle-based, so only load files from a trusted source.
rfc = joblib.load('E:/Research/Models/Classifiers/random_forest_classifier_model.pkl')

In [None]:
# Map the verbose survey-question column headers to short identifiers so the
# rest of the pipeline can refer to them conveniently.
header_map = {
    "How likely are you to recommend WSO2 to a friend_ or colleague on a scale from 0 to 10? [0 being not at all likely and 10 being extremely likely]": 'likely_to_recomend',
    "How satisfied are you with the support given by the WSO2 team?": 'satisfaction',
    "Which response best captures the main impact of our product?": 'product_impact',
    "How responsive have we been to your questions or concerns about our products?": 'responsiveness',
}
# Rename in place; headers that are not keys of header_map are left unchanged.
input_data.rename(header_map, axis='columns', inplace=True)

In [None]:
# Exploratory check: list the distinct values of the 'Sub Region' column (result is displayed, not stored).
input_data['Sub Region'].unique()

In [None]:
# select the necessary features
# Keep the response id, the four survey answers and the account columns, then
# drop every row that has a missing value in any of them.
# .dropna() is chained (rather than calling dropna(inplace=True) on the column
# selection) so the result is an independent frame and pandas does not raise a
# SettingWithCopyWarning here.
selected_columns = [
    'ResponseID',
    'likely_to_recomend',
    'satisfaction',
    'responsiveness',
    'product_impact',
    'Account Name',
    'Sales Region',
]
input_data = input_data[selected_columns].dropna()

In [None]:
# Ordinal codes for the three categorical survey answers.
h1_map = {"Excellent":5,"Good":4,"Okay":3,"Bad":2,"Terrible":1}
h2_map = {"Excellent":4,"Good":3,"OK":2,"Slow":1}
h3_map = {"Many of the above":9,"High Quality":8,"Scalable":7,"Value for Money":6,"Useful":5,"Reliable":4,"Secure":3,"Unique":2,"None of the above":1}

# Replace each raw answer column with an 'encoded_<name>' column, one at a
# time, so the encoded columns are appended in this order. Answers that are
# not keys of the corresponding map become NaN.
for raw_column, answer_codes in (
    ('satisfaction', h1_map),
    ('responsiveness', h2_map),
    ('product_impact', h3_map),
):
    input_data['encoded_' + raw_column] = input_data[raw_column].map(answer_codes)
    input_data = input_data.drop(columns=[raw_column])

# One-hot encode the product-impact codes into integer indicator columns.
# NOTE(review): get_dummies only creates columns for codes present in this
# input, so the resulting feature set may differ from the one the model was
# trained on - confirm against the training pipeline.
encoded_new_d = pd.get_dummies(input_data, columns=['encoded_product_impact'], dtype=int)
input_data = encoded_new_d

In [None]:
# prediction
# The classifier is fed every column except the identifier/account columns.
# The feature frame is named `model_input` (not `input`) so that Python's
# builtin input() is not shadowed.
model_input = input_data.drop(columns=['ResponseID', 'Account Name', 'Sales Region'])
y_pred = rfc.predict(model_input)

In [None]:
# Spot-check a single prediction (position 188).
# NOTE(review): hard-coded index - assumes there are at least 189 predictions.
y_pred[188]

In [None]:
# assign prediction to dataset
# The predictions are stored as a new column; the padded column name is the key
# that the later aggregation step looks up, so it must be kept exactly as is.
input_data['-        Health        -'] = y_pred 

In [None]:
# Keep only the account name, predicted health label and sales region for the per-account aggregation.
output_data = input_data[['Account Name','-        Health        -','Sales Region']]
def aggregate_Health(output_data):
    """Aggregate per-response health labels into one label per account.

    Each label in the '-        Health        -' column is converted to a
    numeric score (Good=4 ... At risk=1), the scores are averaged per
    'Account Name', the mean is rounded to the nearest whole score and
    converted back to a label.

    Args:
        output_data: DataFrame with the columns 'Account Name',
            '-        Health        -' and 'Sales Region'. It is not modified.

    Returns:
        A new DataFrame with one row per distinct account, in order of first
        appearance, and the columns 'Account Name',
        '-        Health        -' (aggregated label, NaN when no label of the
        account could be scored) and 'Sales_Region' (the region of the
        account's first row).
    """
    health_col = '-        Health        -'
    label_to_score = {"Good": 4, "Need improvement": 3, "Need more attention": 2, "At risk": 1}
    score_to_label = {score: label for label, score in label_to_score.items()}

    # Work on a private copy so the caller's frame keeps its original labels.
    scored = output_data[['Account Name', health_col, 'Sales Region']].copy()
    # Labels that are not in label_to_score become NaN and are ignored by mean().
    scored[health_col] = scored[health_col].map(label_to_score).astype(float)

    # One pass over the data instead of re-filtering the frame per account.
    # sort=False keeps the accounts in order of first appearance.
    mean_score = scored.groupby('Account Name', sort=False)[health_col].mean().round(0)

    # The first row of each account supplies its sales region.
    first_rows = scored.drop_duplicates(subset='Account Name')
    account_labels = first_rows['Account Name'].map(mean_score).map(score_to_label)

    return pd.DataFrame({
        'Account Name': first_rows['Account Name'].to_numpy(),
        health_col: account_labels.to_numpy(),
        'Sales_Region': first_rows['Sales Region'].to_numpy(),
    })

# Build the per-account health summary from the per-response predictions.
a= aggregate_Health(output_data)

In [None]:
# Display the per-account summary returned by aggregate_Health.
a

In [None]:
# List the distinct sales regions present in the per-account summary (displayed, not stored).
a['Sales_Region'].unique().tolist()

# Health Score pipeline

In [None]:
# import libraries
import pandas as pd

In [None]:
# get the input dataset
# NOTE(review): the name `input` shadows Python's builtin input() from here on.
input = pd.read_csv('E:/Research/WebApp/input.csv')
# `input_data` is bound to the same DataFrame object as `input` (no copy is
# made), so in-place changes made through one name are visible through the
# other until one of the names is rebound.
input_data = input

In [None]:
# Map the verbose survey-question column headers to short identifiers.
header_map = {
    "How likely are you to recommend WSO2 to a friend_ or colleague on a scale from 0 to 10? [0 being not at all likely and 10 being extremely likely]": 'likely_to_recomend',
    "How satisfied are you with the support given by the WSO2 team?": 'satisfaction',
    "Which response best captures the main impact of our product?": 'product_impact',
    "How responsive have we been to your questions or concerns about our products?": 'responsiveness',
}
# The rename is done in place, so it applies to the DataFrame object itself
# rather than to a renamed copy.
input_data.rename(header_map, axis='columns', inplace=True)

In [None]:
# Ordinal codes for the three categorical survey answers.
h1_map = {"Excellent":5,"Good":4,"Okay":3,"Bad":2,"Terrible":1}
h2_map = {"Excellent":4,"Good":3,"OK":2,"Slow":1}
h3_map = {"Many of the above":9,"High Quality":8,"Scalable":7,"Value for Money":6,"Useful":5,"Reliable":4,"Secure":3,"Unique":2,"None of the above":1}

# Raw answer column -> code table, processed in this order so the
# 'encoded_<name>' columns are appended in the same sequence.
ordinal_maps = {
    'satisfaction': h1_map,
    'responsiveness': h2_map,
    'product_impact': h3_map,
}
for raw_column, answer_codes in ordinal_maps.items():
    # Answers that are not keys of the code table become NaN.
    input_data['encoded_' + raw_column] = input_data[raw_column].map(answer_codes)
    input_data = input_data.drop(columns=[raw_column])

# No one-hot encoding is applied in this pipeline: 'encoded_product_impact'
# stays a single ordinal column.

In [None]:
# label encoding for categorical features
from sklearn import preprocessing 

# Columns whose values are replaced by integer codes.
features = ['Sub Region','Account Name','Account Manager Name','Segment','Sales Region','completion']
label_encoder = preprocessing.LabelEncoder() 
# The same encoder object is refitted on every column, so after the loop it
# only holds the classes of the last column in `features`.
# NOTE(review): the codes are derived from the values present in this input
# file; confirm they match the encoding the model was trained with.
for feature in features:
    input_data[feature] = label_encoder.fit_transform(input_data[feature])

In [None]:
# inference
# Feature matrix for the regressor; rows with a missing value in any feature
# are dropped. .dropna() is chained (instead of dropna(inplace=True) on the
# column selection) so X is an independent frame and pandas does not raise a
# SettingWithCopyWarning. The original row index is preserved.
X = input_data[['completion', 'Sales Region', 'Sub Region', 'Account Manager Name', 'Segment', 'encoded_product_impact']].dropna()

# load the model
import pickle
# Raw string: '\R' and '\M' are invalid escape sequences in a normal string
# literal (deprecated, a SyntaxWarning on recent Python versions). The path
# value itself is unchanged.
filename = r'E:\Research\Models/GradientBoostingRegressorModel3.pkl'
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
# The context manager makes sure the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

# predict
y = model.predict(X)

In [None]:
# assign health score to dataset
# Take exactly the rows that were scored (X.index) so every prediction in `y`
# lines up with the row it was computed from. Re-deriving the rows with a
# separate dropna() over a different column set (one that includes ResponseID)
# can yield a different row count and make the assignment fail.
score_columns = ['ResponseID', 'completion', 'Sales Region', 'Sub Region', 'Account Manager Name', 'Segment', 'encoded_product_impact']
healthscore_dataset = input_data.loc[X.index, score_columns].copy()
healthscore_dataset['Health_Score'] = y
# Rows that still have a missing value (e.g. no ResponseID) are removed
# together with their score.
healthscore_dataset = healthscore_dataset.dropna()

In [None]:
# Preview the first scored row (displayed, not stored).
healthscore_dataset.head(1)

In [None]:
# Preview the account columns of the first row of `input` (displayed, not stored).
input[['ResponseID','Account Name','Account Manager Name','Sales Region','ARR','dateTime']].head(1)

In [None]:
# Combine the health scores with the account columns of each response.
# (Nothing is written to disk here; the result is only held in final_dataset.)
account_columns = ['ResponseID', 'Account Name', 'Account Manager Name', 'Sales Region', 'ARR', 'dateTime']
score_columns = ['ResponseID', 'Health_Score']
d1 = input[account_columns]
d2 = healthscore_dataset[score_columns]

# Right join on ResponseID: every row of d1 is kept, and rows without a
# matching score get NaN in Health_Score.
final_dataset = d2.merge(d1, how='right', on='ResponseID')
final_dataset.head(1)

In [None]:
# Drop, in place, every row of final_dataset that has a missing value in any column.
final_dataset.dropna(inplace=True)