In [None]:
# import libraries
import pandas as pd

In [None]:
# Read the survey export into a DataFrame and show the inferred column dtypes.
# The path is a raw string so the Windows backslashes stay literal; in a normal
# string literal "\R", "\D" and "\W" are invalid escape sequences, which newer
# Python versions warn about.
dataset = pd.read_csv(r'E:\Research\Datasets\WSO2/dataset2.csv')
dataset.dtypes

In [None]:
# Map each long survey-question header to a short column identifier.
header_map = {
    "How likely are you to recommend WSO2 to a friend_ or colleague on a scale from 0 to 10? [0 being not at all likely and 10 being extremely likely]": 'likely_to_recomend',
    "How satisfied are you with the support given by the WSO2 team?": 'satisfaction',
    "Which response best captures the main impact of our product?": 'product_impact',
    "How responsive have we been to your questions or concerns about our products?": 'responsiveness',
}

# Apply the short names, then report the number of missing values per column.
dataset = dataset.rename(columns=header_map)
dataset.isna().sum()

In [None]:
# Count the distinct values in each column of the renamed dataset.
dataset.nunique()

In [None]:
# Keep only the response id and the four survey answers.
# .copy() makes this an independent frame: columns are added to it in the
# encoding step, and assigning onto a plain slice of `dataset` triggers
# pandas' SettingWithCopyWarning.
nps_dataset = dataset[['ResponseID','likely_to_recomend','satisfaction','responsiveness','product_impact']].copy()
nps_dataset

In [None]:
# Print the distinct raw answers found in the three categorical survey columns.
print(nps_dataset['satisfaction'].unique(),'\n',nps_dataset['responsiveness'].unique(),'\n',nps_dataset['product_impact'].unique())

In [None]:
# Ordinal encoding: every answer label is replaced by an integer rank.
h1_map = {"Excellent": 5, "Good": 4, "Okay": 3, "Bad": 2, "Terrible": 1}
h2_map = {"Excellent": 4, "Good": 3, "OK": 2, "Slow": 1}
h3_map = {
    "Many of the above": 9,
    "High Quality": 8,
    "Scalable": 7,
    "Value for Money": 6,
    "Useful": 5,
    "Reliable": 4,
    "Secure": 3,
    "Unique": 2,
    "None of the above": 1,
}

# For each raw column (satisfaction, responsiveness, product_impact, in that
# order): append an `encoded_<column>` column, then drop the raw text column.
# Labels that are missing from a map come out of Series.map as NaN.
for raw_column, rank_map in (
    ('satisfaction', h1_map),
    ('responsiveness', h2_map),
    ('product_impact', h3_map),
):
    nps_dataset['encoded_' + raw_column] = nps_dataset[raw_column].map(rank_map)
    nps_dataset = nps_dataset.drop(columns=[raw_column])

nps_dataset.head()

## Weighted Score

In [None]:
# Calculate the weighted health score.
# Work on a copy: the normalisation below overwrites the feature columns, and
# a plain alias would silently overwrite the encoded values in nps_dataset too.
manual_score_dataset = nps_dataset.copy()

features = ['likely_to_recomend','encoded_satisfaction','encoded_responsiveness']
weights = [50,30,20]

# normalize: divide each feature by its column maximum and scale it by its weight
for feature, weight in zip(features, weights):
    feature_max = manual_score_dataset[feature].max()
    if feature_max == 0:
        # a zero maximum would be a division by zero; the column is left as it is
        continue
    manual_score_dataset[feature] = (manual_score_dataset[feature] / feature_max) * weight

# health score: sum of the weighted features as a percentage of the total weight.
# A row with a missing (NaN) feature gets a NaN score because `+` propagates NaN.
manual_score_dataset['health_score'] = (
    manual_score_dataset['likely_to_recomend']
    + manual_score_dataset['encoded_satisfaction']
    + manual_score_dataset['encoded_responsiveness']
) * 100 / sum(weights)

manual_score_dataset

In [None]:
# add health score to original dataset
# `dataset` already holds this same CSV with the survey columns renamed through
# header_map, so it is reused here instead of reading the file a second time
# and repeating the rename map. The copy keeps main_dataset independent of it.
main_dataset = dataset.copy()

# Left join: every row of the original data is kept and gains a health_score.
# NOTE(review): assumes ResponseID is unique in both frames; duplicated ids
# would multiply rows in this join - confirm.
healthscore_main_dataset = pd.merge(
    main_dataset,
    manual_score_dataset[['ResponseID','health_score']],
    on='ResponseID',
    how='left',
)
healthscore_main_dataset.nunique()

In [None]:
# drop unwanted columns

# Keep the id, the survey answers, the account/region descriptors and the target.
# .copy() makes this an independent frame: columns are added to it in the
# encoding step, and assigning onto a plain slice of healthscore_main_dataset
# triggers pandas' SettingWithCopyWarning.
regressor_dataset = healthscore_main_dataset[['ResponseID','likely_to_recomend','satisfaction','responsiveness','product_impact','Country_with_city','completion','Sales Region','Sub Region','Account Name','Account Manager Name','Segment','health_score']].copy()
regressor_dataset

In [None]:
# encode the data

# Ordinal features: every answer label is replaced by an integer rank.
h1_map = {"Excellent": 5, "Good": 4, "Okay": 3, "Bad": 2, "Terrible": 1}
h2_map = {"Excellent": 4, "Good": 3, "OK": 2, "Slow": 1}
h3_map = {
    "Many of the above": 9,
    "High Quality": 8,
    "Scalable": 7,
    "Value for Money": 6,
    "Useful": 5,
    "Reliable": 4,
    "Secure": 3,
    "Unique": 2,
    "None of the above": 1,
}

# For each raw column (satisfaction, responsiveness, product_impact, in that
# order): append an `encoded_<column>` column, then drop the raw text column.
for raw_column, rank_map in (
    ('satisfaction', h1_map),
    ('responsiveness', h2_map),
    ('product_impact', h3_map),
):
    regressor_dataset['encoded_' + raw_column] = regressor_dataset[raw_column].map(rank_map)
    regressor_dataset = regressor_dataset.drop(columns=[raw_column])


# Nominal features: replace each category with an integer label.
from sklearn import preprocessing

features = ['Country_with_city','Sub Region','Account Name','Account Manager Name','Segment','Sales Region','completion']
# One encoder object is re-fitted for every column, so after the loop it only
# holds the mapping of the last column in `features`.
label_encoder = preprocessing.LabelEncoder()
for feature in features:
    encoded_values = label_encoder.fit_transform(regressor_dataset[feature])
    regressor_dataset[feature] = encoded_values

regressor_dataset.head()

In [None]:
# Pairwise correlation of the remaining columns; the id and the three columns
# that health_score is computed from are left out.
regressor_dataset.drop(['ResponseID','likely_to_recomend','encoded_responsiveness','encoded_satisfaction'],axis=1).corr()

In [None]:
# save regressor dataset (without the index column)
# The path is a raw string so the Windows backslashes stay literal; in a normal
# string literal "\R", "\D", "\W" and "\H" are invalid escape sequences, which
# newer Python versions warn about.
regressor_dataset.to_csv(r'E:\Research\Datasets\WSO2\Healthscore_dataset\Regressor_Data/regressor_dataset3.csv',index=False)

In [None]:
# Number of distinct values in the health_score target column.
len(regressor_dataset['health_score'].unique())

# Choosing a model for regression

In [None]:
# Install xgboost into the running kernel's environment (no version is pinned here).
%pip install xgboost

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso,BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings

In [None]:
# Shuffle the dataset.
# Left out of the feature matrix: the identifier, the target, the three columns
# the target is computed from, and two of the label-encoded descriptor columns.
excluded_columns = [
    'ResponseID',
    'health_score',
    'likely_to_recomend',
    'encoded_responsiveness',
    'encoded_satisfaction',
    'Country_with_city',
    'Account Name',
]
X = regressor_dataset.drop(columns=excluded_columns)
y = regressor_dataset[['health_score']]
X, y = shuffle(X, y, random_state=42)

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# trying out different models

# Seed shared by the stochastic estimators so the comparison is repeatable
# across runs.
MODEL_SEED = 42

# List of regression models to try
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=MODEL_SEED),
    'Random Forest Regressor': RandomForestRegressor(random_state=MODEL_SEED),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=MODEL_SEED),
    # The estimator is xgboost's random-forest variant, so the label says so.
    # NOTE(review): if the plain boosted XGBRegressor was intended, swap the
    # estimator instead of the label.
    'XGBRFRegressor': xgb.XGBRFRegressor(objective='reg:squarederror', random_state=MODEL_SEED),
    'Baysian Regressor': BayesianRidge()
}

# y_train is a one-column DataFrame; the estimators are given a flat 1-D target.
y_train_flat = np.ravel(y_train)

for model_name, model in models.items():
    # Silence library warnings only while fitting and predicting, instead of
    # switching every warning off for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        model.fit(X_train, y_train_flat)
        y_pred = model.predict(X_test)

    # Mean squared error on the held-out test split (lower is better).
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name}: Mean Squared Error = {mse}")

According to the MSE values above, GradientBoostingRegressor gives the lowest error, so let's use GradientBoostingRegressor for our regression task.

In [None]:
# GradientBoostingRegressor
from sklearn.metrics import accuracy_score,precision_score,recall_score

# Shuffle into new names: rebinding X and y to their own shuffled versions
# would make this cell produce different data every time it is re-run.
X_shuffled, y_shuffled = shuffle(X, y, random_state=40)
X_train,X_test,y_train,y_test = train_test_split(X_shuffled,y_shuffled,test_size=0.2,random_state=42)

# Fixed seed so the fitted model (which is saved to disk afterwards) can be
# reproduced.
model = GradientBoostingRegressor(random_state=42)

# y_train is a one-column DataFrame; the estimator is given a flat 1-D target.
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)

# Mean squared error on the held-out test split.
mse = mean_squared_error(y_test,y_pred)
print('MSE score  : ',mse)

In [None]:
# Column names (and order) of the feature matrix the model was evaluated on.
X_test.columns

In [None]:
# First rows of the held-out feature matrix.
X_test.head()

In [None]:
# First rows of the held-out target values.
y_test.head()

In [None]:
# save the model
import pickle

# The path is a raw string so the Windows backslashes stay literal; in a normal
# string literal "\R" and "\M" are invalid escape sequences, which newer Python
# versions warn about.
file = r'E:\Research\Models/GradientBoostingRegressorModel3.pkl'

# Serialise the fitted estimator; the context manager closes the file handle.
with open(file, 'wb') as model_file:
    pickle.dump(model, model_file)