In [None]:
'''
Datasets : Case Report details , NPS Survey
Input : Customer Case History
Output : Healthscore for each account
'''

In [None]:
# Required Libraries
import pandas as pd
import warnings
import numpy as np

# Import custom modules
import sys
sys.path.append("../src/TrainPipelines/")

from Preprocessing.RecordAgreement import RecordAgreement
from Preprocessing.RemoveOutliers import RemoveOutliers
from Preprocessing.FillingMissingVlaues import FillingMissingValues
from Preprocessing.Labeling import Labeling
from Preprocessing.Encoder import Encoder

In [None]:
# NOTE(review): scratch cell -- `a`, `b` and `c` are not referenced anywhere else in this notebook.
# Raw string: the original literal contained the invalid escape sequences `\C`, `\D` and `\s`
# (DeprecationWarning / SyntaxWarning depending on the Python version). The resulting path is unchanged.
a = pd.read_excel(r'C:/Users/gimhanSandeeptha/Gimhan Sandeeptha/Sentiment Project\CustomerHealthScore\Data\sn_customerservice_case_report_1.xlsx')
# FIXME(review): empty paths -- these two reads fail on a fresh "Restart & Run All";
# supply real file paths or remove the cell.
b = pd.read_csv('')
c = pd.read_csv('')

#### Load dataset

In [None]:
# Load datasets
# NOTE(review): this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Single place to repoint the notebook at another copy of the raw data.
DATA_DIR = 'E:/Research/Datasets/WSO2/Original dataset'

accountDataset = pd.read_excel(f'{DATA_DIR}/customer_account.xlsx')
caseDataset    = pd.read_csv(f'{DATA_DIR}/Case Report/data_with_created_date/CaseReport.csv')
nps            = pd.read_csv(f'{DATA_DIR}/NPS.csv')

In [None]:
# Number of distinct values in each column of the case report.
caseDataset.nunique()

In [None]:
# Count of missing values per column in the case report.
caseDataset.isna().sum()

In [None]:
# nps.isna().sum()

#### Functions

In [None]:
def adjustingDataset(df, region_map=None, region_map_path='../Data/Region_Map.npy'):
    """Shorten the NPS survey headers, keep the analysis columns and fill 'Sales Region'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw NPS survey export. It is left untouched; a new frame is returned.
    region_map : dict, optional
        Mapping applied to 'Sub Region' to obtain a 'Sales Region' for rows where the
        latter is missing. When omitted, the mapping is loaded from ``region_map_path``.
    region_map_path : str, optional
        Location of the pickled ``.npy`` mapping, used only when ``region_map`` is None.

    Returns
    -------
    pandas.DataFrame
        Copy holding the selected columns with the long question headers renamed and
        missing 'Sales Region' values filled from the 'Sub Region' mapping.

    Raises
    ------
    KeyError
        If one of the selected columns is absent after renaming.
    """
    # change feature names for ease of use
    header_map = {
        "How likely are you to recommend WSO2 to a friend_ or colleague on a scale from 0 to 10? [0 being not at all likely and 10 being extremely likely]":'likely_to_recomend',
        "How satisfied are you with the support given by the WSO2 team?":'satisfaction',
        "Which response best captures the main impact of our product?":'product_impact',
        "How responsive have we been to your questions or concerns about our products?":'responsiveness'
    }

    # columns kept for the analysis (the remaining ones have many null values)
    selected_columns = [
        'Account Name', 'Account Manager Name', 'UserName',
        'UserID', 'ResponseID', 'timeStamp',
        'dateTime', 'country', 'completion',
        'likely_to_recomend',
        'satisfaction',
        'responsiveness',
        'product_impact',
        'Sales Region', 'Sub Region', 'Survey Campaign', 'Segment',
    ]

    # rename() without inplace plus an explicit copy(): the caller's frame is not
    # modified and the column assignment below does not write into a slice.
    result = df.rename(columns=header_map)[selected_columns].copy()

    if region_map is None:
        # NOTE(review): allow_pickle=True unpickles the file content; only load a trusted file.
        region_map = np.load(region_map_path, allow_pickle=True).item()

    # Filling missing values in Sales Region
    result['Sales Region'] = result['Sales Region'].fillna(result['Sub Region'].map(region_map))
    return result

In [None]:
def fcrPercantage(col):
    """Return the sum of ``col`` as a percentage of its length.

    For a boolean / 0-1 column this is the share of true entries, on a 0-100 scale.
    """
    share_true = col.sum() / len(col)
    return share_true * 100

In [None]:
def getChunkofData(df,start='2022-02-01',end='2022-08-01'):
    """Return the rows of ``df`` whose 'Created' timestamp lies in [start, end].

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Created' column, either already datetime or text in the
        '%Y-%m-%d %H:%M:%S' format. The frame itself is not modified.
    start, end : str or datetime-like
        Inclusive bounds of the window.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows, with 'Created' converted to datetime.
    """
    # Parse into a local Series instead of overwriting the caller's column.
    created = pd.to_datetime(df['Created'], format='%Y-%m-%d %H:%M:%S')

    start_date  = pd.to_datetime(start)
    end_date    = pd.to_datetime(end)

    in_window = (created >= start_date) & (created <= end_date)
    filtered_df = df.loc[in_window].copy()
    filtered_df['Created'] = created.loc[in_window]
    return filtered_df

In [None]:
def getChunksBy6Months(df, start='2022-02-01', end='2022-08-01'):
    """Split ``df`` into consecutive six-month windows on its 'Created' column.

    Every window except the last is half-open, ``[window_start, window_end)``, so a
    row stamped exactly on an interior boundary lands in one chunk only. The last
    window keeps ``end`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Case data with a 'Created' column, as expected by ``getChunkofData``.
    start, end : str or datetime-like
        Bounds of the overall period.

    Returns
    -------
    list of pandas.DataFrame
        One frame per window, in chronological order. The list is empty when
        ``start`` is not before ``end``.
    """
    window_start = pd.to_datetime(start)
    range_end = pd.to_datetime(end)
    six_months = pd.DateOffset(months=6)

    chunks = []
    while window_start < range_end:
        # Clamp the final window to the requested end date.
        window_end = min(window_start + six_months, range_end)
        chunk = getChunkofData(df=df, start=window_start, end=window_end)
        if window_end < range_end:
            # getChunkofData includes both bounds; the shared boundary instant
            # belongs to the next window, so drop it here.
            chunk = chunk[chunk['Created'] < window_end]
        chunks.append(chunk)
        window_start += six_months

    return chunks

In [None]:
def getaggregatedList(df_list):
    """Aggregate every frame in ``df_list`` to one row per 'Account'.

    The time-to-resolve, agent-reassignment and reopen columns are averaged, and
    'First Contact Resolution' is reduced with ``fcrPercantage``. Returns a list of
    aggregated frames with 'Account' restored as a regular column.
    """
    # The aggregation spec is the same for every frame, so build it once.
    agg_methods = {
        'Time To Resolve': 'mean',
        'Agent Reassignment Count': 'mean',
        'First Contact Resolution': fcrPercantage,
        'Reopen Count': 'mean',
    }
    return [
        frame.groupby('Account').agg(agg_methods).reset_index()
        for frame in df_list
    ]

In [None]:
def getNPSbySurveys(df):
    """Split the labelled NPS data by survey campaign and average it per account.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Account Name', 'encoded_product_impact', 'healthScore',
        'Survey Campaign' (text in '%Y-%b' format), 'Sales Region', 'Sub Region'
        and 'completion'.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct campaign date, in ascending date order. Each frame
        holds the per-account mean of the remaining columns, with 'Account Name'
        as a regular column.
    """
    keep = ['Account Name','encoded_product_impact','healthScore','Survey Campaign','Sales Region','Sub Region','completion']
    # copy() so the assignment below does not write into a slice of the input.
    dataset = df[keep].copy()
    dataset['Survey Campaign'] = pd.to_datetime(dataset['Survey Campaign'], format='%Y-%b')

    # groupby iterates the campaigns in sorted order; average each one per account.
    return [
        campaign_rows.groupby('Account Name').mean().reset_index()
        for _, campaign_rows in dataset.groupby('Survey Campaign')
    ]

In [None]:
def concatanateData(tables,nps_list):
    """Join each aggregated case table with the NPS frame at the same list position.

    Parameters
    ----------
    tables : list of pandas.DataFrame
        Per-period case aggregates keyed by 'Account'.
    nps_list : list
        Per-survey NPS frames keyed by 'Account Name'. Entries that are not
        DataFrames (for example ``None``) are skipped.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the inner joins, with a fresh index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no pair produced a frame to concatenate.

    Notes
    -----
    Pairing is purely positional (``zip``): surplus entries of the longer list are
    ignored. NOTE(review): confirm that period i really corresponds to survey i.
    """
    nps_columns = ['Account Name','encoded_product_impact','Sales Region','Sub Region','completion','healthScore']

    datasets = []
    for case_df, nps_df in zip(tables, nps_list):
        # The former `nps == None` branch read `concatenated_df` before it was
        # assigned and raised UnboundLocalError; a missing survey is now skipped.
        if not isinstance(nps_df, pd.DataFrame):
            continue
        merged = pd.merge(case_df, nps_df[nps_columns], left_on='Account', right_on='Account Name', how='inner')
        datasets.append(merged)

    return pd.concat(datasets, ignore_index=True)
    

#### Model (Filling missing values in is_fcr)

In [None]:
'''Correlation of features'''
import warnings
import itertools
import researchpy as rp
import matplotlib.pyplot as plt
import seaborn as sn

# Register the filter once, before the loop (it used to be re-registered on every iteration).
warnings.filterwarnings('ignore')

df               = caseDataset.drop(['Unnamed: 0'],axis=1).copy()
# Rows without any missing value, computed once instead of twice per feature pair.
complete_cases   = df.dropna()
combinations     = list(itertools.combinations(complete_cases.columns, 2))
cramers_v_values = pd.DataFrame(index=df.columns, columns=complete_cases.columns)

# Calculate Cramér's V for each pair
# Only the (feature1, feature2) cell of each unordered pair is filled; the mirrored
# cell and the diagonal stay empty.
for feature1, feature2 in combinations:
    crosstab, results = rp.crosstab(complete_cases[feature1], complete_cases[feature2], test='chi-square')
    # NOTE(review): row 2 of the researchpy result table is presumably the Cramér's V
    # entry -- confirm against the installed researchpy version.
    cramers_v_values.loc[feature1, feature2] = results.loc[2, 'results']


# Create a heatmap
cramers_v_values = cramers_v_values.apply(pd.to_numeric)
plt.figure(figsize=(10,10))
sn.heatmap(cramers_v_values, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title("Cramér's V Heatmap")
plt.show()

In [None]:
# Summary statistics of the case report, as produced by pandas' describe().
caseDataset.describe()

In [None]:
from sklearn import preprocessing

# Encode on a copy so the raw case report keeps its original values.
temp = caseDataset.copy()
label_encoder = preprocessing.LabelEncoder()
temp['First Contact Resolution'] = label_encoder.fit_transform(caseDataset['First Contact Resolution'])

# Original label -> integer code, kept for reference.
# NOTE(review): missing values are not dropped before encoding -- check how they end up in the mapping.
label_mapping = {
    label: code
    for label, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}

In [None]:
# Pairwise correlation (pandas' default method) between the encoded FCR flag and the numeric case metrics.
temp[['First Contact Resolution','Time To Resolve','Agent Reassignment Count','Reopen Count']].corr()

In [None]:
'''Creatting model'''
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Train only on rows without any missing value.
df = caseDataset.dropna()

X = df[['Time To Resolve']]

# Encode the target straight into a 1-D array: no column assignment on a slice of
# `df`, and the classifier receives the 1-D target it expects.
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(df['First Contact Resolution'])

label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Fixed seed so the split, and therefore the reported accuracy, is reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.2,random_state=42)

model = LogisticRegression()
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

score =accuracy_score(Y_test,y_pred)
print("Accuracy:- ",score)

In [None]:
import pickle

# NOTE(review): this writes `fcrimputationModel_v2.pkl`, while the cleaning cell further
# down loads `fcrimputationModel.pkl` -- confirm which file is meant to be used.
filename = 'E:/Research/CHS_Repo/CustomerHealthScoreB2B/Models/Imputation_Model/fcrimputationModel_v2.pkl'
# Context manager so the file handle is flushed and closed even if dumping fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

#### Clean the dataset

In [None]:
'''
# Clean Datasets #
Input :accountDataset , caseDataset, nps
Output: accountDataset2 ,filledcaseDataset ,labeledDataset
'''
import pickle
import sys
from sklearn import preprocessing


'''NPS Dataset'''
# Stage-by-stage cleaning of the NPS survey data. Each stage feeds the next, so the
# statement order matters. The Encoder / RecordAgreement / RemoveOutliers /
# FillingMissingValues / Labeling classes come from ../src/TrainPipelines/Preprocessing;
# their behaviour is not visible in this notebook.

# Get encoded dataframe
# adjustingDataset() renames the columns of `nps` in place (rename(..., inplace=True)).
temp_d2 =  adjustingDataset(nps)
encode  = Encoder(temp_d2)
temp_d3 =   encode.customEncoder()

# Get agreement between records
agreement       = RecordAgreement(temp_d3)                             # create an object of RecordAgreement class
highAgreementdf =  agreement.gethighAgreementSurveys() 

# Remove outliers
temp_df1    = highAgreementdf
outlierObj  =  RemoveOutliers(temp_df1)
filtered_df = outlierObj.removeOutliers()

# Filling missing values
temp_df2    = filtered_df
fm          = FillingMissingValues(temp_df2,None)
# Only the first element of the returned pair is used; the second is discarded into `_`.
filled_df,_  = fm.getFilledDataset()

# Labeling nps dataset
labeling        =  Labeling(filled_df)
labeledDataset  =  labeling.returnLabeleddf()

# Label-encode the categorical columns on a copy, printing each label -> code mapping.
temp = labeledDataset.copy()
columns = ['completion','Sales Region','Sub Region']
for col in columns:
    # A fresh encoder per column; after the loop only the last column's encoder and mapping remain.
    label_encoder = preprocessing.LabelEncoder() 
    temp[col]  = label_encoder.fit_transform(temp[col])
    label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    print(label_mapping)
# `encoded_dataset` is the frame passed to getNPSbySurveys() in a later cell.
encoded_dataset = temp

'''Account Datasets'''

accountDataset2 = accountDataset.drop(['Support Tier'],axis=1)
# allow_pickle takes a bool. The former string 'TRUE' only worked because any
# non-empty string is truthy (so would 'FALSE').
# NOTE(review): unpickling runs code from the file; only load a trusted Region_Map.npy.
RegionMAP = np.load('../Data/Region_Map.npy',allow_pickle=True).item()           # Region Map
# Fill missing sales regions from the sub-region mapping.
accountDataset2['Sales Region'] = accountDataset2['Sales Region'].fillna(accountDataset2['Sub Region'].map(RegionMAP))
# Keep the first row per account name, then drop rows that still lack a region.
accountDataset2 = accountDataset2.drop_duplicates(subset=['Name(name)'])
accountDataset2 = accountDataset2.dropna(subset=['Sub Region','Sales Region'])

'''Case Dataset'''

# Filling missing values

sys.path.append("..")
filename = 'E:/Research/CHS_Repo/CustomerHealthScoreB2B/Models/Imputation_Model/fcrimputationModel.pkl'
# NOTE(review): pickle.load executes code embedded in the file; only load a model file you trust.
# Context manager so the file handle is closed once the model is loaded.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)
# Work on a copy so the raw case report stays untouched.
temp_df = caseDataset.copy()

def custom_encode(boolean_value):
    """Map a truthy value to 1 and a falsy value to 0; missing values pass through unchanged."""
    if not pd.isnull(boolean_value):
        return int(bool(boolean_value))
    return boolean_value
# Encode the flag as 1 / 0, leaving missing values as they are.
temp_df['First Contact Resolution'] = temp_df['First Contact Resolution'].map(custom_encode)

# Rows whose flag is missing, and the single feature the model predicts from.
missing_fcr = temp_df['First Contact Resolution'].isnull()
X_missing = temp_df.loc[missing_fcr, ['Time To Resolve']]

# Predict only when something is missing: scikit-learn estimators reject an empty
# input, which used to break this cell on a dataset without gaps.
# NOTE(review): 'Time To Resolve' is assumed to be present on these rows -- confirm.
predicted_values = model.predict(X_missing) if not X_missing.empty else np.array([])

filledcaseDataset = temp_df.drop(['Unnamed: 0'],axis=1).copy()
if not X_missing.empty:
    filledcaseDataset.loc[missing_fcr, 'First Contact Resolution'] = predicted_values
# Cases without an account cannot be aggregated per account later on.
filledcaseDataset = filledcaseDataset.dropna(subset=['Account'])

In [None]:
# Missing values per column that remain after the imputation step.
filledcaseDataset.isna().sum()

#### Merge the Dataset

In [None]:
'''Cleaning Account Names'''
import re

# Clean the account names on a copy so `filledcaseDataset` keeps the original names.
caseReportTable = filledcaseDataset.copy()

def clean_account_name(account_name):
    """Strip one leading 'ZZZ:LOST'-style marker or date prefix and trim whitespace.

    Only a single prefix at the very start of the string is removed per call.
    """
    prefix = re.match(r'^(?:ZZZ:LOST\s?--?|ZZZ:LOST\s?-\s?|ZZZ: LOST--\s?|ZZZ: Lost - |ZZZ:Lost - |ZZZ:Lost -- |\d{4}-\d{2}-\d{2}-?\s?)', account_name)
    # The pattern is anchored at the start, so dropping the matched span is the
    # same as substituting it with an empty string.
    remainder = account_name[prefix.end():] if prefix else account_name
    return remainder.strip()

# clean_account_name() removes one leading prefix per call, so it is applied twice.
# NOTE(review): the second pass presumably handles names with two stacked prefixes
# (e.g. a lost-marker followed by a date) -- confirm it is not an accidental duplicate.
caseReportTable['Account'] = caseReportTable['Account'].apply(clean_account_name)
caseReportTable['Account'] = caseReportTable['Account'].apply(clean_account_name)


In [None]:
# Case Report
'''Aggregate the data in case report'''

# 1. Split the cleaned cases into six-month windows between 2021-08-01 and 2024-02-01.
datasets = getChunksBy6Months(df=caseReportTable,start='2021-08-01',end='2024-02-01')
# 2. Aggregate every window to one row per account.
tempdfs = getaggregatedList(datasets)
# 3. Split the encoded NPS data by survey campaign and average it per account.
nps_list = getNPSbySurveys(df=encoded_dataset)
# 4. Join window i with survey i and stack the results. concatanateData() pairs the
#    two lists with zip(), so surplus entries of the longer list are ignored.
#    NOTE(review): confirm that the windows line up with the survey campaigns.
dataset  = concatanateData(tempdfs,nps_list)



In [None]:
# Preview the first two rows of the merged case/NPS dataset.
dataset.head(2)

#### Data analysis

In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

# Target variable (y) and features (X)
y = dataset['healthScore']
X = dataset[[
    'Time To Resolve',
    'Agent Reassignment Count',
    'First Contact Resolution',
    'Reopen Count',
    'encoded_product_impact',
    'Sales Region',
    'Sub Region',
    'completion',
]]

# fit() returns the estimator itself, so construction and fitting can be chained.
model = Ridge().fit(X, y)

# Explain the fitted model on the same rows it was trained on.
explainer = shap.Explainer(model, X)
shap_values = explainer(X)

shap.summary_plot(shap_values, X)
plt.show()


In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

y = dataset['healthScore']
X = dataset[[
    'Time To Resolve',
    'Agent Reassignment Count',
    'First Contact Resolution',
    'Reopen Count',
    'encoded_product_impact',
    'Sales Region',
    'Sub Region',
    'completion',
]]

# fit() returns the estimator itself, so construction and fitting can be chained.
model = Ridge().fit(X, y)

explainer = shap.LinearExplainer(model, X)
shap_values = explainer(X)

# One dependence plot per input variable; show=False lets the title be set before rendering.
for index, feature_name in enumerate(X.columns):
    shap.dependence_plot(index, shap_values.values, X, show=False)
    plt.title(f'Relationship between {feature_name} and Target Variable')
    plt.show()


#### Checking correlations

In [None]:
# mergedDf.drop(['Survey Campaign','Account'],axis=1).to_csv('E:/Research/CHS_Repo/CustomerHealthScoreB2B/Data/merged_dataset.csv')

In [None]:
# Check the distribution
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histograms for each feature
plot_columns = dataset.drop(['Account','Account Name'],axis=1).columns

# Size the grid from the number of columns. The previous fixed 3x3 grid raised as soon
# as a tenth column appeared. Up to nine columns the layout is unchanged (3 rows, 12x6).
grid_cols = 3
grid_rows = max(3, -(-len(plot_columns) // grid_cols))   # ceiling division

plt.figure(figsize=(12, 2 * grid_rows))
for i, column in enumerate(plot_columns):
    plt.subplot(grid_rows, grid_cols, i+1)
    sns.histplot(dataset[column], kde=True)
    plt.title(f'Distribution of {column}')
plt.tight_layout()
plt.show()

In [None]:
# Training frame: the merged dataset without the case-side account key.
trainDf = dataset.drop(columns=['Account'])

In [None]:
# Summary statistics of the training frame.
trainDf.describe()

In [None]:
# Spearman rank correlation of every remaining column with the health score.
trainDf.drop(columns=['Account Name']).corr(method='spearman')['healthScore']

#### Creating Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso,BayesianRidge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import math

# Target and predictors (the account name is an identifier, not a feature).
y = trainDf['healthScore']
X = trainDf.drop(columns=['healthScore','Account Name'])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = Ridge(alpha=0.1).fit(X_train, y_train)
predictions = model.predict(X_test)

# Report the hold-out error as RMSE.
mse = mean_squared_error(y_test, predictions)
rmse = math.sqrt(mse)
print("Root Mean Squared Error:", rmse)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for the Ridge model trained above.
param_grid = {'alpha': [0.000001,0.001,0.01,0.015, 0.1,0.6, 1.0, 10.0]}

# 5-fold cross-validated search on the training split, scored by negative MSE.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

bestParams = grid_search.best_params_
bestParams