# Understanding Impact of a Behaviourally-Optimised Call Script on Customer Perceptions

In [1]:
# Load packages

from nlp import obtain_corpus, normalise_corpus, build_feature_matrix, get_topics_terms_weights, print_topics_udf
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sentiment import analyse_sentiment_textblob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from clean_task2 import exclude_february, data_segmentation, parse_date_safe

### Data Cleaning

In [2]:
# Load data
df_control = pd.read_excel('../data_source/CDS_25_Task2.xlsx', 'C Control')
df_treatment = pd.read_excel('../data_source/CDS_25_Task2.xlsx', 'C Pilot')
print(df_treatment['TO_CHAR'].unique())
# Exclude February data
df_control = exclude_february(df_control)

df_treatment['TO_CHAR'] = df_treatment['TO_CHAR'].astype(str).str.replace(r'(?i)company', '02', regex=True)
df_treatment['TO_CHAR'] = df_treatment['TO_CHAR'].apply(parse_date_safe)
df_treatment = exclude_february(df_treatment)

[datetime.datetime(2023, 2, 1, 0, 0) '01/03/2company3' '01/04/2company3'
 '01/05/2company3' '01/06/2company3']


In [None]:
# Combine dataframes
df_combined = pd.concat([df_control, df_treatment], ignore_index=True)
# Segment the dataset into VOLT, non-VOLT, and their respective control and treatment groups
df_v, df_nv, df_v_control, df_nv_control, df_v_treatment, df_nv_treatment = data_segmentation(df_combined)

In [2]:
df_control = pd.read_excel('../data_source/CDS_25_Task2.xlsx', 'C Control')
general_corpus = obtain_corpus(df_control)
norm_corpus = normalise_corpus(general_corpus)

['Conservations', 'with', 'your', 'staff', 'over', 'the', 'phone', 'were', 'quick', ',', 'clear', ',', 'and', 'very', 'helpful', ',', 'arranging', 'for', 'installation', 'sooner', 'than', 'I', 'expected', '.', 'The', 'installer', 'was', 'a', 'lovely', 'guy', ',', 'very', 'efficient', 'and', 'cleaned', 'up', 'what', 'little', 'debris', 'the', 'drilling', 'caused', '.']
[('Conservations', 'NOUN'), ('with', 'ADP'), ('your', 'PRON'), ('staff', 'NOUN'), ('over', 'ADP'), ('the', 'DET'), ('phone', 'NOUN'), ('were', 'VERB'), ('quick', 'ADJ'), (',', '.'), ('clear', 'ADJ'), (',', '.'), ('and', 'CONJ'), ('very', 'ADV'), ('helpful', 'ADJ'), (',', '.'), ('arranging', 'VERB'), ('for', 'ADP'), ('installation', 'NOUN'), ('sooner', 'NOUN'), ('than', 'ADP'), ('I', 'PRON'), ('expected', 'VERB'), ('.', '.'), ('The', 'DET'), ('installer', 'NOUN'), ('was', 'VERB'), ('a', 'DET'), ('lovely', 'ADJ'), ('guy', 'NOUN'), (',', '.'), ('very', 'ADV'), ('efficient', 'ADJ'), ('and', 'CONJ'), ('cleaned', 'VERB'), ('up'

In [3]:
vectoriser, tfidf_matrix = build_feature_matrix(norm_corpus, feature_type='tfidf')

In [4]:
total_topics = 2
lda = LatentDirichletAllocation(n_components=total_topics, max_iter=100, learning_method='online', learning_offset=50., random_state=42)
lda.fit(tfidf_matrix)

In [5]:
feature_names = vectoriser.get_feature_names_out()
weights = lda.components_

In [6]:
topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics, total_topics=total_topics, num_terms=8, display_weights=True)

Topic #1 with weights
[(np.str_('nan'), 94.33), (np.str_('helpful'), 4.01), (np.str_('patient'), 2.48), (np.str_('join'), 1.99), (np.str_('extremely'), 1.77), (np.str_('strange'), 1.55), (np.str_('informative'), 1.5), (np.str_('response'), 1.49)]
Topic #2 with weights
[(np.str_('service'), 24.38), (np.str_('good'), 19.57), (np.str_('customer'), 13.7), (np.str_('helpful'), 13.36), (np.str_('friendly'), 13.18), (np.str_('excellent'), 9.15), (np.str_('great'), 8.22), (np.str_('polite'), 7.84)]


In [7]:
# Sentiment Analysis

sentiment_results = [analyse_sentiment_textblob(doc, verbose=False) for doc in general_corpus]
sentiment_labels, polarities, subjectivities = zip(*sentiment_results)
sentiment_df = pd.DataFrame({
    'document': general_corpus,
    'sentiment_label': sentiment_labels,
    'polarity': polarities,
    'subjectivity': subjectivities
})
print(sentiment_df.head())
sentiment_df.to_csv('../data_source/sentiment_analysis_results.csv', index=False)

                                            document sentiment_label  \
0  Conservations with your staff over the phone w...        positive   
1  I spoke to tony and he was lovely, he talked m...        positive   
2                                                NaN        negative   
3  Very positive so far. Cheaper, more included a...        positive   
4  They was on time. Done a good job. Was very po...        positive   

   polarity  subjectivity  
0      0.15          0.45  
1      0.34          0.77  
2      0.00          0.00  
3      0.40          0.70  
4      0.45          0.45  


In [8]:
# Train classification model
topic_distributions = lda.transform(tfidf_matrix)
print(topic_distributions)

binary_labels = [1 if label == 'positive' else 0 for label in sentiment_labels]
X_train, X_test, y_train, y_test = train_test_split(topic_distributions, binary_labels, test_size=0.2, random_state=42)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

[[0.11282559 0.88717441]
 [0.09932344 0.90067656]
 [0.74994102 0.25005898]
 [0.16026968 0.83973032]
 [0.16138827 0.83861173]
 [0.13122904 0.86877096]
 [0.1757173  0.8242827 ]
 [0.18932917 0.81067083]
 [0.211841   0.788159  ]
 [0.74994102 0.25005898]
 [0.11437848 0.88562152]
 [0.17643294 0.82356706]
 [0.11825131 0.88174869]
 [0.13490816 0.86509184]
 [0.13367392 0.86632608]
 [0.1929149  0.8070851 ]
 [0.11354022 0.88645978]
 [0.17412224 0.82587776]
 [0.15769856 0.84230144]
 [0.09273815 0.90726185]
 [0.22525251 0.77474749]
 [0.74994102 0.25005898]
 [0.2223207  0.7776793 ]
 [0.14091966 0.85908034]
 [0.15772237 0.84227763]
 [0.21188904 0.78811096]
 [0.58574408 0.41425592]
 [0.74994102 0.25005898]
 [0.19081728 0.80918272]
 [0.13817592 0.86182408]
 [0.74994102 0.25005898]
 [0.08370792 0.91629208]
 [0.74994102 0.25005898]
 [0.56450078 0.43549922]
 [0.11239248 0.88760752]
 [0.21068015 0.78931985]
 [0.74994102 0.25005898]
 [0.30227229 0.69772771]
 [0.18207966 0.81792034]
 [0.74994102 0.25005898]
