In [None]:
# Import packages

import pandas as pd
from task2_cleaning import exclude_february, data_segmentation
from nlp import obtain_corpus, normalise_corpus, build_feature_matrix, get_topics_terms_weights, print_topics_udf
from sklearn.decomposition import LatentDirichletAllocation
import os
import nltk

In [2]:
nltk.download('averaged_perceptron_tagger')  
nltk.download('punkt')
nltk.download('punkt_tab')  
nltk.download('omw-1.4')  
nltk.download('stopwords')          
nltk.download('wordnet')           

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/raymondguo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/raymondguo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/raymondguo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raymondguo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raymondguo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raymondguo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
nltk.word_tokenize("Here's a sentence with contractions.")

['Here', "'s", 'a', 'sentence', 'with', 'contractions', '.']

#### Data Cleaning

In [5]:
# Filter out February months
df_control = pd.read_excel('../data_source/CDS_25_Task2.xlsx', 'C Control')
df_control = exclude_february(df_control)

In [6]:
df_pilot = pd.read_excel('../data_source/CDS_25_Task2.xlsx', 'C Pilot')
print(df_pilot['TO_CHAR'].astype)

<bound method NDFrame.astype of 0      2023-02-01 00:00:00
1      2023-02-01 00:00:00
2      2023-02-01 00:00:00
3      2023-02-01 00:00:00
4      2023-02-01 00:00:00
              ...         
262        01/06/2company3
263        01/06/2company3
264        01/06/2company3
265        01/06/2company3
266        01/06/2company3
Name: TO_CHAR, Length: 267, dtype: object>


In [7]:
# Clean the data to convert the word company into 02
df_pilot['TO_CHAR'] = df_pilot['TO_CHAR'].replace(to_replace='.*company.*', value='02', regex=True)
df_pilot['TO_CHAR'] = pd.to_datetime(df_pilot['TO_CHAR'], errors='coerce')

In [8]:
# Filter out February months
df_pilot = exclude_february(df_pilot)
df_pilot

Unnamed: 0,COLUMN_4,VOLT_FLAG,SURVEY_ID,SCORE,LTR_COMMENT,PRIMARY_REASON,TO_CHAR,CONNECTION_TIME,SALES_PERSON_SAT,SALES_FRIENDLY_SAT,COMMINICATION_SAT,FIRST_BILL_SAT,AGENT_KNOWLEDGE
20,pilot,yes,352299145,10,,"Customer Service,General,UK Legacy",NaT,10.0,,10.0,10.0,10.0,10.0
21,pilot,yes,351717614,8,,,NaT,8.0,8.0,8.0,8.0,8.0,7.0
22,pilot,,352217961,9,,,NaT,10.0,8.0,10.0,9.0,9.0,9.0
23,pilot,,351710544,7,,,NaT,7.0,7.0,7.0,7.0,7.0,8.0
24,pilot,yes,351645277,7,Prompt service,"General Services,Internet,Pricing",NaT,7.0,7.0,6.0,7.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,pilot,,359879046,8,They are efficient,"Customer Service,General,Processes/Journeys,Te...",NaT,9.0,,10.0,9.0,10.0,9.0
263,pilot,yes,361243832,7,Helpful and informative,,NaT,10.0,10.0,5.0,10.0,10.0,8.0
264,pilot,,361219755,10,The gentleman who dealt us was so helpful and ...,"Customer Service,General,Technician,UK Legacy",NaT,10.0,10.0,10.0,10.0,10.0,10.0
265,pilot,yes,359947966,0,,,NaT,,0.0,,,,


In [9]:
print(df_pilot.columns)
print(df_control.columns)

Index(['COLUMN_4', 'VOLT_FLAG', 'SURVEY_ID', 'SCORE', 'LTR_COMMENT',
       'PRIMARY_REASON', 'TO_CHAR', 'CONNECTION_TIME', 'SALES_PERSON_SAT',
       'SALES_FRIENDLY_SAT', 'COMMINICATION_SAT', 'FIRST_BILL_SAT',
       'AGENT_KNOWLEDGE'],
      dtype='object')
Index(['COLUMN_4', 'VOLT_FLAG', 'SURVEY_ID', 'SCORE', 'LTR_COMMENT',
       'PRIMARY_REASON', 'TO_CHAR', 'CONNECTION_TIME', 'SALES_PERSON_SAT',
       'SALES_FRIENDLY_SAT', 'COMMINICATION_SAT', 'FIRST_BILL_SAT',
       'AGENT_KNOWLEDGE'],
      dtype='object')


In [10]:
# Combine the datasets, since the columns are exactly the same
df = pd.concat([df_control, df_pilot], ignore_index=True)
# Segment dataset
df_v, df_nv, df_v_control, df_nv_control, df_v_treatment, df_nv_treatment = data_segmentation(df)

In [11]:
# Confirm all data segments are correct
print(len(df_v))
print(len(df_nv))
print(len(df_v_control))
print(len(df_nv_control))
print(len(df_v_treatment))
print(len(df_nv_treatment))

241
341
158
177
83
164


#### Topic Modelling and Theme Discovery

##### Clean Corpus


In [12]:
general_corpus = obtain_corpus(df)
norm_corpus = normalise_corpus(general_corpus)
vectoriser, tfidf_matrix = build_feature_matrix(norm_corpus, feature_type='tfidf')

total_topics = 2
lda = LatentDirichletAllocation(n_topics=total_topics, max_iter=100, learning_method='online', learning_offset=50., random_state=42)
lda.fit(tfidf_matrix)

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - '/Users/raymondguo/nltk_data'
    - '/Users/raymondguo/opt/anaconda3/envs/cowryenv/nltk_data'
    - '/Users/raymondguo/opt/anaconda3/envs/cowryenv/share/nltk_data'
    - '/Users/raymondguo/opt/anaconda3/envs/cowryenv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
feature_names = vectoriser.get_feature_names()
weights = lda.components_

topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics, total_topics=total_topics, num_terms=8, display_weights=True)