# Data Cleaning - Service Innovation Dataset

Author: Rafael Ballestiero

In [320]:
import os, re
import pickle as pkl
import pandas as pd
import numpy as np

In [332]:
# Load the raw export, then discard every record that has no abstract text.
raw_records = pd.read_csv("./data/service_innovation/raw.csv", header=0)
df = raw_records.dropna(subset=["Abstract"])

In [221]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 301 entries, 0 to 301
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Year       301 non-null    int64 
 1   Reference  301 non-null    object
 2   Abstract   301 non-null    object
dtypes: int64(1), object(2)
memory usage: 9.4+ KB


## Drop Duplicates

First, we drop all rows that are exact duplicates of each other.

In [222]:
# Keep the first occurrence of every fully identical row and discard the rest.
df = df.loc[~df.duplicated()]

In [223]:
# Sanity check after exact de-duplication: at least one of the two key columns
# must be free of repeated values.
# `not` is used rather than `~`: on a plain Python bool `~` is a bitwise
# operator (`~True == -2`, which is truthy), so the check could never fail.
has_duplicate_abstracts = bool(df.duplicated(subset="Abstract").any())
has_duplicate_references = bool(df.duplicated(subset="Reference").any())
assert not has_duplicate_abstracts or not has_duplicate_references, (
    "both the Abstract and the Reference column still contain duplicated values"
)

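As pointed out, there are some papers that are referenced twice with very slight variations. Exact-duplicate removal does not catch these, so we match them by the start of the reference and keep only the first occurrence.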

In [224]:
# Leading fragments of the references that appear twice with slightly
# different wording; rows are matched against these by prefix.
duplicate_references = [
    "Tong, C., Nagarajan, M., & Cheng, Y. (2016). Operational impact of service innovations in ",
    "Easton, F. F., & Pullman, M. E. (2001). Optimizing service attributes: The seller's utility problem",
    "Xue, M., Hitt, L. M., & Harker, P. T. (2007). Customer efficiency",
]

In [225]:
# For every known duplicate prefix, keep the first matching row and collect
# the index labels of all later matches so they can be dropped in one go.
duplicate_indices = []
for prefix in duplicate_references:
    is_match = df["Reference"].str.startswith(prefix)
    matching_labels = df.loc[is_match].index
    # Slicing past the first label yields nothing for zero or one match.
    duplicate_indices.extend(matching_labels[1:].tolist())

# Renumber the rows 0..n-1 after removing the near-duplicates.
df = df.drop(duplicate_indices).reset_index(drop=True)

## Clean References

Given that the title includes information about the content of the paper, we want to be able to parse the titles from the provided references. The variety of formatting in the references makes this task difficult, although through a mixture of regex and manual changes to references we are able to extract what is needed.

In [243]:
# Title extraction pattern, written as a raw string so the regex escapes
# (\d, \(, \., \s) are not treated as invalid string escapes by Python:
#   [^\d]*          run of non-digit characters before the year
#   \([0-9]{4}\)    the publication year in parentheses, e.g. "(2016)"
#   \.?\s*          optional period and whitespace after the year
#   ([^\.]+\.?)     captured title: text up to and including the next period
#   .*              remainder of the reference
title_pattern = re.compile(r"[^\d]*\([0-9]{4}\)\.?\s*([^\.]+\.?).*")

# The pattern has a single capture group, so the title is column 0.
titles = df.Reference.str.extract(title_pattern)[0]

### Verify Titles

To verify the parsed titles, we first look at every title that contains a digit. Such titles can be valid, but they may also indicate a poorly formatted reference.

In [244]:
# Show the parsed titles that contain a digit. na=False treats a title that
# could not be extracted (NaN) as "no match" so the boolean mask stays valid.
titles[titles.str.contains(r"\d", regex=True, na=False)]

31     Switch to web 2-0 boosts business agility for ...
71             Customer experience blueprint drives B2B.
142    Leveraging smart specialisation strategies (RI...
174    Diffusion dynamics of sustainable innovation-i...
227    Developing a product-service system through a ...
231    Management approaches for industry 4-0 - the o...
247    Prosocial Compliance in P2P Lending: A Natural...
278    Environmental benefits of internet-enabled C2C...
Name: 0, dtype: object

In [228]:
# Hand-corrected references, keyed by row label in `df`. Each entry notes why
# the automatic title extraction needed help for that row.
reference_fixes = {
    # title before year
    13: "(2002). Director's forum. Laboratory Equipment, 38(12), 8. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=6395811&site=ehost-live",
    # 2.0 cuts title short
    31: 'Mohamed, A. (2007). Switch to web 2-0 boosts business agility for internet services firm. Computer Weekly, , 12-12. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=25040565&site=ehost-live',
    # title before year
    33: '(2007). Translucent green: Environmentally-friendly manufacturing processes are key concern of retailers and brands. Textile World, 157(6), 49-49. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=27662701&site=ehost-live',
    # title before year
    40: '(2008). Spiegel expands use of yunique software. Textile World, 158(4), 56-56. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=33767368&site=ehost-live',
    # title before year
    49: '(2009). Service experience and service design: Concepts and application in tourism SMEs. Managing Service Quality, 19(3), 332-349. doi:10.1108/09604520910955339 Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=39983391&site=ehost-live',
    # no period to finish title
    126: 'Comerio, M., Batini, C., Castelli, M., Grega, S., Rossetti, M., & Viscusi, G. (2015). Service portfolio management: A repository-based framework. doi:10.1016/j.jss.2015.01.055',
    # title before year
    151: "(2016). How to create a 'lights out' customer experience. Ivey Business Journal, , 5-7. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=113229191&site=ehost-live",
    # no period to finish title
    158: 'Peng, K., & Lin, P. M. C. (2016). Social entrepreneurs: Innovating rural tourism through the activism of service science. doi:10.1108/IJCHM-12-2014-0611',
    # no period to finish title
    208: 'Woo, J. (2017). How Chinese commercial banks innovate: process and practice. Journal of Innovation Management, 5(2), 81-110.',
    # title before year
    214: '(2018). Achieving competitive advantage. Strategic Direction, 34(10), 25-27. doi:10.1108/SD-06-2018-0145 Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=132294917&site=ehost-live',
    # 4.0 cuts title short
    231: 'MOHELSKA, H., & SOKOLOVA, M. (2018). Management approaches for industry 4-0 - the organizational culture perspective. Technological & Economic Development of Economy, 24(6), 2225-2240. doi:10.3846/tede.2018.6397 Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=133233122&site=ehost-live',
}

# Apply the corrections one row at a time, in the order listed above.
for row_label, corrected_reference in reference_fixes.items():
    df.loc[row_label, "Reference"] = corrected_reference

# Re-parse the titles now that the problematic references have been rewritten.
titles = df.Reference.str.extract(title_pattern)[0]

### Store Cleaned df

In [273]:
# Persist the cleaned frame next to the raw export.
clean_csv_path = "./data/service_innovation/clean.csv"
df.to_csv(clean_csv_path)

## Preprocess Corpus

In [277]:
from gensim.parsing import preprocessing
import pattern.en as en

### Remove Windows formatted punctuation

The raw csv was produced with a `cp1252` encoding, which led to mismatches in punctuation.

In [279]:
# Curly quotes, dashes and question marks to blank out. Written as a raw
# string so that `\?` reaches the regex engine as an escaped literal "?"
# instead of being parsed (with a warning) as an invalid string escape.
cp1252_pattern = re.compile(r"“|”|’|‘|—|–|–|\?")
def strip_cp1252_punctuation(s):
    """Replace every character matched by ``cp1252_pattern`` with a space.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` with each curly quote, dash or question mark replaced by a
        single space; all other characters are left untouched.
    """
    return cp1252_pattern.sub(" ", s)

### Lemmatize

We want to lemmatize the words in the abstracts to improve the clustering performance, while excluding the abbreviations that appear in them.

In [333]:
# Tokens that must be left as they are instead of being lemmatized. Repeated
# entries are harmless because the collection is a set.
abbreviations = {
    "pss", "iot", "abs", "business", "exs", "ict", "npd", "bmi", "iot", "cem",
    "sst", "ic", "qos", "oi", "om", "psf", "ai", "bm", "bo", "mc",
    "mosp", "msd", "pssldm", "fcbpss", "ffe", "fmea", "fo", "iis", "sc", "sdl",
    "si", "sp", "vsm", "xe", "cad", "cdf", "clscs", "cmm", "cx", "ks",
    "odf", "sspss", "bma", "bpm", "bsc", "exs", "fof", "kibs", "lbd", "lo",
    "moss", "plm", "pnss", "prs", "qfd", "sem", "som", "sp", "acm", "adkar",
    "catwoe", "cc", "ces", "cis", "cit", "clv", "cxm", "dfsi", "dsic", "fepss",
    "fsqca", "ilp", "moa", "mosc", "ri", "rpn", "rrs", "rsp", "scm", "slr",
    "sna", "spesa", "spss", "ssm", "sta", "tr", "vo", "wips",
}

# Save the set to disk as a pickle file.
with open("./data/service_innovation/abbreviations.pkl", "wb") as handle:
    pkl.dump(abbreviations, handle)

In [334]:
def lemmatize(s):
    """Lemmatize each whitespace-separated token of ``s``.

    Tokens found in ``abbreviations`` are passed through unchanged; every
    other token is replaced by ``en.lemma(token)``. The resulting tokens are
    joined back together with single spaces.
    """
    lemmas = []
    for token in s.split():
        if token in abbreviations:
            lemmas.append(token)
        else:
            lemmas.append(en.lemma(token))
    return " ".join(lemmas)

### Common Terms and Abbreviations

In [287]:
# Words filtered out of every document by strip_common_terms.
common_terms = [
    "service",
    "research",
    "study",
    "paper",
    "result",
    "based",
    "literature",
    "article",
    "focus",
]
def strip_common_terms(s):
    """Return ``s`` without the tokens listed in ``common_terms``.

    The text is split on whitespace, so the remaining tokens come back
    separated by single spaces.
    """
    kept = []
    for token in s.split():
        if token in common_terms:
            continue
        kept.append(token)
    return " ".join(kept)

### Synonyms

In [288]:
# Maps a spelling or wording variant to the single form used in the corpus.
synonyms = {
    "servitization": "servitisation",
    "consumer": "customer",
}
def standardize_synonyms(s):
    """Replace each token of ``s`` that is a key of ``synonyms`` by its value.

    Tokens are split on whitespace and re-joined with single spaces; tokens
    without an entry in ``synonyms`` are kept unchanged.
    """
    tokens = s.split()
    for position, token in enumerate(tokens):
        if token in synonyms:
            tokens[position] = synonyms[token]
    return " ".join(tokens)

### Apply Preprocessing

We want to apply all the functions defined above, as well as some from the Gensim library, and store the result.

In [325]:
# Each document is the parsed title followed by the abstract.
corpus = (titles + " " + df.Abstract)

# preprocess_string applies the filters in list order and returns a list of
# tokens, which is joined back into one string per document.
# strip_tags runs before any punctuation removal: strip_punctuation replaces
# "<" and ">" with spaces, so a tag filter placed after it has nothing left
# to match and the tag names would leak into the corpus as ordinary words.
corpus = corpus.apply(preprocessing.preprocess_string, filters=[
    lambda s: s.strip().lower(),
    preprocessing.strip_tags,
    strip_cp1252_punctuation,
    preprocessing.strip_punctuation,
    preprocessing.strip_multiple_whitespaces,
    preprocessing.strip_numeric,
    preprocessing.remove_stopwords,
    preprocessing.strip_short,
    lemmatize,
    strip_common_terms,
    standardize_synonyms
]).apply(" ".join)

# Store one preprocessed document per row, without the index column.
corpus.to_csv("./data/service_innovation/corpus.csv", index=False)