In [1]:
# Load useful libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import nltk
import re
import unidecode
import langdetect

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF 

import itertools


In [2]:
# Read the data 
data = pd.read_csv('Data/labeled_data.csv', engine='python', encoding = 'utf-8')

In [4]:
# Keep only the features we are interested in by dropping the useless columns
data.drop(['battery_overheat','camera','connectivity','memory_storage','sound','water_damage'], axis=1, inplace=True)

In [None]:
data.to_csv('Data/labeled_data.csv',index=False)

# 1. Selecting and fine tuning a method

## 1.1. Feature Engineering

In [None]:
# Check that there is only English comments in the dataset.

def detectEnglish(text):
    try :
        return(langdetect.detect(text) == 'en')
    except :
        return('issue')   
    
to_keep = data.text.apply(detectEnglish)

# Issues spotted --> remove them
to_drop = data.iloc[to_keep[to_keep == 'issue'].index]
data = data.drop(to_drop.index)

data.to_csv('Data/labeled_data.csv',index=False)

In [5]:
# Split the data between train & test for fine tunign the algorithms before making the predictions.
df_train, df_test = train_test_split(data, test_size=0.2, random_state=42, stratify=data.issue)

In [6]:
# Define a stopwords dictionnary :
stopwords = nltk.corpus.stopwords.words('english')

# Keep the negative adverbs
stopwords.remove('no')
stopwords.remove('not')

In [7]:
# We want to keep the negative indicators (e.g. wouldn't --> keep not). 
# So we need to expand common English contractions
# To do so, we use a bit of code from StackOverFlow



# this code is not mine! i shamelessly copied it from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# all credits go to alko and arturomp @ stack overflow.
# basically, it's a big find/replace.

cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text.lower())

In [8]:
def preprocessing(string):
    string = str(string)
    # lower_case
    string = string.lower()
    # remove accents
    string = unidecode.unidecode(string)
    # expand English contractions
    string = expandContractions(string)
    # remove stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    string = pattern.sub('', string)
    
    
    # remove iphone
    pattern = re.compile(r'iphone')
    string = pattern.sub('', string)
    # remove apple
    pattern = re.compile(r'apple')
    string = pattern.sub('', string)
    # remove samsung
    pattern = re.compile(r'samsung')
    string = pattern.sub('', string)
    # remove galaxy
    pattern = re.compile(r'galaxy')
    string = pattern.sub('', string)
    
    
    # remove \n
    string = string.replace('\n', ' ')
    # remove special caracters like "" and punctuation
    string = re.sub('[^A-Za-z0-9 ]','', string)
    # lematize
    string = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(string,"v")
    string = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(string,"a")
    string = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(string)
    return(string)

In [9]:
# Compute TFIDF on 1,2,3 - grams

TFIDF = TfidfVectorizer(
      input='content',
      lowercase=False,
      preprocessor=preprocessing,
      ngram_range=(1,3))

# Compute the TFIDF matrix (+create a dictionnary ...)
tfidf_train = TFIDF.fit_transform(df_train.text)
tfidf_test = TFIDF.transform(df_test.text)

In [10]:
# Reduce dimension by using NMF
n_dimensions = 40

In [11]:
#NMF dimensionality reduction 
NMF_model = NMF(n_components=n_dimensions, random_state=42, alpha=.1, l1_ratio=.5)

X_train = pd.DataFrame(NMF_model.fit_transform(tfidf_train))
X_test = pd.DataFrame(NMF_model.transform(tfidf_test))

## 1.2. Building a predictive model for each issue

Our approach consists in building 7 binary-classifiers (one for each issue). As the data is very noisy with a lot of comments that do not talk about any issue, we find a lot of False Positives (comments actuallly related to an issue, but categorized as not talking about the issue).

The business case we are treating consists in filtering the enormous amount of data produced everyday on social media for identifying smartphone issues that might weaken the position of a smartphone manufacturer. 

Therefore our target metric is the F-score for predictions of issues.

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [15]:
hyperparameters = pd.DataFrame(index = data.columns[1:8], columns = ['learning_rate','max_depth'])

In [16]:
hyperparameters

Unnamed: 0,learning_rate,max_depth
apps_update,,
battery_life_charging,,
customerservice,,
locking_system,,
screen,,
software_bugs,,
system,,


In [17]:
# After having tried a couple algorithms (random forest, SVM, AdaBoost, GradientBoosting) we decided to use
# GradientBoosting

# The following chunk helps us fine-tuning the hyperparameters --> it takes a while

learning_rate_list = np.linspace(0.5,2,16)
depth_list = [3,4]

to_tune = []

for element in itertools.product(learning_rate_list,depth_list):
    to_tune += [element]
    


for issue in hyperparameters.index :
    
    print(issue, "being treated")
    
    f_score = []
    
    y_train_i = df_train[issue]
    y_test_i = df_test[issue]
    
    for param in to_tune :
        model = GradientBoostingClassifier(n_estimators=1000, random_state=42,\
                                           learning_rate=param[0],\
                                           max_depth=param[1])
        model.fit(X_train, y_train_i)
        f_score += [float(classification_report(y_test_i, model.predict(X_test)).split()[12])]
    
    index_best_param = np.argmax(f_score)
    best_param = to_tune[index_best_param]
    
    hyperparameters.loc[issue] = best_param

apps_update being treated
battery_life_charging being treated
customerservice being treated
locking_system being treated
screen being treated
software_bugs being treated
system being treated


In [18]:
hyperparameters.to_csv("data/hyperparameters.csv")

hyperparameters

Unnamed: 0,learning_rate,max_depth
apps_update,2.0,3
battery_life_charging,0.7,4
customerservice,0.9,3
locking_system,0.9,3
screen,1.0,3
software_bugs,1.4,4
system,1.5,4


# 2. Making predictions with our final model

In [19]:
df_learn = pd.read_csv('Data/labeled_data.csv', engine='python', encoding = 'utf-8')
df_predict = pd.read_csv('data/test_data.csv', engine='python', encoding = 'utf-8')

In [20]:
# We remove the variables that we do not have to predict in df_predict
to_remove = [issue for issue in df_predict.columns\
           if issue not in hyperparameters.index]
# But we definitely keep 'text' 
to_remove.remove('text')

df_predict.drop(to_remove, axis = 1, inplace=True)

In [21]:
TFIDF = TfidfVectorizer(
      input='content',
      lowercase=False,
      preprocessor=preprocessing,
      ngram_range=(1,3))

tfidf_learn = TFIDF.fit_transform(df_learn.text)
tfidf_predict = TFIDF.transform(df_predict.text)

n_dimensions = 40
NMF_model = NMF(n_components=n_dimensions, random_state=42, alpha=.1, l1_ratio=.5)

X_learn = pd.DataFrame(NMF_model.fit_transform(tfidf_learn))
X_predict = pd.DataFrame(NMF_model.transform(tfidf_predict))

In [22]:
# Make the predictions
for issue in hyperparameters.index :
    
    print(issue, "being predicted")
    
    y_learn_i = df_learn[issue]
    param = list(hyperparameters.loc[issue])
    model = GradientBoostingClassifier(n_estimators=1000, random_state=42,\
                                            learning_rate=param[0],\
                                            max_depth=param[1])
    model.fit(X_learn, y_learn_i)
    
    df_predict[issue] = model.predict(X_predict)

# Output the results
df_predict.to_csv('Data/predicted_data.csv')

apps_update being predicted
battery_life_charging being predicted
customerservice being predicted
locking_system being predicted
screen being predicted
software_bugs being predicted
system being predicted


In [23]:
import random

for issue in hyperparameters.index :
    print("*************************", issue, "*************************")
    s = sum(df_predict[issue])
    print("Nb of issues predicted:",s)
    print('\n')
    to_print = random.sample(list(df_predict[df_predict[issue]==1].text), 10)
    for elt in to_print:
        print(elt)
    print('\n')

************************* apps_update *************************
Nb of issues predicted: 119.0


it processes quickly and takes great pictures, but the battery life is not as good as my s7 active's was at 2 years old.
good battery life.
the battery lasts much longer than the s6 edge, the index finger security option is very convenient for on the go, and i love the orchid gray color!
i could not even just pay the difference for an s8+ with (theoretically) 30% more battery life.
impressed with updated features, including improved camera and increased battery life.
great product definetly recommend to anyone great battery life
been using this phone for a few months now, generally happy with the performance and battery life.
great features, beautiful design and execution and for me significantly improved battery life over my previous galaxy s5!
i have noticed the battery holds its charge much longer, but i suspect that would be true for any new phone compared to one that's almost two years 

Unfortunately we see that we do not managed to predict as well as expected with our simple model, indeed :
* A lot of positive comments fall between the cracks of the models
* Some negative comments are ill-classified (e.g. this comment was categorized as `software_bugs` but would have better been categorised as `locking_system` : _"the cons include unlocking my phone with finger prints doesn't always work"_.)

However, issues regarding `screen` and `locking_system` seem to be quite well categorized.


We believe that the points mentionned are due to the simplicity of the model, the lack of enough training data and the structure of the data :
* an overwhelming ratio of positive comments in what was scrapped. We think that people should have limited themselves to scrap negative comments by using metadata (number of stars, number of angry reacts on FB, etc.).
* `screen` and `locking_system` are features that people often complaint so the models captured them more easily.
* As the topic `system` is very vague, we think that a lot of people put in it issues that did'nt match a specific category. This is why we predict fairly well negative comments in it ...