# Load data

In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator
np.random.seed(1)

# -- Load data
# Note: provided data was neither in JSON nor JSON lines format (http://jsonlines.org/). Fixed offline with:
# gunzip -k profile_type_training.csv.gz && cat profile_type_training.csv | sed 's#,$##' > profile_type_training.jsonl
all_data = pd.read_json('profile_type_training.jsonl', lines=True)

# Keep only rows whose `lang` is 'en'. The explicit .copy() makes the filtered
# result an independent frame, so the column assignment below cannot trigger
# pandas' SettingWithCopyWarning.
all_data = all_data[all_data.lang == 'en'].copy()

# Fold the 'celebrity' label into 'influencer'; every other label is left as-is.
all_data['label'] = all_data['label'].replace('celebrity', 'influencer')
print('Read data:\n', all_data['label'].value_counts())

Read data:
 influencer        49560
brand             27564
news and media    13773
Name: label, dtype: int64


# Create train and test sets

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable between runs.
feature_frame = all_data.loc[:, ['username', 'name', 'bio', 'follower_count']]
label_series = all_data.label
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    label_series,
    test_size=0.2,
    random_state=42,
)

print('Train samples:', len(y_train))
print('Test samples:', len(y_test))

Train samples: 72717
Test samples: 18180


# Create text feature extraction pipeline
Inspired by http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

# Inner pipeline: this is the estimator that gets calibrated. TF-IDF re-weights
# the token counts produced by the outer CountVectorizer, then a linear model is
# trained with SGD.
# random_state is set explicitly because the grid search below fits candidates in
# parallel workers (n_jobs=-1), which are not guaranteed to inherit NumPy's
# global seed.
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('sgd', SGDClassifier(max_iter=1000, tol=1e-3, random_state=1))
])

# Grid of hyper-parameters to search. Keys follow the nesting
# calibrated_pipeline -> 'cal' -> base_estimator (inner pipeline) -> 'sgd'.
# NOTE: newer scikit-learn releases rename CalibratedClassifierCV's
# `base_estimator` argument to `estimator`; when upgrading, update the keyword
# below and the two `cal__base_estimator__*` keys together.
parameters = {
    'vect__max_df': (0.25, 0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    'cal__base_estimator__sgd__alpha': (0.00001, 0.000001),
    'cal__base_estimator__sgd__penalty': ('l2', 'elasticnet')
}

# Isotonic fit should be ok as we have >> 1000 samples (http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html#sklearn.calibration.CalibratedClassifierCV).
# cv=3 is spelled out (here and in the grid search) so the number of folds does
# not silently change with the installed scikit-learn version's default.
calibrated_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('cal', CalibratedClassifierCV(base_estimator=pipeline, method='isotonic', cv=3))
])

grid_search = GridSearchCV(calibrated_pipeline, parameters, cv=3, n_jobs=-1, verbose=1)

# Train and calibrate classifier

In [7]:
from pprint import pprint
from time import time
import logging

# Configure root logging at INFO level with a timestamped message format
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

# Fit every parameter combination on the training bios and time the whole search
print('Performing grid search...')
search_started = time()
grid_search.fit(X_train.bio, y_train)
elapsed_seconds = time() - search_started
print('done in %0.3fs' % elapsed_seconds)
print()

# Report the best cross-validated score and the winning value of each searched parameter
print('Best score: {0:.1f}%'.format(grid_search.best_score_ * 100))
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:  5.9min finished


done in 367.485s

Best score: 85.6%
Best parameters set:
	cal__base_estimator__sgd__alpha: 1e-05
	cal__base_estimator__sgd__penalty: 'elasticnet'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)


# Get best, calibrated classifier

In [9]:
# best_estimator_ is the full pipeline (vectorizer + calibrated classifier) that
# GridSearchCV refit on the training data using the best parameter combination
# it found; this relies on GridSearchCV's default refit=True.
clf = grid_search.best_estimator_

# Test set scoring

Now we confirm that the classifier reaches reasonable accuracy (80%+) on the held-out test split, having been trained using just the training data provided:

In [10]:
# Mean accuracy of the selected pipeline on the held-out test bios
test_bios = X_test['bio']
accuracy = clf.score(test_bios, y_test)
print("Accuracy: {0:.1f}%".format(100 * accuracy))

Accuracy: 86.5%


# Export model

In [13]:
# scikit-learn used to bundle joblib as sklearn.externals.joblib, but that alias
# was deprecated and later removed. Prefer the standalone joblib package (it is
# installed as a scikit-learn dependency) and fall back to the bundled copy only
# on old installs where the standalone import is unavailable.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted, calibrated pipeline to disk so it can be loaded for inference
joblib.dump(clf, 'clf.pkl')

['clf.pkl']

# Example inferences

Just for demonstration, load the exported model (as will be done in the API), predict class probabilities for the bios in the test set, and show the first few results.

In [92]:
# Reload the exported model from disk rather than reusing the in-memory `clf`
loadedClf = joblib.load('clf.pkl')

# One probability column per class, row-aligned with the test set index
predictedProbabilities = loadedClf.predict_proba(X_test['bio'])
probability_frame = pd.DataFrame(
    predictedProbabilities,
    index=X_test.index,
    columns=loadedClf.classes_,
)

# Put the true label, username and bio next to the predicted probabilities
results = pd.concat([probability_frame, y_test, X_test[['username', 'bio']]], axis=1)

# Show the first rows with the three probability columns rendered as whole percentages
percent_format = '{:.0%}'
results.head(20).style.format({
    'influencer': percent_format,
    'brand': percent_format,
    'news and media': percent_format
})

Unnamed: 0,brand,influencer,news and media,label,username,bio
49666,97%,0%,3%,brand,BirdsongGregory,"As a retail and B2B branding and marketing agency, we deliver tangible results for regional, national, and global clients. 704.332.2299"
104651,3%,96%,1%,influencer,conradcoates,"Actor and curious, evolving human being. https://t.co/WPPIn2DpX9 #Defiance"
50434,0%,100%,0%,influencer,TipsyHeelz,Blogger&Mom . Love all things Fashion and Beauty. Blog http://www.tipsyheelz.com Jewelry https://zazzy.co/designer/Glammore/
48818,90%,1%,9%,brand,Colnagoworld,"History, technology, passion : we are Colnago"
40350,99%,0%,1%,brand,obelisksys,Official Twitter for Obelisk Systems! We are one of Australia's first space hardware startups. Developing world class products for both industry and education.
128998,2%,0%,98%,news and media,SourceMag,"Source is Scotland's Number One student magazine, bringing you the best in careers, celebrity and student life every quarter!"
129047,15%,0%,85%,news and media,theimproper,The Improper Bostonian is the premier entertainment and lifestyle guide for the city of Boston.
116910,0%,1%,99%,news and media,mzansimusicmag,South Africa's Online Hottest Music Magazine
68988,3%,97%,0%,influencer,Mommyshangout,"#ATL #Mommy #Blogger, Marketing & Social Media Consultant who never leaves home with out her iPhone #teamIphone"
40705,97%,0%,3%,brand,EUEnvironment,"Official channel of the European Environment Agency (EEA), an agency of the European Union. We provide sound, independent information on Europe's environment."
