In [13]:
import numpy as np
import pandas as pd
from pprint import pprint
from time import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [24]:
categories = pd.read_csv('/home/e00401/Documents/IC_11_6_2018/b2b-b2c-classifier-master/train_b2b_b2c.csv')
em = categories.dropna(axis=0)


In [25]:
em.sample(3)

Unnamed: 0,Description,Title
486,custom upholstery furniture | slipcover sofas...,B2B
592,"charlotte russe: fashion women's clothing, dr...",B2C
910,quality workwear & apparel | dickies official...,B2C


In [26]:
em['Title'].value_counts()

B2B    568
B2C    382
Name: Title, dtype: int64

In [27]:
def pre_process_text(textArray):
    #If using stemming...
    #stemmer = PorterStemmer()
    wnl = WordNetLemmatizer()
    processed_text = []
    for text in textArray:
        words_list = (str(text).lower()).split()
        final_words = [wnl.lemmatize(word) for word in words_list if word not in stopwords.words('english')]
        #If using stemming...
        #final_words = [stemmer.stem(word) for word in words_list if word not in stopwords.words('english')]
        final_words_str = str((" ".join(final_words)))
        processed_text.append(final_words_str)
    return processed_text

em['Description'] = pre_process_text(em['Description'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [28]:
categories = [ 'B2C', 'B2B']

In [29]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]);

In [30]:
parameters = {
    'vect__max_df': (0.5, 1.0),#0.6, 0.7, 0.8, 0.9, 1.0),
    'vect__max_features': (None, 1000, 5000),#2000, 3000, 4000, 5000, 6000, 10000, 20000, 30000, 40000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),#, (1, 3)),  # unigrams or bigrams or trigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.1, 0.01, 0.001),#, 0.0001, 0.00001, 0.000001, 0.0000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50)#, 100, 200, 300, 400, 500, 100),
}

In [31]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, refit=True)

print("Grid Search started\n---------------------------------------")
print("Pipeline:", [name for name, _ in pipeline.steps])
print("Grid Search Parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(np.array(em['Description']), np.array(em['Title']))
print("done in %0.3fs\n----------------------------------------------" % (time() - t0))

print("Best Score: %0.3f\n-------------------------------------------" % grid_search.best_score_)
print("Best Parameters:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Grid Search started
---------------------------------------
Pipeline: ['vect', 'tfidf', 'clf']
Grid Search Parameters:
{'clf__alpha': (0.1, 0.01, 0.001),
 'clf__n_iter': (10, 50),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 1.0),
 'vect__max_features': (None, 1000, 5000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 576 candidates, totalling 1728 fits




[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.0s








[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.0min














[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.9min




















[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 12.5min


























[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 19.7min




























[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed: 27.4min finished


done in 1645.723s
----------------------------------------------
Best Score: 0.808
-------------------------------------------
Best Parameters:
	clf__alpha: 0.001
	clf__n_iter: 10
	clf__penalty: 'l2'
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 1.0
	vect__max_features: None
	vect__ngram_range: (1, 2)




In [33]:
import os
import json
df=pd.DataFrame(columns = ['URL','B2C_B2B'])
for fl in os.listdir("/home/e00401/Desktop/output_files"):
        print(fl)
        print("----------Starting-------")
        test_set=[]
        with open("/home/e00401/Desktop/output_files/"+fl,'r') as f:
                d = json.load(f)
                if d.get("all_text"):
                    test_set.append(d.get("all_text"))
                data=grid_search.best_estimator_.predict(np.array(test_set))
                print(data)
                raw_data = {'URL':fl , 'B2C_B2B':data,
                }
#                 print(raw_data)
                df=df.append([raw_data])
print(df)
df.to_csv("test_b2c.csv")

ppbi.com.txt
----------Starting-------
['B2C']
homedepot.com.txt
----------Starting-------
['B2C']
polyvore.com.txt
----------Starting-------
['B2C']
boykin.com.txt
----------Starting-------
['B2C']
customculinary.com.txt
----------Starting-------
['B2B']
heritagegolfgroup.com.txt
----------Starting-------
['B2C']
kayak.com.txt
----------Starting-------
['B2C']
softchoice.com.txt
----------Starting-------
['B2B']
instamed.com.txt
----------Starting-------
['B2B']
ba-reps.com.txt
----------Starting-------
['B2B']
brooksrunning.com.txt
----------Starting-------
['B2C']
newwestrecords.com.txt
----------Starting-------
['B2B']
stumptowncoffee.com.txt
----------Starting-------
['B2B']
leedsworld.com.txt
----------Starting-------
['B2C']
firstbankonline.com.txt
----------Starting-------
['B2C']
vfc.com.txt
----------Starting-------
['B2B']
telvue.com.txt
----------Starting-------
['B2B']
cbsinteractive.com.txt
----------Starting-------
['B2C']
bachmans.com.txt
----------Starting-------
['B2C

['B2C']
mophie.com.txt
----------Starting-------
['B2C']
cedarfair.com.txt
----------Starting-------
['B2B']
zumiez.com.txt
----------Starting-------
['B2C']
nytimes.com.txt
----------Starting-------
['B2C']
northpointe.com.txt
----------Starting-------
['B2C']
userfriendlymedia.com.txt
----------Starting-------
['B2B']
mlive.com.txt
----------Starting-------
['B2C']
gilchristsoames.com.txt
----------Starting-------
['B2C']
ramybrook.com.txt
----------Starting-------
['B2C']
lagos.com.txt
----------Starting-------
['B2C']
clearent.com.txt
----------Starting-------
['B2B']
goldcoastbroadcasting.com.txt
----------Starting-------
['B2B']
tristatecamera.com.txt
----------Starting-------
['B2B']
fye.com.txt
----------Starting-------
['B2C']
smashbox.com.txt
----------Starting-------
['B2C']
saintbernard.com.txt
----------Starting-------
['B2C']
enervee.com.txt
----------Starting-------
['B2B']
atlasobscura.com.txt
----------Starting-------
['B2B']
peoplesbanknc.com.txt
----------Starting---

In [24]:
test_set = ['trade analytics trade cost analysis trade post fx trade analytics trade analysis post trade trading cost analysis transaction cost analysis transition management transaction cost analysis post trade analysis fx tca fx transaction cost analysis transition management fx transaction cost analysis transition management transaction cost analysis equity post trade analytics pre trade analysis transition manager transaction costs analysis global tca fxtransparency pre trade analytics transaction cost analysis providers fx tca forex tca global trade compliance management global trade compliance analytics sec trading compliance compliance management equities compliance compliance plus compliance abel noser compliance compliance analytics tca compliance hedge funds buy-side sell-side compliance buy side compliance broker dealer compliance compliance oversight risk management abel noser - the leaders in trade analytics, institutional agency trading, global compliance services and transition management.\u00a0\u00a0 your advocate in a changing financial world.\u00a0 leave the data to us!\u00a0  about usabout abel nosermanagementnewsanalyticsproducts & servicespre-trade tca l\u00a0 trade compassreal-time tca\u00a0 l\u00a0 trade pulsepost-trade tca\u00a0 l\u00a0 trade-zoomzeno an solutionsmifid ii & priipsbig datavideoscompliancecompliance solutionscompliance plusliquidity plustradingtrading servicesportfolio executioncommission managementalgorithm suitesoft-dollar servicestransition mgmtconsultingour products & servicesabel noser has long been respected as a leader in the campaign to lower the costs associated with trading. to that end, our brokerage provides a range of trading services and trade analytics to institutional asset owners, investment managers and brokers. abel noser holdings llc is also the industry leading provider of transaction cost analysis (tca) services through our subsidiary abel noser solutions. over 500 global clients subscribe to our multi-asset tca and compliance products through a network of resellers, distribution partners and strategic alliances.our products & services\u00a0\u00a0ted morganchief executive officerabel noser holdings\u00a0presidentabel noser, llcpeter weilerpresidentabel noser solutionssteve glassprincipalzeno an solutionsrandy newchief technology officerabel noser solutionsmatthew sullivanchief financial officerabel noser holdingsed danielechief compliance officerabel noser, llcstanley abelchairman emeritusgene noserchairman emeritusquestions?\u00a0 ask us ...abel noser offers institutional investors trade cost measurement, compliance products, goal-oriented trading, and low commission rates to help our clients improve their investment performance and support a rigorous compliance process.\u00a0abel noser, llc is an agency-only broker, finra & sipc registered.\u00a0\u00a0our missionto stay up-to-date with abel noser, please connect with us on social media or email us at info@abelnoser.com.\u00a0careersnewssitemap\u00a0contact usglobalabout usanalyticscompliancetradingtransition mgmtconsultingx   about usabout abel nosermanagementnewsanalyticsproducts & servicespre-trade tca l\u00a0 trade compassreal-time tca\u00a0 l\u00a0 trade pulsepost-trade tca\u00a0 l\u00a0 trade-zoomzeno an solutionsmifid ii & priipsbig datavideoscompliancecompliance solutionscompliance plusliquidity plustradingtrading servicesportfolio executioncommission managementalgorithm suitesoft-dollar servicestransition mgmtconsultingour products & servicesabel noser has long been respected as a leader in the campaign to lower the costs associated with trading. to that end, our brokerage provides a range of trading services and trade analytics to institutional asset owners, investment managers and brokers. abel noser holdings llc is also the industry leading provider of transaction cost analysis (tca) services through our subsidiary abel noser solutions. over 500 global clients subscribe to our multi-asset tca and compliance products through a network of resellers, distribution partners and strategic alliances.our products & services\u00a0\u00a0ted morganchief executive officerabel noser holdings\u00a0presidentabel noser, llcpeter weilerpresidentabel noser solutionssteve glassprincipalzeno an solutionsrandy newchief technology officerabel noser solutionsmatthew sullivanchief financial officerabel noser holdingsed danielechief compliance officerabel noser, llcstanley abelchairman emeritusgene noserchairman emeritusquestions?\u00a0 ask us ...abel noser offers institutional investors trade cost measurement, compliance products, goal-oriented trading, and low commission rates to help our clients improve their investment performance and support a rigorous compliance process.\u00a0abel noser, llc is an agency-only broker, finra & sipc registered.\u00a0\u00a0our missionto stay up-to-date with abel noser, please connect with us on social media or email us at info@abelnoser.com.\u00a0careersnewssitemap\u00a0contact usglobalabout usanalyticscompliancetradingtransition mgmtconsultingx about usabout abel nosermanagementnewsanalyticsproducts & servicespre-trade tca l\u00a0 trade compassreal-time tca\u00a0 l\u00a0 trade pulsepost-trade tca\u00a0 l\u00a0 trade-zoomzeno an solutionsmifid ii & priipsbig datavideoscompliancecompliance solutionscompliance plusliquidity plustradingtrading servicesportfolio executioncommission managementalgorithm suitesoft-dollar servicestransition mgmtconsulting\u00a0\u00a0\u00a0request a demorequest a demoabel noser opens new office in atlantaaugust 13, 2018expansion to support clients located in the southeast and comes on the heels of company\u00e2\u0080\u0099s recent acquisition of another leading tca providerread more >abel noser holdings closes zeno consulting group acquisitionjune 4, 2018acquisition combines the offerings of the two pioneers and leading providers of trade analytics services to the institutional community.read more >ans teams up with factset research systems for multi\u00e2\u0080\u0093asset best execution and regulatory reportingdecember 6, 2017partnership provides fund managers with comprehensive products and best execution services.read more >\u00a0see more news >there a reason why so many firms trust abel noserdata securityclient advocacyfocused on valuewe strive to go beyond industry standards to make sure your financial data is kept\u00a0 confidential and secure. scores of clients have entrusted their trade data to us. so can you.our client services group is top-notch; from ultra-quick response times and custom reporting to a dedicated service team who can focus on your real-time issues.we never trade for our own account, ever. this means our mission is squarely focused on achieving your goals more quickly and more cost-effectively than the competition.learn morelearn morelearn more\u00a0read more ...contact our global officebenchmark clarity = peace-of-minduse abel noser comprehensive trade surveillance and analytics tools to bring your trade compliance, mifid ii, priips and liquidity risk management programs up-to-date.learn moreclient services$7.5tannual equities data70equity exchanges54ksecurities650kfixed income securitieswant a free demo trial?click below to learn how our powerful end-to-end trading and transaction surveillance can add value; and save you money.\u00a0\u00a0get started\u00a0careersnewssitemap\u00a0contact usglobalwhat we do\u00a0request a demo\u00a0about usanalyticscompliancetradingtransition mgmtconsultingx.trusted partner, lower costsabel noser opens new office in atlantaabel noser holdings closes zeno consulting group acquisitionans expands london office to address growing market demand for its tca and compliance productsread articles']

In [25]:
grid_search.best_estimator_.predict(np.array(test_set))

array(['B2C'], dtype='<U3')