In [1]:
import graphlab
import re

In [2]:
# Limit number of worker processes. This preserves system memory, which prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)

This non-commercial license of GraphLab Create for academic use is assigned to quentin.picard@gmail.com and will expire on July 18, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1501982981.log


In [3]:
twits = graphlab.SFrame.read_csv('twits.csv', double_quote=False) #ignoring the parsing errors

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,int,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [4]:
twits.head(10)

User Name,body,created_at,id,sentiment,symbol
tactical,$AAUKF closing gap into model purchase zone on ...,2012-09-20T09:53:00Z,9604105,Bullish,AAUKF
Aaron Jackson,Most japanese ADRs are sitting near lows. V ...,2012-10-06T01:03:13Z,9858821,Bearish,MITSY
Ian,$SLW $RGLD $TRX Are Royalty Companies With ...,2013-06-27T00:42:21Z,14316534,Bullish,AAUKF
Market Impulses,$AXAHY moving towards the Objective after the ...,2014-04-08T09:30:12Z,21837298,Bearish,AXAHY
Marketmath,"$AAUKF AAL.L , seems to be forming a bearish ...",2014-08-04T11:05:10Z,25343226,Bearish,AAUKF
Michael Angelo Mustillo,Small bizjet orders picking up. ...,2014-08-12T15:02:18Z,25646970,Bullish,DASTY
etradebaby,$MTLS I think $12.82 would be a good spot to ...,2014-08-19T20:54:50Z,25917338,Bullish,DASTY
TraderX82,"$DASTY solid trend on the weekly, like the setup ...",2014-09-17T00:12:26Z,26864030,Bullish,DASTY
SpecialSits101,Would&#39;ve thought $FCX is the natural buyer for ...,2014-09-20T23:24:32Z,27028346,Bullish,REPYY
D.R.,$BNPQY $AIG $BCS $AXAHY $DB $SCGLY $OMVKY $EBKDY ...,2014-10-16T13:59:49Z,28059882,Bullish,AXAHY

username
TacticalQuant
ATMcharts
TheAmerican
Market_Impulses
marketmath
rocketPower
etradebaby
TraderX82
SpecialSits101
SFOscanner


In [5]:
len(twits)

7273

In [6]:
# remove English stopwords
twits['word_count'] = graphlab.text_analytics.count_words(twits['body']).dict_trim_by_keys(graphlab.text_analytics.stopwords(), True)


In [7]:
def clean_words(dico):
    for k, v in dico.items():
        if re.search('[0-9]', k)!=None or re.search('/', k)!=None or re.search('\$', k)!=None: #remove numbers/ stocks($) and http addresses
            dico.pop(k)
    return(dico)

In [8]:
twits['word_count'] = twits['word_count'].apply(clean_words)

In [9]:
twits.head(20)

User Name,body,created_at,id,sentiment,symbol
tactical,$AAUKF closing gap into model purchase zone on ...,2012-09-20T09:53:00Z,9604105,Bullish,AAUKF
Aaron Jackson,Most japanese ADRs are sitting near lows. V ...,2012-10-06T01:03:13Z,9858821,Bearish,MITSY
Ian,$SLW $RGLD $TRX Are Royalty Companies With ...,2013-06-27T00:42:21Z,14316534,Bullish,AAUKF
Market Impulses,$AXAHY moving towards the Objective after the ...,2014-04-08T09:30:12Z,21837298,Bearish,AXAHY
Marketmath,"$AAUKF AAL.L , seems to be forming a bearish ...",2014-08-04T11:05:10Z,25343226,Bearish,AAUKF
Michael Angelo Mustillo,Small bizjet orders picking up. ...,2014-08-12T15:02:18Z,25646970,Bullish,DASTY
etradebaby,$MTLS I think $12.82 would be a good spot to ...,2014-08-19T20:54:50Z,25917338,Bullish,DASTY
TraderX82,"$DASTY solid trend on the weekly, like the setup ...",2014-09-17T00:12:26Z,26864030,Bullish,DASTY
SpecialSits101,Would&#39;ve thought $FCX is the natural buyer for ...,2014-09-20T23:24:32Z,27028346,Bullish,REPYY
D.R.,$BNPQY $AIG $BCS $AXAHY $DB $SCGLY $OMVKY $EBKDY ...,2014-10-16T13:59:49Z,28059882,Bullish,AXAHY

username,word_count
TacticalQuant,"{'building': 1, 'purchase': 1, 'parti ..."
ATMcharts,"{'lows.': 1, 'trap?': 1, 'adrs': 1, 'japanese' ..."
TheAmerican,"{'form:': 1, 'royalty': 1, 'companies': 1, 'i ..."
Market_Impulses,"{'sell': 1, 'objective': 1, 'signal': 1, 'frid ..."
marketmath,"{'forming': 1, 'bearish': 1, ',': 1, 'scallop': 1, ..."
rocketPower,"{'up.': 1, 'small': 1, 'bizjet': 1, 'picking': ..."
etradebaby,"{'sell': 1, 'good': 1, 'spot': 1, 'short': 1, ..."
TraderX82,"{'sector': 1, 'weekly,': 1, 'trend': 1, ..."
SpecialSits101,"{'natural': 1, 'deep': 1, 'hearing': 1, 'thought': ..."
SFOscanner,"{'stockscan': 1, 'dividends': 1, ..."


In [10]:
twits['sentiment'].show(view='Categorical')

Canvas is accessible via web browser at the URL: http://localhost:55527/index.html
Opening Canvas in default web browser.


In [11]:
train_data,test_data = twits.random_split(.8)

In [12]:
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=['word_count'],
                                                     validation_set=test_data,
                                                     max_iterations=20)

In [13]:
graphlab.canvas.set_target('ipynb')

In [14]:
sentiment_model.evaluate(test_data, metric='roc_curve')

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+------+-----+
 | threshold |      fpr       |      tpr       |  p   |  n  |
 +-----------+----------------+----------------+------+-----+
 |    0.0    |      1.0       |      1.0       | 1142 | 359 |
 |   1e-05   | 0.841225626741 | 0.994746059545 | 1142 | 359 |
 |   2e-05   | 0.821727019499 | 0.993870402802 | 1142 | 359 |
 |   3e-05   | 0.807799442897 | 0.99299474606  | 1142 | 359 |
 |   4e-05   | 0.796657381616 | 0.99299474606  | 1142 | 359 |
 |   5e-05   | 0.796657381616 | 0.99299474606  | 1142 | 359 |
 |   6e-05   | 0.788300835655 | 0.990367775832 | 1142 | 359 |
 |   7e-05   | 0.779944289694 | 0.989492119089 | 1142 | 359 |
 |   8e-05   | 0.779944289694 | 0.989492119089 | 1142 | 359 |
 |   9e-05   | 0.774373259053 | 0.988616462347 | 1142 | 359 |
 +-----------+----------------+----------------+------+-----+
 [100001 rows x 5 columns]

In [15]:
sentiment_model.show(view='Evaluation')

In [16]:
twits['predicted_sentiment'] = sentiment_model.predict(twits, output_type='probability')

In [17]:
twits = twits.sort('predicted_sentiment', ascending=False)
twits.head()

User Name,body,created_at,id,sentiment,symbol,username
Chris Waage,"$TDOC solid growth, solid market share, solid ...",2017-08-04T18:27:50Z,91050434,Bullish,TDOC,Waage
KC,$RAD one day we will wake up to the news of an PE ...,2017-08-02T17:05:17Z,90711828,Bullish,WBA,kcphaeton
Brian,$GES made a pretty penny. Still a LOT of room to ...,2017-08-04T08:18:26Z,90969776,Bullish,KR,brianq
Stefan,$ATVI Grabbing a bunch of calls tomorrow for an ...,2017-08-02T19:11:44Z,90734920,Bullish,ATVI,2Black2Strong
Bob,$QCOM Good News For Qualcomm. NXPI Reported ...,2017-08-04T12:57:53Z,90983676,Bullish,QCOM,HarvardGrad
Scott Price,$PANW huge move tomorrow up 5 bucks AH due CNBC ...,2017-08-03T22:06:34Z,90933840,Bullish,PANW,Nbeach
Ray Guardado,"$BBBY a lot of bullish indicators, bullish ...",2017-08-03T22:33:55Z,90936823,Bullish,BBBY,GuardCapital
Anthony Jones,$ATVI I normally don&#39;t bash stocks ...,2017-08-04T17:12:52Z,91036486,Bullish,ATVI,IamJ0NES
Bob,$QCOM Good News For Qualcomm. NXPI Reported ...,2017-08-03T22:49:49Z,90938465,Bullish,QCOM,HarvardGrad
Scott Gordon,$MSG Added ti position today (second add in 5 ...,2017-07-31T18:37:40Z,90448568,Bullish,MSG,scottagordon

word_count,predicted_sentiment
"{'company.': 1, 'scoop': 1, 'solid': 3, 'ride' ...",1.0
"{'purchase': 1, 'rad': 1, 'position': 1, 'deal' ...",1.0
"{'grow.': 1, 'penny.': 1, 'made': 1, 'room': 1, ...",1.0
"{'sentiment': 1, 'calls': 1, 'grabbing': 1, ...",1.0
"{'confirmed': 1, 'good': 1, 'financial': 1, ...",1.0
"{'target.': 1, 'huge': 1, 'cnbc': 1, 'goldman': 1, ...",1.0
"{'full': 1, 'accumulation;': 1, ...",1.0
"{'investment.': 1, 'overwatch': 1, 'gold': ...",1.0
"{'confirmed': 1, 'good': 1, 'financial': 1, ...",1.0
"{'days).': 1, 'great': 1, 'added': 1, 'price': 1, ...",1.0


In [18]:
#Most positive review
twits[0]['body']

'$TDOC solid growth, solid market share, solid company. Scoop up your shares now and ride the wave up!'

In [19]:
twits[1]['body']

'$RAD one day we will wake up to the news of an PE deal or Activist taking a position to get Rad on Track prior to $WBA closing on Purchase'

In [20]:
twits[2]['body']

'$GES made a pretty penny. Still a LOT of room to grow. Follow me for the best retail picks of the year. $AMC $VSI $GNC $KR $M all Extremely'

In [21]:
twits[-1]['body']

'$BA the Only thing that keeps this up is the war threat. Nobody is selling, I understand. But the longer peace remains the lower it goes'

In [22]:
twits[-2]['body']

'$AMAT This is a continuation from june 9th tech selloff. I understand but still painful with LRCX. Will take many weeks to get before june 9'

In [23]:
twits[-3]['body']

'$RIO misses on earnings, under investigation for fraudulent business behavior and we gap up to 52 wk high? Lmao short this POS'

In [24]:
#List of positive coefficients 
sentiment_model['coefficients'].sort('value', ascending=False).print_rows(num_rows=100)

+------------+--------------------------+---------+---------------+--------+
|    name    |          index           |  class  |     value     | stderr |
+------------+--------------------------+---------+---------------+--------+
| word_count |       previously,        | Bullish | 23.7832550364 |  None  |
| word_count |        this.....         | Bullish | 22.6907712729 |  None  |
| word_count |          chaps!          | Bullish | 19.0970342535 |  None  |
| word_count |          price:          | Bullish | 16.1137038214 |  None  |
| word_count |           otm            | Bullish | 15.4890290972 |  None  |
| word_count |          assume          | Bullish | 15.1786835995 |  None  |
| word_count |         breaks.          | Bullish | 14.5724305951 |  None  |
| word_count |           moon           | Bullish | 13.3346974578 |  None  |
| word_count |          asked           | Bullish | 13.0524817306 |  None  |
| word_count |          queues          | Bullish | 12.5021495496 |  None  |

In [25]:
sentiment_model['coefficients'].sort('value', ascending=True).print_rows(num_rows=100)

+------------+----------------------------+---------+----------------+--------+
|    name    |           index            |  class  |     value      | stderr |
+------------+----------------------------+---------+----------------+--------+
| word_count |         today....          | Bullish | -22.6080938591 |  None  |
| word_count |           puts!            | Bullish | -21.9632710639 |  None  |
| word_count |          (shorts)          | Bullish | -19.4759414035 |  None  |
| word_count |         accenture          | Bullish | -19.2602712699 |  None  |
| word_count |           aprox            | Bullish | -19.124431912  |  None  |
| word_count |            cvs             | Bullish | -18.7999192704 |  None  |
| word_count |          burning           | Bullish | -18.403781807  |  None  |
| word_count |         sideways.          | Bullish | -18.4019551156 |  None  |
| word_count |           risen            | Bullish | -17.4817210978 |  None  |
| word_count |           blame          