In [1]:
import graphlab
import re

In [2]:
# Cap the pool of pylambda worker processes so the kernel uses less memory;
# hosted notebooks otherwise tend to crash.
NUM_PYLAMBDA_WORKERS = 4
graphlab.set_runtime_config(
    'GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS',
    NUM_PYLAMBDA_WORKERS,
)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1502453972.log


This non-commercial license of GraphLab Create for academic use is assigned to quentin.picard@gmail.com and will expire on July 18, 2018.


In [3]:
# Load the twits from CSV into an SFrame. double_quote=False was added, per the
# original note, to get past parsing errors in the file.
# NOTE(review): confirm this flag is really what lets the file load, and check
# whether any rows are silently skipped during parsing.
twits = graphlab.SFrame.read_csv('twits.csv', double_quote=False)

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,int,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [4]:
# Preview the first 10 rows of the freshly loaded table.
twits.head(10)

User Name,body,created_at,id,sentiment,symbol
tactical,$AAUKF closing gap into model purchase zone on ...,2012-09-20T09:53:00Z,9604105,Bullish,AAUKF
Aaron Jackson,Most japanese ADRs are sitting near lows. V ...,2012-10-06T01:03:13Z,9858821,Bearish,MITSY
Ian,$SLW $RGLD $TRX Are Royalty Companies With ...,2013-06-27T00:42:21Z,14316534,Bullish,AAUKF
Market Impulses,$AXAHY moving towards the Objective after the ...,2014-04-08T09:30:12Z,21837298,Bearish,AXAHY
Marketmath,"$AAUKF AAL.L , seems to be forming a bearish ...",2014-08-04T11:05:10Z,25343226,Bearish,AAUKF
Michael Angelo Mustillo,Small bizjet orders picking up. ...,2014-08-12T15:02:18Z,25646970,Bullish,DASTY
etradebaby,$MTLS I think $12.82 would be a good spot to ...,2014-08-19T20:54:50Z,25917338,Bullish,DASTY
TraderX82,"$DASTY solid trend on the weekly, like the setup ...",2014-09-17T00:12:26Z,26864030,Bullish,DASTY
SpecialSits101,Would&#39;ve thought $FCX is the natural buyer for ...,2014-09-20T23:24:32Z,27028346,Bullish,REPYY
D.R.,$BNPQY $AIG $BCS $AXAHY $DB $SCGLY $OMVKY $EBKDY ...,2014-10-16T13:59:49Z,28059882,Bullish,AXAHY

username
TacticalQuant
ATMcharts
TheAmerican
Market_Impulses
marketmath
rocketPower
etradebaby
TraderX82
SpecialSits101
SFOscanner


In [5]:
# Number of rows that were loaded.
len(twits)

21280

In [6]:
# Build a per-twit bag of words from the message body, then drop the English
# stopwords (the second argument, True, asks for the listed keys to be excluded).
body_word_counts = graphlab.text_analytics.count_words(twits['body'])
english_stopwords = graphlab.text_analytics.stopwords()
twits['word_count'] = body_word_counts.dict_trim_by_keys(english_stopwords, True)


In [7]:
def clean_words(dico):
    """Strip noisy tokens from a word-count dictionary.

    A key is removed when it contains a digit, a forward slash or a dollar
    sign. Per the original inline note these target numbers, http addresses
    and ``$TICKER`` stock symbols.

    Args:
        dico: dict mapping a token (str) to its count. Modified in place.

    Returns:
        The same ``dico`` object, with every matching key deleted.
    """
    # Collect the offending keys before deleting anything: removing entries
    # while iterating the live dict raises RuntimeError on Python 3, where
    # ``items()`` is a view rather than a list copy.
    # One raw-string character class replaces the three separate searches;
    # inside ``[...]`` the ``$`` is a literal dollar sign.
    noisy_tokens = [word for word in dico if re.search(r'[0-9/$]', word) is not None]
    for word in noisy_tokens:
        del dico[word]
    return dico

In [8]:
# Run clean_words on each row's word-count dict and overwrite the column with the result.
twits['word_count'] = twits['word_count'].apply(clean_words)

In [9]:
# Preview the table again, now that it carries the word_count column.
twits.head(20)

User Name,body,created_at,id,sentiment,symbol
tactical,$AAUKF closing gap into model purchase zone on ...,2012-09-20T09:53:00Z,9604105,Bullish,AAUKF
Aaron Jackson,Most japanese ADRs are sitting near lows. V ...,2012-10-06T01:03:13Z,9858821,Bearish,MITSY
Ian,$SLW $RGLD $TRX Are Royalty Companies With ...,2013-06-27T00:42:21Z,14316534,Bullish,AAUKF
Market Impulses,$AXAHY moving towards the Objective after the ...,2014-04-08T09:30:12Z,21837298,Bearish,AXAHY
Marketmath,"$AAUKF AAL.L , seems to be forming a bearish ...",2014-08-04T11:05:10Z,25343226,Bearish,AAUKF
Michael Angelo Mustillo,Small bizjet orders picking up. ...,2014-08-12T15:02:18Z,25646970,Bullish,DASTY
etradebaby,$MTLS I think $12.82 would be a good spot to ...,2014-08-19T20:54:50Z,25917338,Bullish,DASTY
TraderX82,"$DASTY solid trend on the weekly, like the setup ...",2014-09-17T00:12:26Z,26864030,Bullish,DASTY
SpecialSits101,Would&#39;ve thought $FCX is the natural buyer for ...,2014-09-20T23:24:32Z,27028346,Bullish,REPYY
D.R.,$BNPQY $AIG $BCS $AXAHY $DB $SCGLY $OMVKY $EBKDY ...,2014-10-16T13:59:49Z,28059882,Bullish,AXAHY

username,word_count
TacticalQuant,"{'building': 1, 'purchase': 1, 'parti ..."
ATMcharts,"{'lows.': 1, 'trap?': 1, 'adrs': 1, 'japanese' ..."
TheAmerican,"{'form:': 1, 'royalty': 1, 'companies': 1, 'i ..."
Market_Impulses,"{'sell': 1, 'objective': 1, 'signal': 1, 'frid ..."
marketmath,"{'forming': 1, 'bearish': 1, ',': 1, 'scallop': 1, ..."
rocketPower,"{'up.': 1, 'small': 1, 'bizjet': 1, 'picking': ..."
etradebaby,"{'sell': 1, 'good': 1, 'spot': 1, 'short': 1, ..."
TraderX82,"{'sector': 1, 'weekly,': 1, 'trend': 1, ..."
SpecialSits101,"{'natural': 1, 'deep': 1, 'hearing': 1, 'thought': ..."
SFOscanner,"{'stockscan': 1, 'dividends': 1, ..."


In [26]:
# Point GraphLab Canvas at the notebook, then show the sentiment column
# using the 'Categorical' view.
graphlab.canvas.set_target('ipynb')
twits['sentiment'].show(view='Categorical')

In [11]:
# 80/20 train/test split. The fixed seed keeps the split, and every metric
# computed from it, identical across kernel restarts.
train_data,test_data = twits.random_split(.8, seed=0)

In [12]:
# Fit a logistic classifier that predicts 'sentiment' from the word_count
# column alone. The held-out split is supplied as the validation set and
# training is limited to 20 iterations.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data,
    max_iterations=20,
)

In [13]:
# Point GraphLab Canvas at the notebook before the model's .show() call further down.
graphlab.canvas.set_target('ipynb')

In [14]:
# Evaluate the model on the test split, requesting only the ROC curve.
sentiment_model.evaluate(test_data, metric='roc_curve')

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+------+-----+
 | threshold |      fpr       |      tpr       |  p   |  n  |
 +-----------+----------------+----------------+------+-----+
 |    0.0    |      1.0       |      1.0       | 3226 | 990 |
 |   1e-05   | 0.905050505051 | 0.992250464972 | 3226 | 990 |
 |   2e-05   | 0.880808080808 | 0.990700557967 | 3226 | 990 |
 |   3e-05   | 0.871717171717 | 0.989770613763 | 3226 | 990 |
 |   4e-05   | 0.860606060606 | 0.98884066956  | 3226 | 990 |
 |   5e-05   | 0.855555555556 | 0.988220706758 | 3226 | 990 |
 |   6e-05   | 0.854545454545 | 0.987910725356 | 3226 | 990 |
 |   7e-05   | 0.851515151515 | 0.987910725356 | 3226 | 990 |
 |   8e-05   | 0.847474747475 | 0.987600743955 | 3226 | 990 |
 |   9e-05   | 0.845454545455 | 0.987290762554 | 3226 | 990 |
 +-----------+----------------+----------------+------+-----+
 [100001 rows x 5 columns]

In [15]:
# Open the model's 'Evaluation' view.
sentiment_model.show(view='Evaluation')

In [16]:
# Score every row of twits (training and test rows alike) with the fitted model.
# output_type='probability' stores a probability rather than a class label;
# presumably P(sentiment == 'Bullish') - TODO confirm which class is treated as positive.
twits['predicted_sentiment'] = sentiment_model.predict(twits, output_type='probability')

In [17]:
# Order rows from highest to lowest predicted_sentiment and preview the top rows.
# This rebinds twits, so the positional lookups in later cells follow the sorted order.
twits = twits.sort('predicted_sentiment', ascending=False)
twits.head()

User Name,body,created_at,id,sentiment,symbol
UseToBe4RingsPatsRule,$SGYP The usual games. A Positive article on SA ...,2017-08-04T17:20:50Z,91037914,Bullish,IRWD
Stockflare Alerts,$ON: ON Semiconductor Corp is now forecast to ...,2017-08-08T09:45:37Z,91291896,Bullish,ON
Kyle Becker,"$AMZN Consolidate, grind higher. Consolidate, ...",2017-08-03T15:22:02Z,90851803,Bullish,AMZN
Stockflare Alerts,$EDN: Empresa Distribuidora y Cmrz Nrt ...,2017-06-23T09:30:15Z,86835974,Bullish,EDN
REALALOY,$BBRY Game of Pwns: Security Lessons From ...,2017-08-08T02:20:46Z,91280330,Bullish,BBRY
AC,$AMD profit waves are ramping up &amp; the ...,2017-08-07T01:59:46Z,91162035,Bullish,AMD
Native Texan 777,"$BBRY filling up, crispy crunch getting blackb ...",2017-08-07T10:39:59Z,91172810,Bullish,BBRY
Fred Logan 2,$FB FB 📺 TV here! Next event on tap; 3:1 split ...,2017-08-11T07:57:25Z,91750928,Bullish,FB
brascano,$ENDP assume mesh payments end in 2019 and ...,2017-08-11T03:22:40Z,91742762,Bullish,ENDP
CorruptStreet,@upcnichol my belief. Makes perfect sense. ...,2017-08-04T20:24:53Z,91071652,Bullish,DIS

username,word_count,predicted_sentiment
5RingsPatsRule,"{'later,': 1, 'outlook!!': 1, 'quote': ...",1.0
stockflare_alerts,"{'previously,': 1, 'pay': 1, 'forecast': 1, ...",1.0
BossMoney,"{'consolidate,': 2, 'higher.': 2, 'pattern': ...",1.0
stockflare_alerts,"{'previously,': 1, 'empresa': 1, 'cmrz': 1, ...",1.0
REALALOY,"{'lessons': 1, 'hbo': 1, 'alex': 1, 'bb': 1, ...",1.0
ACTech,"{'intel': 1, 'waves': 3, 'profit': 1, 'say..': 1, ...",1.0
sharkyknows,"{'crunch': 1, 'up,': 1, 'shorts': 1, 'money': 1, ...",1.0
fredlogan2,"{'\xf0\x9f\x93\xba': 1, 'tv': 1, 'tap;': 1, ...",1.0
brascano,"{'product': 1, 'end': 1, 'revenue': 1, 'mm': 1, ...",1.0
CorruptStreet,"{'perfect': 1, 'belief.': 1, 'major': 1, ...",1.0


In [18]:
# Highest-scored twit: row 0 after the descending sort on predicted_sentiment.
twits[0]['body']

'$SGYP The usual games. A Positive article on SA yesterday, and of course, 24 hours later, a Quote More Realistic Outlook!! $IRWD is scared.'

In [19]:
# Second highest-scored twit.
twits[1]['body']

'$ON: ON Semiconductor Corp is now forecast to pay a dividend of $0.43. Previously, no dividend was forecast. (https://stockflare.com/stock/ON.O)'

In [20]:
# Third highest-scored twit.
twits[2]['body']

'$AMZN Consolidate, grind higher. Consolidate, grind higher. Easy pattern to spot. Will close green today.'

In [21]:
# Lowest-scored twit: last row of the descending sort.
twits[-1]['body']

'$WFC gap insurance is a huge scam the dealership tells you it&#39;s mandatory to get the loan for subprime loans wfc  is a pos bank this is bad'

In [22]:
# Second lowest-scored twit.
twits[-2]['body']

'$TSLA Tesla will pull a Trivago very soon. The hype is over, Musk will find excuses and labor turning Union. Strong Sell all day long.'

In [23]:
# Third lowest-scored twit.
twits[-3]['body']

'$GDDY my sell short recommendation yesterday would have netted you +5% profit in one day. Sell short more $GDDY unsustainable neg EPS biz'

In [24]:
# Print the 100 model coefficients with the largest values, biggest first.
sentiment_model['coefficients'].sort('value', ascending=False).print_rows(num_rows=100)

+------------+----------------+---------+---------------+--------+
|    name    |     index      |  class  |     value     | stderr |
+------------+----------------+---------+---------------+--------+
| word_count |  previously,   | Bullish |  37.533785801 |  None  |
| word_count |    rock...     | Bullish | 29.4571836963 |  None  |
| word_count |   careful..    | Bullish | 22.6552830374 |  None  |
| word_count |     games.     | Bullish | 22.0816562532 |  None  |
| word_count |   @wolfscout   | Bullish | 21.1788459232 |  None  |
| word_count |   happened?    | Bullish | 20.9390921485 |  None  |
| word_count |    freaked.    | Bullish | 19.4250209836 |  None  |
| word_count | inexperienced  | Bullish | 18.7634260641 |  None  |
| word_count |      dime      | Bullish | 17.4134512174 |  None  |
| word_count |    yielded     | Bullish | 17.2189691088 |  None  |
| word_count |     fear.      | Bullish | 17.1077630512 |  None  |
| word_count |    #squeeze    | Bullish | 17.0339232642 |  Non

In [25]:
# Print the 100 model coefficients with the smallest values, lowest first.
sentiment_model['coefficients'].sort('value', ascending=True).print_rows(num_rows=100)

+------------+-----------------+---------+----------------+--------+
|    name    |      index      |  class  |     value      | stderr |
+------------+-----------------+---------+----------------+--------+
| word_count |      lame,      | Bullish | -27.7017559806 |  None  |
| word_count |      union.     | Bullish | -27.5428014442 |  None  |
| word_count |      break?     | Bullish | -22.6423887241 |  None  |
| word_count |      way...     | Bullish | -21.3185963409 |  None  |
| word_count |    dividend,    | Bullish | -20.2502619365 |  None  |
| word_count |     value!!     | Bullish | -20.0588181766 |  None  |
| word_count |       shlt      | Bullish | -18.709532094  |  None  |
| word_count |      hell.      | Bullish | -18.3426316165 |  None  |
| word_count |     yiiikes,    | Bullish | -18.3089121038 |  None  |
| word_count |      admit      | Bullish | -18.1633881891 |  None  |
| word_count |    dreaming.    | Bullish | -18.1274606311 |  None  |
| word_count |      dump!      | B