In [2]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
from textblob import TextBlob
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Load the raw tweets; later cells read the `datetime` and `content` columns.
TRAIN_CSV = 'sorted.csv'
train = pd.read_csv(TRAIN_CSV)

In [4]:
# remove url
def remove_urls(vTEXT):
    """Return ``vTEXT`` with URL-like substrings removed.

    A match is an optional ``http``/``https`` scheme, the literal ``://``, and
    any run of word characters or ``. / ? = & %`` that ends on a word boundary.
    Surrounding whitespace is left untouched.
    """
    url_regex = re.compile(
        r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
        flags=re.MULTILINE,
    )
    return url_regex.sub('', vTEXT)

# Strip URLs from every tweet; bracket access makes the column assignment explicit.
train['content'] = train['content'].apply(remove_urls)

In [5]:
# Keep only tweets made entirely of ASCII characters, as a rough proxy for
# "English only".
# NOTE(review): this also drops English tweets containing curly quotes, emoji
# or other non-ASCII symbols - confirm that this is acceptable.
has_non_ascii = train['content'].str.contains(r'[^\x00-\x7F]+')
# .copy() gives the following cells an independent frame, so adding columns
# to it does not trigger SettingWithCopyWarning.
train = train[~has_non_ascii].copy()
train.head()

Unnamed: 0,datetime,content
0,2020-01-21 09:16:11+00:00,Shares in Chinese drug makers have risen sharp...
3,2020-01-21 12:46:37+00:00,"#Singapore levels up in #innovation rankings, ..."
4,2020-01-21 13:05:38+00:00,Paul Tudor Jones is very concerned about the #...
6,2020-01-21 14:19:27+00:00,Paul Tudor Jones says this stock market run re...
7,2020-01-21 14:33:46+00:00,#PageOne: Stock market afflicted by a virus #...


In [6]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the substrings to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` deleted.
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex would misread metacharacters in the match (e.g. "(USD)") and
    # let a short match delete the prefix of a longer one ("@bob" in "@bobby").
    return re.sub(pattern, '', input_txt)

In [7]:
# remove twitter handles (@user)
# The pattern is a raw string so "\w" reaches the regex engine unchanged
# instead of being an invalid Python string escape. Series.apply is used
# rather than np.vectorize because it also works on an empty frame.
train['tidy_tweet'] = train['content'].apply(remove_pattern, args=(r"@[\w]*",))

In [8]:
# remove special characters, numbers, punctuations
# Everything except ASCII letters and '#' becomes a space. regex=True is
# required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the text unchanged.
train['tidy_tweet'] = train['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [9]:
# remove short words (keep only whitespace-separated tokens longer than 3 characters)
def _drop_short_words(tweet):
    """Re-join the tokens of ``tweet`` that are longer than three characters."""
    kept = [token for token in tweet.split() if len(token) > 3]
    return ' '.join(kept)

train['tidy_tweet'] = train['tidy_tweet'].apply(_drop_short_words)

In [10]:
# Preview the first rows to compare `content` with the cleaned `tidy_tweet`.
train.head(5)

Unnamed: 0,datetime,content,tidy_tweet
0,2020-01-21 09:16:11+00:00,Shares in Chinese drug makers have risen sharp...,Shares Chinese drug makers have risen sharply ...
3,2020-01-21 12:46:37+00:00,"#Singapore levels up in #innovation rankings, ...",#Singapore levels #innovation rankings just Am...
4,2020-01-21 13:05:38+00:00,Paul Tudor Jones is very concerned about the #...,Paul Tudor Jones very concerned about #coronav...
6,2020-01-21 14:19:27+00:00,Paul Tudor Jones says this stock market run re...,Paul Tudor Jones says this stock market remind...
7,2020-01-21 14:33:46+00:00,#PageOne: Stock market afflicted by a virus #...,#PageOne Stock market afflicted virus #coronav...


In [11]:
# First cleaned tweet, used as a sample input for TextBlob below.
text = train['tidy_tweet'].iloc[0]

In [12]:
# Show the sample tweet followed by its TextBlob sentiment tuple
# (polarity, subjectivity), one per line.
s = TextBlob(text)
print(s, s.sentiment, sep="\n")

Shares Chinese drug makers have risen sharply concerns grow over #coronavirus spreading across country Several pharmaceutical giants Shanghai Shenzhen stock exchanges have risen their daily limit
Sentiment(polarity=-0.03125, subjectivity=0.1875)


In [13]:
def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity

In [14]:
def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of ``text``."""
    return TextBlob(text).sentiment.subjectivity

In [15]:
# Score every cleaned tweet with TextBlob and store both measures as columns.
train["polarity"] = train["tidy_tweet"].apply(get_polarity)
train["subjectivity"] = train["tidy_tweet"].apply(get_subjectivity)
train.head(5)

Unnamed: 0,datetime,content,tidy_tweet,polarity,subjectivity
0,2020-01-21 09:16:11+00:00,Shares in Chinese drug makers have risen sharp...,Shares Chinese drug makers have risen sharply ...,-0.03125,0.1875
3,2020-01-21 12:46:37+00:00,"#Singapore levels up in #innovation rankings, ...",#Singapore levels #innovation rankings just Am...,0.2,0.2
4,2020-01-21 13:05:38+00:00,Paul Tudor Jones is very concerned about the #...,Paul Tudor Jones very concerned about #coronav...,0.1,0.65
6,2020-01-21 14:19:27+00:00,Paul Tudor Jones says this stock market run re...,Paul Tudor Jones says this stock market remind...,0.1,0.3
7,2020-01-21 14:33:46+00:00,#PageOne: Stock market afflicted by a virus #...,#PageOne Stock market afflicted virus #coronav...,0.0,0.0


In [16]:
def get_date(text):
    """Return the first ten characters of ``text``.

    For timestamps such as ``2020-01-21 09:16:11+00:00`` this is the
    ``YYYY-MM-DD`` date part; shorter strings are returned unchanged.
    """
    return text[:10]

In [17]:
# Derive a per-day key from the timestamp string of each tweet.
train["date"] = train["datetime"].apply(get_date)
train.head(5)

Unnamed: 0,datetime,content,tidy_tweet,polarity,subjectivity,date
0,2020-01-21 09:16:11+00:00,Shares in Chinese drug makers have risen sharp...,Shares Chinese drug makers have risen sharply ...,-0.03125,0.1875,2020-01-21
3,2020-01-21 12:46:37+00:00,"#Singapore levels up in #innovation rankings, ...",#Singapore levels #innovation rankings just Am...,0.2,0.2,2020-01-21
4,2020-01-21 13:05:38+00:00,Paul Tudor Jones is very concerned about the #...,Paul Tudor Jones very concerned about #coronav...,0.1,0.65,2020-01-21
6,2020-01-21 14:19:27+00:00,Paul Tudor Jones says this stock market run re...,Paul Tudor Jones says this stock market remind...,0.1,0.3,2020-01-21
7,2020-01-21 14:33:46+00:00,#PageOne: Stock market afflicted by a virus #...,#PageOne Stock market afflicted virus #coronav...,0.0,0.0,2020-01-21


In [18]:
# Daily averages of the two TextBlob scores. The score columns are selected
# explicitly: the frame also holds text columns (datetime, content,
# tidy_tweet), and averaging those raises TypeError on pandas >= 2.0.
df = (
    train.groupby("date")[["polarity", "subjectivity"]]
    .mean()
    .rename(columns={"polarity": "polarity_mean", "subjectivity": "subjectivity_mean"})
)
df

Unnamed: 0_level_0,polarity_mean,subjectivity_mean
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-21,0.057674,0.312639
2020-01-22,0.071875,0.298333
2020-01-23,0.015898,0.188463
2020-01-24,0.030325,0.296682
2020-01-25,0.019724,0.263192
2020-01-26,0.034738,0.306089
2020-01-27,0.05731,0.329718
2020-01-28,0.051276,0.338168
2020-01-29,0.039522,0.310778
2020-01-30,0.047264,0.30467


In [19]:
# Persist the daily sentiment averages.
OUTPUT_CSV = "data.csv"
df.to_csv(OUTPUT_CSV)

In [20]:
pip install theano

Note: you may need to restart the kernel to use updated packages.


In [21]:
import pandas as pd

# Load the tweets that already carry a `sentiment` label.
PREDICT_CSV = 'data_sentiment.csv'
predict = pd.read_csv(PREDICT_CSV)

In [22]:
# Derive the same per-day key as for `train` from the timestamp string.
predict["date"] = predict["datetime"].apply(get_date)
predict.head(5)

Unnamed: 0.1,Unnamed: 0,datetime,content,sentiment,date
0,0,2020-01-21 09:16:11+00:00,Shares in Chinese drug makers have risen sharp...,0,2020-01-21
1,1,2020-01-21 10:47:38+00:00,Wuhan’s viral outbreak knocks the stock market...,4,2020-01-21
2,2,2020-01-21 10:52:56+00:00,J'ai vu le film Contagion donc pour éradiquer ...,4,2020-01-21
3,3,2020-01-21 12:46:37+00:00,"#Singapore levels up in #innovation rankings, ...",4,2020-01-21
4,4,2020-01-21 13:05:38+00:00,Paul Tudor Jones is very concerned about the #...,4,2020-01-21


In [23]:
# Daily mean of the sentiment labels. numeric_only=True skips the text
# columns (datetime, content), which raise TypeError on pandas >= 2.0.
# NOTE(review): the numeric "Unnamed: 0" column is averaged as well; it looks
# like a saved row counter, so its daily mean is probably not meaningful.
x = (
    predict.groupby("date")
    .mean(numeric_only=True)
    .rename(columns={"sentiment": "sentiment_mean"})
)
x

Unnamed: 0_level_0,Unnamed: 0,sentiment_mean
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-21,13.5,2.571429
2020-01-22,45.5,1.444444
2020-01-23,96.5,2.060606
2020-01-24,155.5,1.846154
2020-01-25,218.5,1.783784
2020-01-26,314.5,1.627119
2020-01-27,495.0,2.057613
2020-01-28,729.5,1.663717
2020-01-29,928.5,1.372093
2020-01-30,1122.0,1.413953


In [29]:
# Spot check: polarity TextBlob assigns to a sentence that uses "stocks" as a verb.
# NOTE(review): this rebinds `x`, which previously held the daily sentiment frame.
x = "she stocks all kinds of toys"
y = TextBlob(x).sentiment.polarity
y

0.0