In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings emitted by the
# libraries used in later cells -- consider narrowing it.
warnings.simplefilter('ignore')
# Default figure size for plots drawn through seaborn/matplotlib.
sns.set(rc={'figure.figsize' : (12, 6)})
# Dark-grid theme with axis grid lines switched on.
sns.set_style("darkgrid", {'axes.grid' : True})

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Tweet export for 2020-05-01 .. 2020-05-09 (date range taken from the file name).
tw_data0501 = pd.read_csv(
    'twitter_data_2020-05-01_to_2020-05-09.csv',
    encoding='UTF-8',
)
tw_data0501.head()

Unnamed: 0,date,user_name,text
0,2020-05-09,digital_trans4m,2020 Current trends in IoT at Workplace Market...
1,2020-05-09,SoftClouds,#IoT Plays an Important Role in #Pharmaceutica...
2,2020-05-09,ajohnsocyber,Microsoft Shells Out $100K for IoT Security | ...
3,2020-05-09,KirkDBorne,With billions & billions of #IoT connected dev...
4,2020-05-09,MyTechMusings,Interesting discussion. No doubt #eSIM is the ...


In [3]:
# Tweet export for 2020-05-10 .. 2020-05-20 (date range taken from the file name).
tw_data0510 = pd.read_csv(
    'twitter_data_2020-05-10_to_2020-05-20.csv',
    encoding='UTF-8',
)
tw_data0510.head()

Unnamed: 0,date,user_name,text
0,2020-05-20,IoTWorldSeries,You're invited to our next IoT World Virtual E...
1,2020-05-20,sigmaridge,Big Security in a Small Business World: 10 myt...
2,2020-05-20,ARNnet,Dicker Data expands VPN and IoT security solut...
3,2020-05-20,energyetc,"“Lighting, heating, cooling and ventilation, s..."
4,2020-05-20,sigmaridge,Education Now: Let’s Talk Distance Learning ht...


In [4]:
# Tweet export for 2020-05-21 .. 2020-05-30 (date range taken from the file name).
tw_data0521 = pd.read_csv(
    'twitter_data_2020-05-21_to_2020-05-30.csv',
    encoding='UTF-8',
)
tw_data0521.head()

Unnamed: 0,date,user_name,text
0,2020-05-30,digital_trans4m,SC Awards Europe 2020 - Best IOT/IIOT Security...
1,2020-05-30,iotforall,Ignoring #Cybersecurity threats posed by insid...
2,2020-05-30,5GSec,How #5G Will Transform Economy and Society - h...
3,2020-05-30,k1rou,"Shodan founder John Matherly on IoT security, ..."
4,2020-05-30,RobTiffany,Getting Started with Azure IoT services: Secur...


In [5]:
# Tweet export for 2020-06-01 .. 2020-06-10 (date range taken from the file name).
tw_data0601 = pd.read_csv(
    'twitter_data_2020-06-01_to_2020-06-10.csv',
    encoding='UTF-8',
)
tw_data0601.head()

Unnamed: 0,date,user_name,text
0,2020-06-10,digital_trans4m,All mobile operators vulnerable to denial of s...
1,2020-06-10,Intel_Jonathan,"Half of Enterprises Have No Dedicated Staff, P..."
2,2020-06-10,SuriyaSubraman,IoT Security Market How the Industry Will Witn...
3,2020-06-10,UK_CIF,Government to fund nine advanced security proj...
4,2020-06-10,paulinetaylor85,"Half of Enterprises Have No Dedicated Staff, P..."


In [6]:
# Quora export; its 'comments' column is renamed to 'text' so it carries the
# same column name as the tweet frames.
q_data = pd.read_csv('quora_data.csv', encoding='latin').rename(
    columns={'comments': 'text'}
)
q_data.head()

Unnamed: 0,text
0,Research Study forecasts revenue from the glob...
1,Download Sample PDF to get More Information ab...
2,Government initiatives in smart cities projects
3,Here is a list of some root cause for vulnerab...
4,"According to New IDC Spending Guide, the world..."


In [7]:
# Stack the four tweet exports and keep only the tweet text
# (the 'date' and 'user_name' columns are dropped).
tw_data = pd.concat(
    [tw_data0501, tw_data0510, tw_data0521, tw_data0601]
).drop(columns=['date', 'user_name'])
tw_data.head()

Unnamed: 0,text
0,2020 Current trends in IoT at Workplace Market...
1,#IoT Plays an Important Role in #Pharmaceutica...
2,Microsoft Shells Out $100K for IoT Security | ...
3,With billions & billions of #IoT connected dev...
4,Interesting discussion. No doubt #eSIM is the ...


In [8]:
# Combine tweets and Quora comments into one corpus.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every source frame contributes its own labels, so the same label (0, 1, ...)
# appears several times and label-based lookups return multiple rows.
total_data = pd.concat([tw_data, q_data], ignore_index=True)
total_data.head()

Unnamed: 0,text
0,2020 Current trends in IoT at Workplace Market...
1,#IoT Plays an Important Role in #Pharmaceutica...
2,Microsoft Shells Out $100K for IoT Security | ...
3,With billions & billions of #IoT connected dev...
4,Interesting discussion. No doubt #eSIM is the ...


In [9]:
# Working frame for the model: the raw text under 'review' plus a placeholder
# 'sentiment' of 0 on every row.
t_data = (
    total_data[['text']]
    .rename(columns={'text': 'review'})
    .assign(sentiment=0)
)
t_data.head()

Unnamed: 0,review,sentiment
0,2020 Current trends in IoT at Workplace Market...,0
1,#IoT Plays an Important Role in #Pharmaceutica...,0
2,Microsoft Shells Out $100K for IoT Security | ...,0
3,With billions & billions of #IoT connected dev...,0
4,Interesting discussion. No doubt #eSIM is the ...,0


In [10]:
import string
import re


def clean_text(text):
    '''Normalise one piece of raw text before vectorisation.

    The input is coerced with ``str()`` (so non-string values such as ``None``
    become their string form, e.g. ``'none'``) and lower-cased. The following
    are then removed, in this order: text inside square brackets, http(s)/www
    links, ``<...>`` tags, ASCII punctuation, newline characters, and every
    word that contains a digit.

    Note: newlines are deleted rather than replaced by a space, so the words on
    either side of a line break end up joined. Whitespace left behind by
    removed tokens is not collapsed.

    Returns:
        The cleaned string.
    '''
    text = str(text).lower()
    # Raw string literals keep regex escapes such as \[ \S \w \d away from
    # Python's own string-escape handling, which warns about them otherwise.
    text = re.sub(r'\[.*?\]', '', text)                 # [bracketed text]
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # links
    text = re.sub(r'<.*?>+', '', text)                  # <tags>
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)                      # newlines (no space inserted)
    text = re.sub(r'\w*\d\w*', '', text)                # words containing a digit
    return text

# Replace HTML line breaks with a space first, then run every review
# through clean_text().
t_data['review'] = (
    t_data['review']
    .str.replace('<br />', ' ')
    .apply(clean_text)
)
t_data.head()

Unnamed: 0,review,sentiment
0,current trends in iot at workplace market is ...,0
1,iot plays an important role in pharmaceutical ...,0
2,microsoft shells out for iot security threat...,0
3,with billions billions of iot connected devic...,0
4,interesting discussion no doubt esim is the fu...,0


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def ngram_vectorize(texts, labels):
    '''Vectorise texts as TF-IDF word n-grams and keep the top-scoring features.

    A new ``TfidfVectorizer`` (word uni- and bi-grams, ``min_df=2``) is fitted
    on ``texts``; a new ``SelectKBest(f_classif)`` is then fitted on the
    resulting matrix and ``labels`` and keeps at most 20000 columns.

    Args:
        texts: the documents to vectorise.
        labels: one label per document, used only to score the features.

    Returns:
        The selected feature matrix cast to float32, with
        ``min(20000, n_features)`` columns.

    NOTE(review): both the vocabulary and the selected columns are learned from
    the arguments of this call. If the result is fed to a model trained on a
    different corpus, the columns presumably do not correspond to the features
    that model was trained on -- confirm, and consider reusing the vectorizer
    and selector fitted at training time.
    NOTE(review): f_classif compares feature values between classes; when
    ``labels`` holds a single value the scores are presumably not meaningful
    and the kept columns arbitrary -- verify.
    '''
    kwargs = {
        'ngram_range': (1, 2),
        # TF-IDF weights are fractional, so the vectorizer needs a float dtype.
        # The previous 'int32' is not accepted by scikit-learn, which warns and
        # converts to float64; requesting float64 directly yields the same
        # values without depending on that fallback.
        'dtype': np.float64,
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': 'word',
        'min_df': 2,
    }

    # Vectorize with TF-IDF.
    tfidf_vectorizer = TfidfVectorizer(**kwargs)
    transformed_texts = tfidf_vectorizer.fit_transform(texts)

    # Use the selector to narrow the vectorized features down to the k best
    # (never more than the number of features actually produced).
    selector = SelectKBest(f_classif, k=min(20000, transformed_texts.shape[1]))
    selector.fit(transformed_texts, labels)
    transformed_texts = selector.transform(transformed_texts).astype('float32')

    return transformed_texts

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorization
# NOTE(review): `tfidf` / `tr_texts` are not read by any later cell shown in
# this notebook; the model input below comes from ngram_vectorize().
tfidf = TfidfVectorizer()
tr_texts = tfidf.fit_transform(t_data['review'])
vect_data = ngram_vectorize(
    texts=t_data['review'],
    labels=t_data['sentiment'],
)

# Dense array copy of the vectorized features.
X = vect_data.toarray()

# Predict results with the previously prepared model
model = keras.models.load_model('IMDB_model.h5')
result = model.predict(X)

In [13]:
# Shape of the array returned by model.predict(X).
result.shape

(12507, 1)

In [14]:
# Preview the cleaned reviews; 'sentiment' here is still the placeholder 0,
# not a model prediction.
t_data.head(20)

Unnamed: 0,review,sentiment
0,current trends in iot at workplace market is ...,0
1,iot plays an important role in pharmaceutical ...,0
2,microsoft shells out for iot security threat...,0
3,with billions billions of iot connected devic...,0
4,interesting discussion no doubt esim is the fu...,0
5,detecting and mitigating iot breaches require ...,0
6,i blocked ads that is of my internet traffic...,0
7,kubernetes cisco container platform petascale...,0
8,sources microsoft to buy israelbased cyberx wh...,0
9,apimote for zigbee sniffing and transmission ...,0


In [19]:
# Pair every original (uncleaned) text with the score the model produced for it.
output = pd.DataFrame(columns = ['review', 'sentiment'])
output['review'] = total_data['text']
# model.predict returned a 2-D (n, 1) array (see result.shape); flatten it so a
# plain 1-D column is assigned instead of relying on pandas accepting 2-D data
# for a single column. A length mismatch still raises.
output['sentiment'] = result.ravel()
output.head(20)

Unnamed: 0,review,sentiment
0,2020 Current trends in IoT at Workplace Market...,0.161231
1,#IoT Plays an Important Role in #Pharmaceutica...,0.99601
2,Microsoft Shells Out $100K for IoT Security | ...,0.011516
3,With billions & billions of #IoT connected dev...,0.815295
4,Interesting discussion. No doubt #eSIM is the ...,0.287337
5,Detecting and Mitigating IoT Breaches Require ...,0.28587
6,I blocked 1568 ads. That is 0.73% of my intern...,0.963484
7,Kubernetes + Cisco Container Platform: Petasca...,0.736547
8,"Sources: Microsoft to buy Israel-based CyberX,...",0.000617
9,APIMOTE (FOR ZIGBEE SNIFFING AND TRANSMISSION)...,0.593814


In [20]:
# Write the review/score table to CSV, overwriting any existing file.
output.to_csv(
    'sentiment_result.csv',
    mode='w',
)