In [17]:
import json
import pandas as pd
import re
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
"""
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
"""
# Add the notebook-local ./nltk_data directory to the list of locations NLTK
# searches for resources (the path is relative to the current working directory).
nltk.data.path.append('./nltk_data')

In [18]:
# Parse the exported chat archive into plain Python objects; the file handle is
# closed as soon as the `with` block exits.
with open('result.json', encoding='utf8') as f:
    data = json.load(fp=f)

In [19]:
def get_msg_df(df):
    """Build a per-message frame with a normalised ``clean_text`` column.

    Keeps the rows whose ``type`` equals ``'message'`` and, of the columns
    ``id``, ``from``, ``text`` and ``from_id``, those that exist in ``df``.
    The ``text`` column is returned exactly as it was in ``df``;
    ``clean_text`` is derived from it:

    * a list-valued ``text`` is flattened by concatenating its plain-string
      parts and the ``'text'`` of every dict part whose ``'type'`` is not
      ``'link'``;
    * each character is lower-cased; letters, digits and apostrophes are
      kept, ``?`` is surrounded by spaces, anything else becomes a space;
    * runs of whitespace collapse to a single space and the ends are trimmed.

    Rows whose ``clean_text`` is empty are dropped. A ``text`` value that is
    neither a string nor a list (e.g. ``None``/NaN) is treated as empty and
    therefore dropped as well, instead of raising.

    Args:
        df: DataFrame with at least the ``type`` and ``text`` columns.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index and an added
        ``clean_text`` column. ``df`` itself is not modified. An input with
        no message rows yields an empty frame rather than a ``KeyError``.
    """

    def _flatten(text):
        # Formatted messages arrive as a list mixing plain strings and
        # entity dicts; link entities are deliberately left out.
        if isinstance(text, list):
            parts = []
            for item in text:
                if isinstance(item, str):
                    parts.append(item)
                elif item['type'] != 'link':
                    parts.append(item['text'])
            return ''.join(parts)
        if isinstance(text, str):
            return text
        # Missing / non-textual payload: nothing to clean.
        return ''

    def _normalise(text):
        pieces = []
        for ch in text:
            ch = ch.lower()
            if ch.isalpha() or ch.isdigit() or ch == "'":
                pieces.append(ch)
            elif ch == '?':
                # Keep question marks, but as stand-alone tokens.
                pieces.append(' ? ')
            else:
                pieces.append(' ')
        # split()/join collapses whitespace runs and trims both ends.
        return ' '.join(''.join(pieces).split())

    msg_df = (
        df[df['type'] == 'message']
        .filter(items=['id', 'from', 'text', 'from_id'])
        .reset_index(drop=True)
    )
    # Column-wise apply works on an empty frame too, unlike rebuilding the
    # frame from a (possibly empty) list of rows.
    msg_df['clean_text'] = msg_df['text'].apply(lambda text: _normalise(_flatten(text)))
    msg_df = msg_df[msg_df['clean_text'] != ''].reset_index(drop=True)
    return msg_df

In [20]:
def get2_msg_df(df):
    """Build a per-message frame with flattened ``text`` and regex-cleaned ``clean_text``.

    Keeps the rows whose ``type`` equals ``'message'`` and, of the columns
    ``id``, ``from``, ``text`` and ``from_id``, those that exist in ``df``.

    * ``text``: a list-valued entry is flattened by concatenating its
      plain-string parts and the ``'text'`` of every dict part; rows whose
      flattened ``text`` is empty are dropped. A value that is neither a
      string nor a list (e.g. ``None``/NaN) counts as empty and is dropped
      instead of raising.
    * ``clean_text``: lower-cased ``text`` with ``@handle`` mentions,
      ``scheme://...`` URLs, a leading ``rt``, the literal ``http`` and every
      character other than ASCII letters, digits, space and tab removed.
      Rows whose ``clean_text`` is the empty string are dropped.

    Args:
        df: DataFrame with at least the ``type`` and ``text`` columns.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index. ``df`` itself is not
        modified. An input with no message rows yields an empty frame rather
        than a ``KeyError``.
    """
    # Compiled once per call instead of once per row. The pattern itself is
    # unchanged.
    # NOTE(review): matches are deleted, not replaced by a space, so newlines
    # and punctuation between words glue the neighbours together
    # ("good\nbye" -> "goodbye"), non-ASCII letters vanish, and `^rt` also
    # eats the start of words such as "rtx". A whitespace-only result is
    # not '' and is therefore kept. Confirm whether this is intended.
    noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http+?")

    def flatten_text(text):
        # Formatted messages arrive as a list mixing plain strings and entity dicts.
        if isinstance(text, list):
            return ''.join(
                part if isinstance(part, str) else part['text']
                for part in text
            )
        if isinstance(text, str):
            return text
        # Missing / non-textual payload: nothing to clean.
        return ''

    def preprocessing_msg_df(msg_df):
        # Flatten every entry, then discard messages without any text.
        flattened = msg_df.assign(text=msg_df['text'].apply(flatten_text))
        return flattened[flattened['text'] != ''].reset_index(drop=True)

    def cleaning_msg_df(msg_df):
        # Lower-case first, then strip everything the pattern matches.
        return msg_df.assign(
            clean_text=msg_df['text'].apply(
                lambda elem: noise_pattern.sub("", elem.lower())
            )
        )

    msg_df = (
        df[df['type'] == 'message']
        .filter(items=['id', 'from', 'text', 'from_id'])
        .reset_index(drop=True)
    )
    msg_df = preprocessing_msg_df(msg_df)
    msg_df = cleaning_msg_df(msg_df)
    msg_df = msg_df[msg_df['clean_text'] != ''].reset_index(drop=True)
    return msg_df

In [21]:
# One row per entry of the export's 'messages' list.
df = pd.DataFrame(data=data['messages'])
# Reduce to message rows and attach the regex-cleaned text column.
msg_df = get2_msg_df(df=df)

In [22]:
stop = stopwords.words('english')
# `stop` stays a list for any other cell that reads it; membership tests go
# through a frozenset so each word lookup is O(1) instead of a list scan.
stop_lookup = frozenset(stop)
# Drop stopwords from the whitespace-split clean text and re-join with single spaces.
msg_df['stopped_text'] = msg_df['clean_text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

In [23]:
# Tokenise the stopword-filtered text; the tokenizer is passed directly
# instead of through a wrapping lambda.
msg_df['token_text'] = msg_df['stopped_text'].apply(word_tokenize)

In [24]:
def word_stemmer(text):
    """Return the Porter stem of every token in ``text``.

    Args:
        text: iterable of token strings.

    Returns:
        A list with one stem per input token, in the same order.
    """
    # Build the stemmer once per call rather than once per token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in text]
msg_df['stem_text'] = msg_df['token_text'].apply(word_stemmer)

In [25]:
def word_lemmatizer(text):
    """Return the WordNet lemma of every token in ``text``.

    Each token is lemmatized with the lemmatizer's default part of speech,
    since no POS argument is passed.

    Args:
        text: iterable of token strings.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in text]
msg_df['lemma_text'] = msg_df['token_text'].apply(word_lemmatizer)

In [26]:
def word_pos_tagger(text):
    """Run ``nltk.pos_tag`` over the token sequence ``text`` and return its result."""
    return nltk.pos_tag(text)
# POS tags for the stopword-filtered tokens.
msg_df['pos_text'] = msg_df['token_text'].apply(word_pos_tagger)

In [27]:
# Same tokenise-then-tag pipeline, but on the lower-cased `text` column with
# stopwords and punctuation left in.
msg_df['unstopped_token_text'] = msg_df['text'].apply(
    lambda message: word_tokenize(message.lower())
)
msg_df['unstopped_pos_text'] = msg_df['unstopped_token_text'].apply(word_pos_tagger)

In [28]:
# TextBlob's own tagging of the `text` column (original casing, stopwords kept).
msg_df['unstopped_blob_text'] = msg_df['text'].apply(
    lambda message: TextBlob(message).tags
)

In [29]:
# Average the VADER polarity scores over every message that still has text
# after stopword removal.
# NOTE(review): scoring happens on lower-cased, punctuation-free text with
# stopwords removed; NLTK's English stopword list appears to contain negations
# ("not", "no"), which VADER relies on. Consider scoring msg_df['text']
# instead; confirm before changing, since it alters the reported numbers.
sid = SentimentIntensityAnalyzer()
d = {
  'compound': 0,
  'neg': 0,
  'neu': 0,
  'pos': 0
}
c = 0  # number of messages that contributed a score
# Only one column is needed, so iterate it directly instead of building a
# Series per row with iterrows().
for sentence in msg_df['stopped_text']:
    if sentence != '':
        c += 1
        ss = sid.polarity_scores(sentence)
        for k in ss:
            d[k] += ss[k]

# Turn the sums into means. With no scored message the totals stay at 0
# instead of raising ZeroDivisionError.
if c:
    for k in d:
        d[k] = d[k]/c

In [30]:
print(d)
# Positive share of the polar (pos + neg) score mass, as a percentage; the
# neutral component is ignored.
polar_total = d['pos'] + d['neg']
# Without any polar signal the ratio is undefined: report NaN rather than
# failing with ZeroDivisionError.
pos = d['pos']*100/polar_total if polar_total else float('nan')
neg = 100-pos
print('pos: {}, neg: {}'.format(pos, neg))

{'compound': 0.17103038538467244, 'neg': 0.053342289213765756, 'neu': 0.7039266346372145, 'pos': 0.2393503103284715}
pos: 81.77532014912862, neg: 18.224679850871382


In [31]:
# One row per sender: every other column is collapsed into a list of that
# sender's values, with `from_id` kept as a regular column.
acc_df = msg_df.groupby('from_id', as_index=False).agg(list)

In [32]:
# Persist both frames next to the notebook; the index is written as the first
# CSV column because `index` is left at its default.
acc_df.to_csv(path_or_buf='acc_df.csv')
msg_df.to_csv(path_or_buf='msg_df.csv')