# Sentiment classification of YouTube comments, tweets and reviews

In [1]:
import os

import pandas as pd
import regex as re
import unidecode

In [2]:
from textblob import TextBlob
import nltk
from nltk.corpus import opinion_lexicon
from nltk.tokenize import treebank

In [3]:
import warnings
# Install a global 'ignore' filter: warnings raised after this point are not shown.
warnings.filterwarnings('ignore')

## Loading raw data


In [4]:
# Each source is loaded into its own frame and given a common `raw_content`
# text column plus a `source` tag. The sources are independent of each other:
# comment out the ones that are not needed.

# YouTube comments
yt_data = (
    pd.read_csv('raw_data/yt-comment-data2.csv')
    .drop(columns='id')
    .rename(columns={'comments': 'raw_content'})
    .assign(source='yt')
)

# Twitter data, without the tweets posted by the profiles listed here
excluded_profiles = [
    'https://twitter.com/henry_education',
    'https://twitter.com/henryharvin_in',
]
tw_data = pd.read_csv('raw_data/twitter.csv').drop(columns=['timestamp', 'query'])
# rename()/assign() return new frames, so nothing is written into a filtered
# slice of the loaded frame (the pattern behind SettingWithCopyWarning).
tw_data = (
    tw_data[~tw_data['twitterProfile'].isin(excluded_profiles)]
    .rename(columns={'content': 'raw_content'})
    .assign(source='tw')
)

# Trustpilot data: take 'Content1' / 'Info1' and fall back to 'Content' / 'Info'
# where the first column is empty
tp_data = pd.read_csv('raw_data/trust-pilot.csv')
tp_data['content'] = tp_data['Content1'].fillna(tp_data['Content'])
tp_data['info'] = tp_data['Info1'].fillna(tp_data['Info'])
tp_data = (
    tp_data[['info', 'content']]
    .rename(columns={'content': 'raw_content'})
    .assign(source='tp')
)

# To classify one source on its own, use it as `data` directly, e.g.:
#data = yt_data.copy()
#data

In [5]:
# Stack the three sources into one frame so they are classified together.
# ignore_index=True renumbers the rows: without it every source keeps its own
# row labels, the combined index contains repeated labels, and label-based
# operations on `data` become ambiguous.
data = pd.concat(
    [
        yt_data[['raw_content', 'source']],
        tw_data[['raw_content', 'source']],
        tp_data[['raw_content', 'source']],
    ],
    ignore_index=True,
)
#data

## Cleaning data:

In [6]:
def clean_text(text):
    """Normalise one raw comment / tweet / review before sentiment scoring.

    Steps, in order: lower-case; remove @handles, ``http...`` URLs and
    ``pic.`` links; transliterate to ASCII with ``unidecode``; replace every
    character other than letters, ``+`` and ``'`` with a space; drop the
    stand-alone HTML leftovers ``br`` and ``href``; drop stand-alone single
    letters; collapse whitespace.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the NaN pandas uses for an
        empty CSV cell) is treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned text, tokens separated by single spaces, with no leading
        or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''

    # lower-case
    text = text.lower()

    # remove handles, url's
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    # The dot is escaped: an unescaped '.' matches any character, which also
    # deleted ordinary words such as "picture" and spans like "topic is".
    text = re.sub(r'pic\.\S+', '', text)

    # Hashtags are not removed as a whole; only the '#' character itself is
    # dropped by the character filter below.

    # Replace non-ASCII characters. Lower-case again afterwards because the
    # transliteration may emit upper-case ASCII for some symbols.
    text = unidecode.unidecode(text).lower()

    # keep only letters, '+' and apostrophes
    text = re.sub(r"[^a-zA-Z+']", ' ', text)

    # Remove 'br' / 'href' HTML leftovers only when they are whole tokens, so
    # words that merely contain them ("brilliant", "library") stay intact.
    text = re.sub(r'\b(?:br|href)\b', ' ', text)

    # Keep words with length > 1 only. Working on tokens removes every
    # stand-alone letter, including a leading one and each letter in a run of
    # several; split()/join also collapses repeated whitespace and trims ends.
    tokens = [tok for tok in text.split() if len(tok) > 1 or not tok.isalpha()]

    return ' '.join(tokens)

In [7]:
# remove fully duplicated rows, then store the cleaned text of every row
data.drop_duplicates(inplace=True)
data['content'] = data['raw_content'].map(clean_text)
#data

In [8]:
# final cleaning on content

# trim leading/trailing whitespace through the string accessor
data['content'] = data['content'].str.strip()
# keep only the rows whose cleaned text is longer than 3 characters
data = data[data['content'].str.len() > 3]
#data

## Sentiment Classification: 

### Polarity, Subjectivity & Sentiment using TextBlob

In [9]:
# function to get sentiment scores
def sentiment_scores(text, pol=None, subj=None):
    """Append the TextBlob polarity and subjectivity of ``text`` to two lists.

    Parameters
    ----------
    text : str
        Text to score.
    pol : list, optional
        List the polarity is appended to. A new list is created when omitted.
    subj : list, optional
        List the subjectivity is appended to. A new list is created when
        omitted.

    Returns
    -------
    tuple of (list, list)
        ``(pol, subj)``: the same list objects that were passed in (or the
        newly created ones), each extended by one score.
    """
    if pol is None:
        pol = []
    if subj is None:
        # was assigned to a misspelled name, leaving `subj` as None and making
        # the append below fail whenever `subj` was omitted
        subj = []

    # using textblob for scores
    sentiment = TextBlob(text).sentiment
    pol.append(sentiment.polarity)
    subj.append(sentiment.subjectivity)

    return pol, subj

In [10]:
# function to return polarity, subjectivity
def list_scores(lst):
    """Score every text in ``lst`` with ``sentiment_scores``.

    Returns a ``(polarities, subjectivities)`` pair of lists holding one score
    per input text, in input order. An empty input gives two empty lists.
    """
    polarities, subjectivities = [], []

    for text in lst:
        # both accumulators are handed to the helper and rebound to what it
        # returns, so each pass adds one score to each list
        polarities, subjectivities = sentiment_scores(
            text, polarities, subjectivities
        )

    return polarities, subjectivities

# score every cleaned text; each list holds one value per row of data['content'], in row order
polarity, subjectivity = list_scores(data['content'])

In [11]:
# dataframe with results of textblob analysis:
# source and cleaned text next to the two TextBlob scores
sentiment_data = data[['source', 'content']].assign(
    tb_polarity=polarity,
    tb_subjectivity=subjectivity,
)
#sentiment_data

Labelling every comment with polarity < 0 as Negative wrongly marks many comments as Negative. <br>
Use a custom threshold if possible. <br>
Edit: the TextBlob result is later compared with another sentiment model (NLTK).

In [12]:
# assign sentiment based on polarity
def decode_sentiment(pol, neg_th=0, pos_th=0):
    """Map a polarity score to 'Negative', 'Positive' or 'Neutral'.

    A score below ``neg_th`` is 'Negative'; otherwise a score above ``pos_th``
    is 'Positive'; anything else is 'Neutral'. The negative test takes
    precedence when both conditions hold.
    """
    label = 'Neutral'
    if pol > pos_th:
        label = 'Positive'
    # checked last so that it overrides 'Positive' when both tests pass
    if pol < neg_th:
        label = 'Negative'
    return label

In [13]:
# label every row from its TextBlob polarity, using decode_sentiment's default thresholds
sentiment_data['tb_sentiment'] = sentiment_data['tb_polarity'].map(decode_sentiment)
#sentiment_data

In [14]:
# checking negative sentiments
#sentiment_data[sentiment_data['tb_sentiment']=='Negative']

### Sentiment classification using NLTK:

In [15]:
#nltk.download('opinion_lexicon') if not done already

# Tokenizer and lexicon word sets, built on first use and reused afterwards.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return the shared (tokenizer, positive words, negative words) triple."""
    if not _NLTK_RESOURCES:
        # Build everything first so a failure (e.g. lexicon not downloaded)
        # never leaves a half-filled cache behind.
        loaded = {
            'tokenizer': treebank.TreebankWordTokenizer(),
            'positive': frozenset(opinion_lexicon.positive()),
            'negative': frozenset(opinion_lexicon.negative()),
        }
        _NLTK_RESOURCES.update(loaded)
    return (
        _NLTK_RESOURCES['tokenizer'],
        _NLTK_RESOURCES['positive'],
        _NLTK_RESOURCES['negative'],
    )


# return sentiment of comment based on sentiment of each token
def nltk_sentiment(sentence):
    """Classify ``sentence`` by counting opinion-lexicon words.

    The sentence is tokenized with the Treebank word tokenizer and every
    lower-cased token is looked up in the NLTK opinion lexicon. A token found
    in the positive list counts as positive (the negative list is only
    consulted for tokens that are not positive).

    Parameters
    ----------
    sentence : str
        Text to classify.

    Returns
    -------
    str
        'Positive' when positive tokens outnumber negative ones, 'Negative'
        in the opposite case, 'Neutral' when the counts are equal.
    """
    tokenizer, positive, negative = _nltk_resources()

    pos_words = 0
    neg_words = 0

    # Set membership replaces re-reading and scanning the whole lexicon for
    # every single token.
    for token in tokenizer.tokenize(sentence):
        word = token.lower()
        if word in positive:
            pos_words += 1
        elif word in negative:
            neg_words += 1

    # returns sentiment based on count of pos_words/neg_words
    if pos_words > neg_words:
        return "Positive"
    if pos_words < neg_words:
        return "Negative"
    return "Neutral"

In [16]:
# return list of sentiments of all content
def list_nltk(lst):
    """Return the ``nltk_sentiment`` label of every text in ``lst``, in order."""
    return [nltk_sentiment(text) for text in lst]

In [17]:
# one NLTK sentiment label per row of data['content'], in row order
nltk_pol = list_nltk(data['content'])

In [18]:
# dataframe with all nltk analysis: a copy of `data` plus the NLTK label
nltk_data = data.assign(sentiment=nltk_pol)
#nltk_data

In [27]:
# checking negative sentiments
#nltk_data[nltk_data['sentiment']=='Negative']

### Comparing sentiment results

In [28]:
# comparing results of textblob and nltk:
# a copy of `data` with both models' labels side by side
compare_data = data.assign(
    tb_sentiment=sentiment_data['tb_sentiment'],
    nltk_sentiment=nltk_data['sentiment'],
)

#compare_data

In [29]:
# start the combined 'sentiment' column from the TextBlob label;
# tb_nltk_sentiment() assigns this column again when it is run on the frame
compare_data['sentiment'] = compare_data['tb_sentiment']

In [30]:
# using comparison of textblob and nltk to choose best sentiment option
def tb_nltk_sentiment(df):
    """Combine the TextBlob and NLTK labels into one 'sentiment' column.

    Reads the 'tb_sentiment', 'nltk_sentiment' and 'content' columns of ``df``
    and decides one label per row:

    * first word of the content is a question opener -> 'Neutral'
    * first word of the content is a thanks word     -> 'Positive'
    * both models agree                              -> that label
    * TextBlob 'Negative', NLTK 'Neutral'            -> 'Negative'
    * TextBlob 'Negative', NLTK anything else        -> 'Neutral'
    * NLTK 'Neutral' (TextBlob not 'Negative')       -> 'Positive'
    * otherwise                                      -> the NLTK label

    The result is written to ``df['sentiment']`` in place and the same frame
    is returned.
    """
    # columns to walk through in parallel
    tb_labels = df['tb_sentiment']
    nltk_labels = df['nltk_sentiment']
    texts = df['content']

    # opening words that decide the label regardless of the two models
    question_openers = ('what', 'who', 'why', 'when', 'is', 'how', 'can')
    thanks_openers = ('thank', 'thanks', 'thankyou', 'thanku')

    def resolve(comment, tb_label, nltk_label):
        """Pick the final label for a single row."""
        opener = comment.split(' ')[0]

        # questions are treated as neutral, thanks as positive
        if opener in question_openers:
            return 'Neutral'
        if opener in thanks_openers:
            return 'Positive'

        # agreement between the models needs no tie-break
        if tb_label == nltk_label:
            return nltk_label

        # disagreement: rules fit to reduce errors
        if tb_label == 'Negative':
            return 'Negative' if nltk_label == 'Neutral' else 'Neutral'
        if nltk_label == 'Neutral':
            return 'Positive'
        return nltk_label

    # adding column to dataframe
    df['sentiment'] = [
        resolve(comment, tb_label, nltk_label)
        for comment, tb_label, nltk_label in zip(texts, tb_labels, nltk_labels)
    ]

    return df

In [31]:
# tb_nltk_sentiment sets the 'sentiment' column on the frame it receives and returns that same frame
compare_data = tb_nltk_sentiment(compare_data)
#compare_data

In [32]:
# checking negative sentiment comments
#(compare_data[compare_data['sentiment'] == 'Negative'])

In [33]:
# combining results to sentiment_data dataframe:
# add the NLTK label and the combined label next to the TextBlob columns
sentiment_data = sentiment_data.assign(
    nltk_sentiment=compare_data['nltk_sentiment'],
    final_sentiment=compare_data['sentiment'],
)

## Write to file

In [34]:
# write to csv file (the row index is not written)
# to_csv does not create missing folders, so make sure the target folder exists
os.makedirs('output', exist_ok=True)
sentiment_data.to_csv('output/sentiment.csv', index=False)