# Sentiment classification of YT comments/ tweets/ reviews

In [1]:
import pandas as pd
import regex as re
import unidecode

In [2]:
from textblob import TextBlob
import nltk
from nltk.corpus import opinion_lexicon
from nltk.tokenize import treebank

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Loading raw data


In [None]:
# did these separately, so wrote the code as such
# comment those not needed

# for yt-data
yt_data = pd.read_csv('raw_data/yt-comment-data2.csv')
yt_data.drop('id',axis=1,inplace=True)
yt_data.rename(columns={'comments': 'raw_content'}, inplace=True)
yt_data['source'] = 'yt'

# for twitter data
tw_data = pd.read_csv('raw_data/twitter.csv')
tw_data.drop(columns=['timestamp','query'],axis=1,inplace=True)
tw_data = tw_data[(tw_data['twitterProfile'] != 'https://twitter.com/henry_education')
 & (tw_data['twitterProfile'] != 'https://twitter.com/henryharvin_in')]
tw_data.rename(columns={'content': 'raw_content'}, inplace=True)
tw_data['source'] = 'tw'

# for trust pilot data
tp_data = pd.read_csv('raw_data/trust-pilot.csv')
tp_data['content'] = tp_data['Content1'].fillna(tp_data['Content'])
tp_data['info'] = tp_data['Info1'].fillna(tp_data['Info'])
tp_data = tp_data[['info', 'content']]
tp_data.rename(columns={'content': 'raw_content'}, inplace=True)
tp_data['source'] = 'tp'

#data = yt_data.copy() # replace for separate classification
#data

In [None]:
# or concat to classify together
data = pd.concat([yt_data[['raw_content', 'source']],
                  tw_data[['raw_content', 'source']],
                  tp_data[['raw_content', 'source']]])
#data

Unnamed: 0,raw_content,source
0,First like bro !!! Let&#39;s gooo ! Python for...,yt
1,"4kt\r<br><a href=""http://vun.fyi/"">vun.fyi</a>",yt
2,"<a href=""http://vog.fyi/"">vog.fyi\r</a><br>del...",yt
3,"v22\r<br><a href=""http://vun.fyi/"">vun.fyi</a>",yt
4,"qzp\r<br><a href=""http://vun.fyi/"">vun.fyi</a>",yt
...,...,...
212,Thier attitude chnages once you pay the fees a...,tp
213,Henry Harvin offers 2 courses - Business Analy...,tp
214,Worked there as an Intern for a month.Pros: Ni...,tp
215,I really loved Henry Harvin's Content Writing ...,tp


## Cleaning data:

In [None]:
def clean_text(text):
    
    #lower-case
    text=text.lower()
    
    #remove handles, url's
    text= re.sub(r'@\S+', '',text)
    text= re.sub(r'http\S+', '',text) 
    text= re.sub(r'pic.\S+', '',text)
    
    # removing #tags 
    #text= re.sub(r'#\S+', '',text)
    
    # replace unidecode characters
    text=unidecode.unidecode(text)
      
    # regex only keeps characters
    text= re.sub(r"[^a-zA-Z+']", ' ',text)
    
    #removing 'br' 'href' html
    text= re.sub(r'br', '',text)
    text= re.sub(r'href', '',text)
    
    # keep words with length>1 only
    text=re.sub(r'\s+[a-zA-Z]\s+', ' ', text+' ')
    
    # regex removes repeated spaces, strip removes leading and trailing spaces
    text= re.sub(r"\s[\s]+", " ",text).strip()
    
    # keep words with length>1 only
    text=re.sub(r'\s+[a-zA-Z]\s+', ' ', text+' ')
    
    return text

In [None]:
data.drop_duplicates(inplace=True)
data['content'] = data['raw_content'].apply(lambda x: clean_text(x))
#data

Unnamed: 0,raw_content,source,content
0,First like bro !!! Let&#39;s gooo ! Python for...,yt,first like let gooo python for life
1,"4kt\r<br><a href=""http://vun.fyi/"">vun.fyi</a>",yt,kt
2,"<a href=""http://vog.fyi/"">vog.fyi\r</a><br>del...",yt,a delightful
3,"v22\r<br><a href=""http://vun.fyi/"">vun.fyi</a>",yt,v
4,"qzp\r<br><a href=""http://vun.fyi/"">vun.fyi</a>",yt,qzp
...,...,...,...
164,Henry Harvin offers 2 courses - Business Analy...,tp,henry harvin offers courses business analytics...
165,Worked there as an Intern for a month.Pros: Ni...,tp,worked there as an intern for month pros nice ...
166,I really loved Henry Harvin's Content Writing ...,tp,i really loved henry harvin's content writing ...
167,"These guys are very unprofessional, don't have...",tp,these guys are very unprofessional don't have ...


In [None]:
# final cleaning on content

# stripping spaces
data['content'] = data['content'].apply(lambda x: x.strip())
# only taking text with greater than 3 letters after cleaning
data = data[data['content'].apply(lambda x: len(x) > 3)]
#data

Unnamed: 0,raw_content,source,content
0,First like bro !!! Let&#39;s gooo ! Python for...,yt,first like let gooo python for life
2,"<a href=""http://vog.fyi/"">vog.fyi\r</a><br>del...",yt,a delightful
5,can you give your word file link ?,yt,can you give your word file link
6,1st view love from nepal 🇳🇵,yt,st view love from nepal
7,What&#39;s fee of Complete digital marketing c...,yt,what fee of complete digital marketing course
...,...,...,...
164,Henry Harvin offers 2 courses - Business Analy...,tp,henry harvin offers courses business analytics...
165,Worked there as an Intern for a month.Pros: Ni...,tp,worked there as an intern for month pros nice ...
166,I really loved Henry Harvin's Content Writing ...,tp,i really loved henry harvin's content writing ...
167,"These guys are very unprofessional, don't have...",tp,these guys are very unprofessional don't have ...


## Sentiment Classification: 

### Polarity, Subjectivity & Sentiment using TextBlob

In [None]:
# function to get sentiment scores 
def sentiment_scores(text, pol=None, subj=None):
    
    if pol is None:
        pol = []
    if subj is None:
        sunj = []
        
    #using textblob for scores    
    sent = TextBlob(text)
    pol.append(sent.sentiment.polarity)
    subj.append(sent.sentiment.subjectivity)
    
    return pol, subj

In [None]:
# function to return polarity, subjectivity
def list_scores(lst):
    
    pol = []
    subj = []
    
    for i in lst:
        pol, subj = sentiment_scores(i, pol, subj)
        
    return pol, subj

polarity, subjectivity = list_scores(data['content'])

In [None]:
# dataframe with results of textblob analysis
sentiment_data = data[['source', 'content']].copy()
sentiment_data['tb_polarity'] = polarity
sentiment_data['tb_subjectivity'] = subjectivity
#sentiment_data

Unnamed: 0,source,content,tb_polarity,tb_subjectivity
0,yt,first like let gooo python for life,0.250000,0.333333
2,yt,a delightful,1.000000,1.000000
5,yt,can you give your word file link,0.000000,0.000000
6,yt,st view love from nepal,0.500000,0.600000
7,yt,what fee of complete digital marketing course,0.050000,0.200000
...,...,...,...,...
164,tp,henry harvin offers courses business analytics...,0.053017,0.406034
165,tp,worked there as an intern for month pros nice ...,0.471385,0.743506
166,tp,i really loved henry harvin's content writing ...,0.662500,0.725000
167,tp,these guys are very unprofessional don't have ...,-0.173333,0.360000


Using 'pol < 0' for negative results in a lot of false negative sentiments <br>
Use a custom threshold if possible <br>
Edit: later comparing with other sentiment models

In [None]:
# assign sentiment based on polarity
def decode_sentiment(pol,neg_th=0,pos_th=0):
    if pol < neg_th:
        return 'Negative'
    elif pol > pos_th:
        return 'Positive'
    return 'Neutral'

In [None]:
sentiment_data['tb_sentiment'] = sentiment_data['tb_polarity'].apply(lambda x: decode_sentiment(x))
#sentiment_data

Unnamed: 0,source,content,tb_polarity,tb_subjectivity,tb_sentiment
0,yt,first like let gooo python for life,0.250000,0.333333,Positive
2,yt,a delightful,1.000000,1.000000,Positive
5,yt,can you give your word file link,0.000000,0.000000,Neutral
6,yt,st view love from nepal,0.500000,0.600000,Positive
7,yt,what fee of complete digital marketing course,0.050000,0.200000,Positive
...,...,...,...,...,...
164,tp,henry harvin offers courses business analytics...,0.053017,0.406034,Positive
165,tp,worked there as an intern for month pros nice ...,0.471385,0.743506,Positive
166,tp,i really loved henry harvin's content writing ...,0.662500,0.725000,Positive
167,tp,these guys are very unprofessional don't have ...,-0.173333,0.360000,Negative


In [None]:
# checking negative sentiments
#sentiment_data[sentiment_data['tb_sentiment']=='Negative']

Unnamed: 0,source,content,tb_polarity,tb_subjectivity,tb_sentiment
27,yt,some of us enrolled but could not be regular i...,-0.03125,0.204647,Negative
165,yt,almost completed six sixma green belt in july ...,-0.555,0.583333,Negative
421,yt,thank henry harvin for valuable information ho...,-0.2,0.3,Negative
572,yt,thank you for valuable information your review...,-0.2,0.3,Negative
588,yt,the techniques are corner stones to mould the ...,-0.230769,0.461538,Negative
747,yt,too bad course,-0.7,0.666667,Negative
775,yt,is job assistance only limited to those who ar...,-0.035714,0.571429,Negative
811,yt,what does the website mean by job oriented out...,-0.3125,0.6875,Negative
814,yt,what happens if am unable to complete project,-0.2,0.45,Negative
834,yt,can you tell me when the next session for six ...,-0.1,0.15,Negative


### Sentiment classification using NLTK:

In [None]:
#nltk.download('opinion_lexicon') if not done already

# return sentiment of comment based on sentiment of each token
def nltk_sentiment(sentence):
    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))
    y = []

    # getting sentiment of each token using 
    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    # returns sentiment based on count of pos_words/neg_words
    if pos_words > neg_words:
        return "Positive"
    elif pos_words < neg_words:
        return "Negative"
    elif pos_words == neg_words:
        return "Neutral"

In [None]:
# return list of sentiments of all content
def list_nltk(lst):
    
    pol = []
    
    for i in lst:
        pol.append(nltk_sentiment(i))
        
    return pol

In [None]:
nltk_pol = list_nltk(data['content'])

In [None]:
# dataframe with all nltk analysis
nltk_data = data.copy()
nltk_data['sentiment'] = nltk_pol
#nltk_data

In [None]:
# checking negative sentiments
nltk_data[nltk_data['sentiment']=='Negative']

### Comparing sentiment results

In [None]:
# comparing results of textblob and nltk
compare_data = data.copy()
compare_data['tb_sentiment'] = sentiment_data['tb_sentiment']
compare_data['nltk_sentiment'] = nltk_data['sentiment']

#compare_data

In [None]:
compare_data['sentiment'] = compare_data['tb_sentiment']

In [None]:
# using comparison of textblob and nltk to choose best sentiment option
def tb_nltk_sentiment(df):
    
    # lists to iterate
    tb_sent = list(df['tb_sentiment'])
    nltk_sent = list(df['nltk_sentiment'])
    comments = list(df['content'])
    sentiments = []
    
    # these modifications are to classify questions as neutral
    questions = ['what', 'who', 'why', 'when', 'is', 'how', 'can']
    thanks = ['thank', 'thanks', 'thankyou', 'thanku']
    
    # sentiments
    for i in range(len(comments)):
        # giving questions 'Neutral' sentiment
        if comments[i].split(' ')[0] in questions:
            sentiments.append('Neutral')
        # giving comments of thanks 'Positive' sentiment
        elif comments[i].split(' ')[0] in thanks:
            sentiments.append('Positive')

        # sentiments based on comparison, fit to reduce errors
        elif tb_sent[i] == nltk_sent[i]:
            sentiments.append(nltk_sent[i])
        elif (tb_sent[i] == 'Negative'):
            if (nltk_sent[i] == 'Neutral'):
                sentiments.append('Negative')
            else:
                sentiments.append('Neutral')
        
        elif (nltk_sent[i] == 'Neutral'):
            sentiments.append('Positive') 
        else:
            sentiments.append(nltk_sent[i])
    
    # adding column to dataframe
    df['sentiment'] = sentiments
    
    return df

In [None]:
compare_data = tb_nltk_sentiment(compare_data)
#compare_data

In [None]:
# checking negative sentiment comments
#(compare_data[compare_data['sentiment'] == 'Negative'])

In [None]:
# combining results to sentiment_data dataframe
sentiment_data['nltk_sentiment'] = compare_data['nltk_sentiment']
sentiment_data['final_sentiment'] = compare_data['sentiment']

## Write to file

In [None]:
# write to csv file
sentiment_data.to_csv('output/sentiment.csv',index=False)