In [75]:
import numpy as np
import pandas as pd
from IPython.display import display
import itertools
import random
from collections import  Counter

df = pd.read_csv('data/A2_dataset_processed.csv', index_col=0)
tweets = df.NEW_TEXT.astype(str).tolist()
tweet_words = [tweet.split() for tweet in tweets]
word_count = Counter(itertools.chain(*tweet_words))
total_tokens = sum(word_count.values())
tweet_bigrams = list(itertools.chain.from_iterable([list(zip(tweet, tweet[1:])) for tweet in tweet_words]))
bigram_count = dict(Counter(tweet_bigrams))
words = sorted(list(set(" ".join(tweets).split(" "))))
word_prob = {word: word_count[word]/total_tokens  for word in words}


In [76]:
coc_mat = pd.DataFrame(0.0,index=words,columns=words)
for bigram in tweet_bigrams:
    coc_mat.loc[bigram[0],bigram[1]]+=1

prob_mat = coc_mat.div(coc_mat.sum(axis=1), axis=0).fillna(0)

Highest occuring bigrams

In [77]:
sorted(bigram_count.items(),key=lambda x: x[1],reverse=True)[:4]

[(('can', 'not'), 170),
 (('but', 'not'), 49),
 (('not', 'get'), 48),
 (('wake', 'up'), 44)]

Smoothing 

In [78]:
factor = 1*np.array(list(word_prob.values()))
coc_mat_ = coc_mat + factor
smooth_mat = coc_mat_.div(coc_mat_.add(factor).sum(axis=1), axis=0).fillna(0)
smooth_mat

Unnamed: 0,0,0130,03,030,05,0736,0822,09,093002,1,...,ñðµ,ñðµð,ñðµñðµñð,ñðð,ñððµ,ñððð,ñðñð,ññ,ññðð,ñññ
0,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000069,0.000017,0.000519,...,0.000017,0.000017,0.000017,0.000035,0.000017,0.000035,0.000017,0.000052,0.000017,0.000035
0130,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000046,0.000012,0.000346,...,0.000012,0.000012,0.000012,0.000023,0.000012,0.000023,0.000012,0.000035,0.000012,0.000023
03,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000046,0.000012,0.000346,...,0.000012,0.000012,0.000012,0.000023,0.000012,0.000023,0.000012,0.000035,0.000012,0.000023
030,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000046,0.000012,0.000346,...,0.000012,0.000012,0.000012,0.000023,0.000012,0.000023,0.000012,0.000035,0.000012,0.000023
05,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000046,0.000012,0.000346,...,0.000012,0.000012,0.000012,0.000023,0.000012,0.000023,0.000012,0.000035,0.000012,0.000023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ñððð,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000035,0.000009,0.000260,...,0.000009,0.000009,0.000009,0.000017,0.000009,0.000017,0.000009,0.000026,0.000009,0.000017
ñðñð,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000046,0.000012,0.000346,...,0.000012,0.000012,0.000012,0.000023,0.000012,0.000023,0.000012,0.000035,0.000012,0.000023
ññ,0.000007,0.000007,0.000007,0.000007,0.000007,0.000007,0.000007,0.000028,0.000007,0.000208,...,0.000007,0.000007,0.000007,0.000014,0.000007,0.000014,0.000007,0.000021,0.000007,0.000014
ññðð,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000046,0.000012,0.000346,...,0.000012,0.000012,0.000012,0.000023,0.000012,0.000023,0.000012,0.000035,0.000012,0.000023


Sentiment Addition

In [178]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

def get_sentiment_score(text,scaling_factor=100,sentiment='positive'):
    sid = SentimentIntensityAnalyzer()
    score = scaling_factor*sid.polarity_scores(text)['compound']
    if sentiment =='negative':
        score = -score
    
    if score>=0:
        return score+1
    return 1


def vader_class(sentence):
    sid = SentimentIntensityAnalyzer()
    scores= sid.polarity_scores(sentence)
    return int(scores['compound']>=0)
     

In [131]:
word_positive_sentiment = {word:get_sentiment_score(word) for word in words}
word_negative_sentiment = {word:get_sentiment_score(word,sentiment='negative') for word in words}

In [132]:
bigram_positive_sentiment = {bigram:get_sentiment_score(" ".join(bigram)) for bigram in tweet_bigrams}
bigram_negative_sentiment = {bigram:get_sentiment_score(" ".join(bigram),sentiment='negative') for bigram in tweet_bigrams}

In [138]:
[key for key in bigram_positive_sentiment.keys() if bigram_positive_sentiment[key]!=bigram_negative_sentiment[key]]

[('like', 'sedan'),
 ('sooooooooooo', 'tired'),
 ('tired', 'sparrow'),
 ('not', 'worry'),
 ('worry', 'vote'),
 ('col', 'love'),
 ('love', 'much'),
 ('ever', 'good'),
 ('good', 'love'),
 ('love', 'knitpicks'),
 ('sorry', 'not'),
 ('get', 'ready'),
 ('ready', 'church'),
 ('need', 'shake'),
 ('shake', 'gloomy'),
 ('gloomy', 'feeling'),
 ('feeling', 'maybe'),
 ('anything', 'fun'),
 ('cheer', 'follow'),
 ('yea', 'alone'),
 ('good', 'morning'),
 ('morning', 'bless'),
 ('bless', 'another'),
 ('another', 'share'),
 ('share', 'light'),
 ('light', 'love'),
 ('love', 'others'),
 ('well', 'thanks'),
 ('thanks', 'lol'),
 ('lol', 'want'),
 ('play', 'against'),
 ('nothing', 'like'),
 ('like', 'si'),
 ('not', 'good'),
 ('good', 'without'),
 ('lol', 'help'),
 ('help', 'assess'),
 ('hate', 'people'),
 ('sad', 'iranelection'),
 ('wah', 'glad'),
 ('glad', 'cut'),
 ('cut', 'run'),
 ('open', 'sentence'),
 ('sentence', 'describe'),
 ('answer', 'please'),
 ('please', 'try'),
 ('try', 'love'),
 ('love', 'music

In [146]:
beta1_positive = pd.DataFrame(1.0,index=words,columns=words)
for i,(bigram,sent_score) in enumerate(bigram_positive_sentiment.items()):
    beta1_positive.loc[bigram[0],bigram[1]]=sent_score

beta1_negative = pd.DataFrame(1.0,index=words,columns=words)
for i,(bigram,sent_score) in enumerate(bigram_negative_sentiment.items()):
    beta1_negative.loc[bigram[0],bigram[1]]=sent_score

In [149]:
beta2_positive = np.array(list(word_positive_sentiment.values()))
beta2_negative = np.array(list(word_negative_sentiment.values()))

In [150]:
positive_mat = beta1_positive*coc_mat + beta2_positive*np.array(list(word_prob.values()))
negative_mat = beta1_negative*coc_mat + beta2_negative*np.array(list(word_prob.values()))

In [151]:
prob_positive_mat = positive_mat.div(positive_mat.sum(axis=1), axis=0).fillna(0)
prob_negative_mat = negative_mat.div(negative_mat.sum(axis=1), axis=0).fillna(0)

In [152]:
def gen_sentence(n,smoothing=False,polarity='neutral'):
    if polarity =='positive':
        mat = prob_positive_mat
    elif polarity =='negative':
        mat = prob_negative_mat
    elif smoothing ==True:
        mat = smooth_mat
    else:
        mat = prob_mat
    word = random.choice(list(words))
    sentence = [word]
    for i in range(n-1):
        word = np.array(words)[np.random.multinomial(1, mat.loc[word])==1][0]
        sentence.append(word)
    return " ".join(sentence)

Generate 250 positive and 250 negative sentences

In [206]:
generated_data = pd.DataFrame(columns=['TEXT','LABEL'])
for i in range(250):
    sen = gen_sentence(7,polarity='positive')
    generated_data.loc[i,'TEXT'] = sen
    generated_data.loc[i,'LABEL'] = vader_class(sen)

for i in range(250):
    sen = gen_sentence(7,polarity='negative')
    generated_data.loc[i+250,'TEXT'] = sen
    generated_data.loc[i+250,'LABEL'] = vader_class(sen)

generated_data

Unnamed: 0,TEXT,LABEL
0,hood cheese lose much love goo goo,1
1,improvement ear disappear already glorious sun...,1
2,presentation care thanks dude ya love pour,1
3,info but true miss lol need help,1
4,benjerrys laugh daily love god damn like,1
...,...,...
495,bull faulty battery shit chemical get stop,0
496,kathy no bueno time man miss stupid,0
497,november 23rd emblem angry never beautiful place,0
498,bum hurt after attack migrant student week,0


In [185]:
og_data = pd.DataFrame(columns=['TEXT','LABEL'])
og_data['TEXT']=df['NEW_TEXT']
og_data['LABEL']=df['LABEL']

In [186]:
og_data

Unnamed: 0,TEXT,LABEL
0,get threaded scar,0
1,like sedan mango yesterday,1
2,car after shower bed sooooooooooo tired sparro...,1
3,actually start afternoon try something wed slo...,1
4,www gid not worry vote nonstop col love much,1
...,...,...
4282,performance clip test shock,1
4283,gh no rcn true blood episode demand 1013,0
4284,return forest sarah mercy lose key wood,1
4285,proud dad piece keep up papa,1


In [187]:
combined_data = pd.concat([og_data,generated_data],axis=0).sample(frac=1).reset_index(drop=True)

In [189]:
combined_data.to_csv('combined_data.csv')

In [191]:
generated_data.to_csv('generated_data.csv')

Perplexity calculation

In [192]:
def calculate_perplexity(sentence):
    words = sentence.split()
    bigrams = list(zip(words,words[1:]))
    prob = 1
    for bigram in bigrams:
        prob = prob*smooth_mat.loc[bigram[0],bigram[1]]
    
    return (1/prob)**(1/len(words))
    

In [207]:
average_perplexity = np.mean([calculate_perplexity(sentence) for sentence in generated_data.TEXT])
average_perplexity

62.21735592764733

Alternate Approaches

In [202]:
poor_positive_mat = coc_mat + beta2_positive*np.array(list(word_prob.values()))
poor_negative_mat = coc_mat + beta2_negative*np.array(list(word_prob.values()))


prob_poor_positive_mat = poor_positive_mat.div(poor_positive_mat.sum(axis=1), axis=0).fillna(0)
prob_poor_negative_mat = poor_negative_mat.div(poor_negative_mat.sum(axis=1), axis=0).fillna(0)

In [203]:
def gen_poor_sentence(n,smoothing=False,polarity='neutral'):
    if polarity =='positive':
        mat = prob_poor_positive_mat
    elif polarity =='negative':
        mat = prob_poor_negative_mat
    elif smoothing ==True:
        mat = smooth_mat
    else:
        mat = prob_mat
    word = random.choice(list(words))
    sentence = [word]
    for i in range(n-1):
        word = np.array(words)[np.random.multinomial(1, mat.loc[word])==1][0]
        sentence.append(word)
    return " ".join(sentence)

In [217]:
generated_data_poor = pd.DataFrame(columns=['TEXT','LABEL'])
for i in range(250):
    sen = gen_poor_sentence(7,polarity='positive')
    generated_data_poor.loc[i,'TEXT'] = sen
    generated_data_poor.loc[i,'LABEL'] = vader_class(sen)

for i in range(250):
    sen = gen_poor_sentence(7,polarity='negative')
    generated_data_poor.loc[i+250,'TEXT'] = sen
    generated_data_poor.loc[i+250,'LABEL'] = vader_class(sen)

generated_data_poor

Unnamed: 0,TEXT,LABEL
0,fear find perfect end need rave appreciate,1
1,daw dog die drug sky hope great,1
2,sword friend 2 sleep but hop try,1
3,jun wow not easily offend thanks take,1
4,case hopefully not up witty hope relaxing,1
...,...,...
495,artwork check diner chumsno poor little while,0
496,parliament bad bitch cordelia hate home monday,0
497,frankly sick get good timefollow fail since,0
498,money 55 1 athlete upset disappointed sad,0


In [220]:
average_perplexity = np.mean([calculate_perplexity(sentence) for sentence in generated_data_poor.TEXT])
average_perplexity

202.41238382633253

In [221]:
generated_data_poor.to_csv('generated_data_poor.csv')

In [222]:
combined_data_poor = pd.concat([og_data,generated_data_poor],axis=0).sample(frac=1).reset_index(drop=True)
combined_data_poor.to_csv('combined_data_poor.csv')