<a href="https://colab.research.google.com/github/TrevinWacker/NLP-practice/blob/main/nlp_TextGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import random
import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter

from sqlalchemy import create_engine

!pip install markovify
import markovify



In [4]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'twitter_sentiment'

warnings.filterwarnings("ignore")

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))


twitter_df = pd.read_sql_query('SELECT * FROM twitter',con=engine)

engine.dispose()

In [5]:
twitter_df.head()
# We're looking at feature "text" and we need to remove all of the @VirginAmerica at the beginning

Unnamed: 0,index,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [11]:
# Create spacy object to get text from DataFrame
nlp = spacy.load('en', disable=['parser', 'ner'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# This allows for more characters to be stored than what was initially allowed(?)
nlp.max_length = 20000000

In [38]:
twitter_df["airline"].unique()

array(['Virgin America', 'United', 'American', 'Southwest', 'Delta',
       'US Airways'], dtype=object)

In [75]:
# Remove @ from all text threads; we already know the airline provider from the 'airline' feature
for i in range(twitter_df.shape[0]):
  twitter_df.loc[i,"text"] = re.sub(r'\@VirginAmerica',"",twitter_df.loc[i,"text"])
  twitter_df.loc[i,"text"] = re.sub(r'\@AmericanAir',"",twitter_df.loc[i,"text"])
  twitter_df.loc[i,"text"] = re.sub(r'\@united',"",twitter_df.loc[i,"text"])
  twitter_df.loc[i,"text"] = re.sub(r'\@SouthwestAir',"",twitter_df.loc[i,"text"])
  twitter_df.loc[i,"text"] = re.sub(r'\@JetBlue',"",twitter_df.loc[i,"text"])
  twitter_df.loc[i,"text"] = re.sub(r'\@USAirways',"",twitter_df.loc[i,"text"])

# Get quotes out of the text feature in the DataFrame
tweets = nlp(" ".join(twitter_df.text))

In [54]:
# Complete some initial analysis
print("'tweets' is a {} object.".format(type(tweets)))
print("It is {} tokens long".format(len(tweets)))
print("The first one hundred tokens are '{}'".format(tweets[:100]))
print("The type of each token is {}".format(type(tweets[0])))

'tweets' is a <class 'spacy.tokens.doc.Doc'> object.
It is 303253 tokens long
The first one hundred tokens are 'What @dhepburn said. plus you've added commercials to the experience... tacky. I didn't today... Must mean I need to take another trip! it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse and it's a really big bad thing about it seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA yes, nearly every time I fly VX this “ear worm'
The type of each token is <class 'spacy.tokens.token.Token'>


In [67]:
# Create new class that delivers new sentences following correct grammatical structures

class POSifiedText(markovify.Text):
    
    def word_split(self, sentence):
        return ["::".join((word.orth_, word.pos_)) for word in nlp(sentence)]

    def word_join(self, words):
        sentence = " ".join(word.split("::")[0] for word in words)
        return sentence

# Train a Markov chain model by using only the negative sentiment tweets. Generate some random sentences. Do the generated sentences exhibit the same negative sentiment?



In [62]:
# Get only negative tweets
negative_tweets = nlp(" ".join(twitter_df[twitter_df["airline_sentiment"] == 'negative'].text))

#Getting sentences with no punctuation or stopwords
negative_tweets = [sent.text for sent in negative_tweets.sents if len(sent.text) > 1]

negative_tweets[:4]

['it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse and it\'s a really big bad thing about it seriously would pay $30 a flight for seats that didn\'t have this playing.',
 "\nit's really the only bad thing about flying VA SFO-PDX schedule is still MIA.",
 " I flew from NYC to SFO last week and couldn't fully sit in my seat due to two large gentleman on either side of me.",
 'HELP!']

In [68]:
negative_sent_generator = POSifiedText(negative_tweets,state_size=3)

for i in range(5):
  print(negative_sent_generator.make_sentence())
  print()

None

Every flight with you guys was the way to go   DTV does nt work , pilots Late Flight , div to phx & amp ; I 'm gon na ignore the fasten seatbelt sign and I want a confirmation !

Despite all our efforts you did zero to keep her safe ..... alone on a flight tomorrow even though I had 1st class seat orig I have a simple question and the phone line does not even let me stay on hold ?

Very frustrating and the gate is missing .

I ca n't really afford another 4 hours now and still on tarmac .



The generated tweets aren't very coherent but they do show a negative sentiment.

#Do the same generation but for all positive tweets

In [69]:
# Get only positive tweets
positive_tweets = nlp(" ".join(twitter_df[twitter_df["airline_sentiment"] == 'positive'].text))

#Getting sentences with no punctuation or stopwords
positive_tweets = [sent.text for sent in positive_tweets.sents if len(sent.text) > 1]

positive_tweets[:4]

["plus you've added commercials to the experience... tacky.",
 "yes, nearly every time I fly VX this “ear worm” won’t go away :) @virginamerica Well, I didn't…but NOW I DO!",
 ':-D it was amazing, and arrived an hour early.',
 "You're too good to me."]

In [73]:
positive_sent_generator = POSifiedText(positive_tweets,state_size=2)

for i in range(5):
  print(positive_sent_generator.make_sentence(tries=50))
  print()

everything 's good to hear my bag not making it to San Antonio and your Twitter feed is clearly extremely useful .

Love the concept of # DivadaPouch aka # ThePoopQueen http://t.co/XXY2d2iMnP   Once again , safety first ! !

Already thinking about my 2nd trip to the T , realtime appreciation from JetBlue # rockingthetweets # JVMChat   @JayVig @roxydigital awww ^_^ R to the captain and crew !

I look forward to my global first class on that flight by gate agent Jan L at Phoenix was at least !

destinationdragons   that 's great .



These tweets have an obvious general sentiment.  Also, adjusting the state_size to 2 produced more coherent tweets; this may be due to the compressed nature of tweets requiring more concise language so having only immediate surrounding words affecting text generation may be more helpful.

# Train on all tweets.

---

In [None]:
tweets = [sent.text for sent in tweets.sents if len(sent.text) > 1]

In [78]:
alltweets_sent_generator = POSifiedText(tweets,state_size=3)

for i in range(5):
  print(alltweets_sent_generator.make_sentence())
  print()

to Increase Charter Service to Cuba - # Travel Agent http://t.co/lYQrb4HCYU   you guys should re - read it .

Need to be in California 5hrs ago   I need to change a flight   Delayed more and more these days .

So when was I supposed to watch Scandal ?

Hopefully the bag is , but we have WiFi – just saying .

I am seriously tired of being treated this way particularly when I was sound asleep .



These all lean negative, which would either indicate there are more negative tweets in the entire dataset so we're more likely to see something negative, a lot of the data is mislabeled and we really just have a lot of negative tweets, or even positive tweets have a lot of negative/ambiguous language within them.