#Trump Tweet Analysis, Topic Modeling and Prediction of Energy Stock Prices

In [2]:
from pyspark.sql import SparkSession
import numpy as np

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

import pyspark
from pyspark.ml import feature, regression, Pipeline, classification, pipeline, evaluation
from pyspark.sql import functions as fn, Row
from pyspark.sql.functions import when, regexp_extract, col
from pyspark import sql
from pyspark.sql.functions import *

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorIndexer

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sys
import re

In [3]:
#Importing the tweet file and converting it to a Data frame

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

trumpTweet_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true').option("delimiter", ",").option("escape", "\"").load('/FileStore/tables/trumptweets_2-a866d.csv')

In [4]:
#Converting the dataframe in pandas

trumpTweet_copy_df = trumpTweet_df
trumpTweet_pandas = trumpTweet_copy_df.toPandas()

##Initial Data Cleaning

In [6]:
#Removing special characters
trumpTweet_pandas['content'] = trumpTweet_pandas['content'].apply(lambda x: re.split('https:\/\/*', str(x))[0])

In [7]:
#Replacing blanks with 'nan'
trumpTweet_pandas['content'].replace('', np.nan, inplace=True)

In [8]:
trumpTweet_pandas.dropna(subset=['content'], inplace=True)

In [9]:
trumpTweet_pandas['content'] = trumpTweet_pandas['content'].apply(lambda x: re.split('http:\/\/*', str(x))[0])

In [10]:
#Grouping the tweets according to specific dates as we would merge this dataframe later on with stock prices per date

trumpTweet_groupedDate = trumpTweet_pandas.groupby(['date'])['content'].apply('-'.join).reset_index()

In [11]:
display(trumpTweet_groupedDate)

date,content
1/1/18,"HAPPY NEW YEAR! We are MAKING AMERICA GREAT AGAIN, and much faster than anyone thought possible!-The United States has foolishly given Pakistan more than 33 billion dollars in aid over the last 15 years, and they have given us nothing but lies & deceit, thinking of our leaders as fools. They give safe haven to the terrorists we hunt in Afghanistan, with little help. No more!-Iran is failing at every level despite the terrible deal made with them by the Obama Administration. The great Iranian people have been repressed for many years. They are hungry for food & for freedom. Along with human rights, the wealth of Iran is being looted. TIME FOR CHANGE!-Will be leaving Florida for Washington (D.C.) today at 4:00 P.M. Much work to be done, but it will be a great New Year!"
1/1/19,"HAPPY NEW YEAR!pic.twitter.com/bHoPDPQ7G6-MEXICO IS PAYING FOR THE WALL through the many billions of dollars a year that the U.S.A. is saving through the new Trade Deal, the USMCA, that will replace the horrendous NAFTA Trade Deal, which has so badly hurt our Country. Mexico & Canada will also thrive - good for all!-The Democrats will probably submit a Bill, being cute as always, which gives everything away but gives NOTHING to Border Security, namely the Wall. You see, without the Wall there can be no Border Security - the Tech “stuff” is just, by comparison, meaningless bells & whistles...-...Remember this. Throughout the ages some things NEVER get better and NEVER change. You have Walls and you have Wheels. It was ALWAYS that way and it will ALWAYS be that way! Please explain to the Democrats that there can NEVER be a replacement for a good old fashioned WALL!-Dr. Sebastian Gorka, a very good and talented guy, has a great new book just out, “Why We Fight.” Lots of insight - Enjoy!-HAPPY NEW YEAR TO EVERYONE, INCLUDING THE HATERS AND THE FAKE NEWS MEDIA! 2019 WILL BE A FANTASTIC YEAR FOR THOSE NOT SUFFERING FROM TRUMP DERANGEMENT SYNDROME. JUST CALM DOWN AND ENJOY THE RIDE, GREAT THINGS ARE HAPPENING FOR OUR COUNTRY!-Happy New Year!-The Democrats, much as I suspected, have allocated no money for a new Wall. So imaginative! The problem is, without a Wall there can be no real Border Security - and our Country must finally have a Strong and Secure Southern Border!-“General” McChrystal got fired like a dog by Obama. Last assignment a total bust. Known for big, dumb mouth. Hillary lover!-One thing has now been proven. The Democrats do not care about Open Borders and all of the crime and drugs that Open Borders bring!-Congratulations to President @ JairBolsonaro who just made a great inauguration speech - the U.S.A. is with you!-Border Security and the Wall “thing” and Shutdown is not where Nancy Pelosi wanted to start her tenure as Speaker! Let’s make a deal?-Gas prices are low and expected to go down this year. This would be good!-Washington Examiner - “MAGA list: 205 ‘historic results’ help Trump make case for 2020 re-election.” True!"
1/1/20,"Get this straightened out, Governor @ GavinNewsom-Wonderful account of U.S. Embassy (Iraq) vs. the Benghazi disaster!-How is the Paris Accord doing? Don’t ask!-One of my greatest honors was to have gotten CHOICE approved for our great Veterans. Others have tried for decades, and failed!-Thank you to the @ dcexaminer Washington Examiner. The list is growing every day!-Thank you Steve. The greatest Witch Hunt in U.S. history!-Our fantastic First Lady!-HAPPY NEW YEAR!-pic.twitter.com/EVAEYD1AgV"
1/10/18,"Today, it was my great honor to sign a new Executive Order to ensure Veterans have the resources they need as they transition back to civilian life. We must ensure that our HEROES are given the care and support they so richly deserve! -As I made very clear today, our country needs the security of the Wall on the Southern Border, which must be part of any DACA approval.-Thank you @ GOPLeader Kevin McCarthy! Couldn’t agree w/you more. TOGETHER, we are # MAGA-. @ ICEgov HSI agents and ERO officers, on behalf of an entire Nation, THANK YOU for what you are doing 24/7/365 to keep fellow American’s SAFE. Everyone is so grateful! # LawEnforcementAppreciationDay President @ realDonaldTrump-It just shows everyone how broken and unfair our Court System is when the opposing side in a case (such as DACA) always runs to the 9th Circuit and almost always wins before being reversed by higher courts.-The fact that Sneaky Dianne Feinstein, who has on numerous occasions stated that collusion between Trump/Russia has not been found, would release testimony in such an underhanded and possibly illegal way, totally without authorization, is a disgrace. Must have tough Primary!-The single greatest Witch Hunt in American history continues. There was no collusion, everybody including the Dems knows there was no collusion, & yet on and on it goes. Russia & the world is laughing at the stupidity they are witnessing. Republicans should finally take control!-I want to thank my @ Cabinet for working tirelessly on behalf of our country. 2017 was a year of monumental achievement and we look forward to the year ahead. Together, we are delivering results and MAKING AMERICA GREAT AGAIN! -Today, it was my great honor to welcome Prime Minister Erna Solberg of Norway to the @ WhiteHouse - a great friend and ally of the United States! Joint press conference:"
1/10/19,"The Mainstream Media has NEVER been more dishonest than it is now. NBC and MSNBC are going Crazy. They report stories, purposely, the exact opposite of the facts. They are truly the Opposition Party working with the Dems. May even be worse than Fake News CNN, if that is possible!-Gave an OFF THE RECORD luncheon, somewhat of a White House tradition or custom, to network anchors yesterday - and they quickly leaked the contents of the meeting. Who would believe how bad it has gotten with the mainstream media, which has gone totally bonkers!-Cryin Chuck told his favorite lie when he used his standard sound bite that I “slammed the table & walked out of the room. He had a temper tantrum.” Because I knew he would say that, and after Nancy said no to proper Border Security, I politely said bye-bye and left, no slamming!-There is GREAT unity with the Republicans in the House and Senate, despite the Fake News Media working in overdrive to make the story look otherwise. The Opposition Party & the Dems know we must have Strong Border Security, but don’t want to give “Trump” another one of many wins!-“Great support for Border Security and the Wall.” @ foxandfriends Even greater than anyone would know! “Presidents supporters do not want him to cave.” @ SteveDoocy I won’t!-Getting ready to leave for the Great State of Texas! # MAGA-MAKE AMERICA GREAT AGAIN!-President Obama, thank you for your great support – I have been saying this all along!pic.twitter.com/L506g9Aq4z-Because of the Democrats intransigence on Border Security and the great importance of Safety for our Nation, I am respectfully cancelling my very important trip to Davos, Switzerland for the World Economic Forum. My warmest regards and apologies to the @ WEF!-From the Southern Border....pic.twitter.com/Vgsf5nEZUH"
1/10/20,"THANK YOU TOLEDO, OHIO!-Under my administration, we will NEVER make excuses for America’s enemies – we will never hesitate in defending American lives – and we will never stop working to defeat Radical Islamic Terrorism!pic.twitter.com/022PjwhHjs-After years of rebuilding OTHER nations, we are finally rebuilding OUR nation. We are finally putting AMERICA FIRST! # KAG2020pic.twitter.com/azKL54A6BU-Democrats are now the party of high taxes, high crime, open borders, late-term abortion, socialism, and blatant corruption. The Republican Party is the party of the American Worker, the American Family, and the American Dream! # KAG2020pic.twitter.com/05XRX2odxN-Wow! Thank you Greg. Hope I live up to your expectations.-Great interview this morning by @ foxandfriends with some of the fantastic people who attended the big Rally last night in Toledo, Ohio. Thank you. Such amazing energy!-I agree. Jovita will do a great job!-Thank you Mike!-We will get this done for our great Miners!-“11,000 points gained in the Dow in the 3 years since the Election of President Trump. Today it may hit 29,000. That has NEVER happened before in that time frame. That has added 12.8 Trillion Dollars to the VALUE of American Business.” @ Varneyco @ FoxNews The best is yet to come!-“I’ve been doing this for 40 years and I’ve never seen anything like this (Economy).” @ Varneyco @ foxandfriends-I love constantly proving them wrong. It’s easy!-She will go down as perhaps the least successful Speaker in U.S. History!"
1/11/18,"The United States needs the security of the Wall on the Southern Border, which must be part of any DACA approval. The safety and security of our country is #1!pic.twitter.com/4CFzQXb5aS-Cutting taxes and simplifying regulations makes America the place to invest! Great news as Toyota and Mazda announce they are bringing 4,000 JOBS and investing $1.6 BILLION in Alabama, helping to further grow our economy!pic.twitter.com/Kcg8IVH6iA-Good news: Toyota and Mazda announce giant new Huntsville, Alabama, plant which will produce over 300,000 cars and SUV’s a year and employ 4000 people. Companies are coming back to the U.S. in a very big way. Congratulations Alabama!-Disproven and paid for by Democrats “Dossier used to spy on Trump Campaign. Did FBI use Intel tool to influence the Election?” @ foxandfriends Did Dems or Clinton also pay Russians? Where are hidden and smashed DNC servers? Where are Crooked Hillary Emails? What a mess!-In new Quinnipiac Poll, 66% of people feel the economy is “Excellent or Good.” That is the highest number ever recorded by this poll.-“House votes on controversial FISA ACT today.” This is the act that may have been used, with the help of the discredited and phony Dossier, to so badly surveil and abuse the Trump Campaign by the previous administration and others?-“45 year low in illegal immigration this year.” @ foxandfriends-With that being said, I have personally directed the fix to the unmasking process since taking office and today’s vote is about foreign surveillance of foreign bad guys on foreign land. We need it! Get smart!-Great news, as a result of our TAX CUTS & JOBS ACT!pic.twitter.com/SLvhLxP3Jl"
1/11/19,"Dear Diary...-Will be interviewed at the Border by @ seanhannity on @ FoxNews tonight at 9:00. Enjoy!-We lose 300 Americans a week, 90% of which comes through the Southern Border. These numbers will be DRASTICALLY REDUCED if we have a Wall!-When I took the Oath of Office....pic.twitter.com/GDhIqteKpv-I often said during rallies, with little variation, that “Mexico will pay for the Wall.” We have just signed a great new Trade Deal with Mexico. It is Billions of Dollars a year better than the very bad NAFTA deal which it replaces. The difference pays for Wall many times over!-H1-B holders in the United States can rest assured that changes are soon coming which will bring both simplicity and certainty to your stay, including a potential path to citizenship. We want to encourage talented and highly skilled people to pursue career options in the U.S.-Humanitarian Crisis at our Southern Border. I just got back and it is a far worse situation than almost anyone would understand, an invasion! I have been there numerous times - The Democrats, Cryin’ Chuck and Nancy don’t know how bad and dangerous it is for our ENTIRE COUNTRY....-...The Steel Barrier, or Wall, should have been built by previous administrations long ago. They never got it done - I will. Without it, our Country cannot be safe. Criminals, Gangs, Human Traffickers, Drugs & so much other big trouble can easily pour in. It can be stopped cold!-The Fake News Media keeps saying we haven’t built any NEW WALL. Below is a section just completed on the Border. Anti-climbing feature included. Very high, strong and beautiful! Also, many miles already renovated and in service!pic.twitter.com/UAAGXl5Byr"
1/11/20,"Will be interviewed tonight by Laura @ IngrahamAngle at 10pmE on @ FoxNews. Enjoy!-“FBI Director apologizes for FISA Errors (of which there were far to many to be a coincidence!).” @ FoxNews Chris, what about all of the lives that were ruined because of the so-called “errors?” Are these “dirty cops” going to pay a big price for the fraud they committed?-Where have the Radical Left, Do Nothing Democrats gone when they have spent the last 3 days defending the life of Qassem Soleimani, one of the worst terrorists in history and the father of the roadside bomb? He was also looking to do big future damage! Dems are “unhinged.”-New polling shows that the totally partisan Impeachment Hoax is going nowhere. A vast majority want the Do Nothing Democrats to move on to other things now!-Nancy Pelosi will go down as the absolute worst Speaker of the House in U.S. history!-Now the Radical Left, Do Nothing Democrats, are asking @ senatemajldr Mitch McConnell to do the job that they were unable to do. They proved NOTHING but my total innocence in the House, despite the most unfair & biased hearings in the history of Congress. Now they demand fairness!-95% Approval Rating in the Republican Party, a record. 53% Approval Rating overall (can we add 7 to 10 percent because of the Trump “thing?”). Thank you!-The powerful Trump Wall is replacing porous, useless and ineffective barriers in the high traffic areas requested by Border Patrol. Illegal crossing are dropping as more and more Wall is being completed! # BuildingTheWallpic.twitter.com/2kdHNSMM04-To the brave, long-suffering people of Iran: I've stood with you since the beginning of my Presidency, and my Administration will continue to stand with you. We are following your protests closely, and are inspired by your courage.-به مردم شجاع و رنج کشیده ایران: من از ابتدای دوره ریاست جمهوریم با شما ایستاده‌ام و دولت من همچنان با شما خواهد ایستاد. ما اعتراضات شما را از نزدیک دنبال می کنیم. شجاعت شما الهام بخش است.-The government of Iran must allow human rights groups to monitor and report facts from the ground on the ongoing protests by the Iranian people. There can not be another massacre of peaceful protesters, nor an internet shutdown. The world is watching.-دولت ایران باید به گروه‌های حقوق بشر اجازه بدهد حقیقت کنونی اعتراضات در جریان مردم ایران را نظارت کرده و گزارش بدهند. نباید شاهد کشتار دوباره ی معترضان مسالمت آمیز و یا قطع اینترنت باشیم. جهان نظاره گر این اتفاقات است."
1/12/18,"Yesterday, I signed the # INTERDICTAct (H.R. 2142) with bipartisan members of Congress to help end the flow of drugs into our country. Together, we are committed to doing everything we can to combat the deadly scourge of drug addiction and overdose in the United States!pic.twitter.com/ELZvFol5Lo-Thank you Adam Levine, The Federalist, in interview on @ foxandfriends “Donald Trump is the greatest President our Country has ever seen.”-Small Business Poll has highest approval numbers in the polls history. All business is just at the beginning of something really special!-More great news as a result of historical Tax Cuts and Reform: Fiat Chrysler announces plan to invest more than $1 BILLION in Michigan plant, relocating their heavy-truck production from Mexico to Michigan, adding 2,500 new jobs and paying $2,000 bonus to U.S. employees!pic.twitter.com/47azKD0l9B-Chrysler is moving a massive plant from Mexico to Michigan, reversing a years long opposite trend. Thank you Chrysler, a very wise decision. The voters in Michigan are very happy they voted for Trump/Pence. Plenty of more to follow!-Democrat Dianne Feinstein should never have released secret committee testimony to the public without authorization. Very disrespectful to committee members and possibly illegal. She blamed her poor decision on the fact she had a cold - a first!-The Democrats seem intent on having people and drugs pour into our country from the Southern Border, risking thousands of lives in the process. It is my duty to protect the lives and safety of all Americans. We must build a Great Wall, think Merit and end Lottery & Chain. USA!-Reason I canceled my trip to London is that I am not a big fan of the Obama Administration having sold perhaps the best located and finest embassy in London for “peanuts,” only to build a new one in an off location for 1.2 billion dollars. Bad deal. Wanted me to cut ribbon-NO!-The so-called bipartisan DACA deal presented yesterday to myself and a group of Republican Senators and Congressmen was a big step backwards. Wall was not properly funded, Chain & Lottery were made worse and USA would be forced to take large numbers of people from high crime.....-....countries which are doing badly. I want a merit based system of immigration and people who will help take our country to the next level. I want safety and security for our people. I want to stop the massive inflow of drugs. I want to fund our military, not do a Dem defund....-....Because of the Democrats not being interested in life and safety, DACA has now taken a big step backwards. The Dems will threaten “shutdown,” but what they are really doing is shutting down our military, at a time we need it most. Get smart, MAKE AMERICA GREAT AGAIN!-The language used by me at the DACA meeting was tough, but this was not the language used. What was really tough was the outlandish proposal made - a big setback for DACA!-Sadly, Democrats want to stop paying our troops and government workers in order to give a sweetheart deal, not a fair deal, for DACA. Take care of our Military, and our Country, FIRST!-Never said anything derogatory about Haitians other than Haiti is, obviously, a very poor and troubled country. Never said “take them out.” Made up by Dems. I have a wonderful relationship with Haitians. Probably should record future meetings - unfortunately, no trust!-Today, it was my great honor to proclaim January 15, 2018, as Martin Luther King Jr., Federal Holiday. I encourage all Americans to observe this day with appropriate civic, community, and service activities in honor of Dr. King's life and legacy.pic.twitter.com/samlJsz1Nt"


In [12]:
import string

In [13]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

trumpTweet_groupedDate['cleaned_content'] = trumpTweet_groupedDate['content'].apply(lambda x: remove_punct(x))
trumpTweet_groupedDate.head(10)

Unnamed: 0,date,content,cleaned_content
0,1/1/18,HAPPY NEW YEAR! We are MAKING AMERICA GREAT AG...,HAPPY NEW YEAR We are MAKING AMERICA GREAT AGA...
1,1/1/19,HAPPY NEW YEAR!pic.twitter.com/bHoPDPQ7G6-MEXI...,HAPPY NEW YEARpictwittercombHoPDPQGMEXICO IS P...
2,1/1/20,"Get this straightened out, Governor @ GavinNew...",Get this straightened out Governor GavinNewso...
3,1/10/18,"Today, it was my great honor to sign a new Exe...",Today it was my great honor to sign a new Exec...
4,1/10/19,The Mainstream Media has NEVER been more disho...,The Mainstream Media has NEVER been more disho...
5,1/10/20,"THANK YOU TOLEDO, OHIO!-Under my administratio...",THANK YOU TOLEDO OHIOUnder my administration w...
6,1/11/18,The United States needs the security of the Wa...,The United States needs the security of the Wa...
7,1/11/19,Dear Diary...-Will be interviewed at the Borde...,Dear DiaryWill be interviewed at the Border by...
8,1/11/20,Will be interviewed tonight by Laura @ Ingraha...,Will be interviewed tonight by Laura Ingraham...
9,1/12/18,"Yesterday, I signed the # INTERDICTAct (H.R. 2...",Yesterday I signed the INTERDICTAct HR with ...


In [14]:
#display(trumpTweet_groupedDate)

In [15]:
def tokenization(text):
    text = re.split('\W+', text)
    return text

trumpTweet_groupedDate['Tweet_tokenized'] = trumpTweet_groupedDate['cleaned_content'].apply(lambda x: tokenization(x.lower()))
#display(trumpTweet_groupedDate)

##Data Preprocessing (getting texts ready for topic modeling)

In [17]:

%sh 
pip install nltk
pip install --upgrade pip
python -m nltk.downloader all

In [18]:
import nltk

In [19]:
%sh pip install 
python -m spacy download en

####Removing Stopwords

In [21]:
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text
    
trumpTweet_groupedDate['removed_stopwords'] = trumpTweet_groupedDate['Tweet_tokenized'].apply(lambda x: remove_stopwords(x))
#display(trumpTweet_groupedDate)

In [22]:

%sh 
pip install -U gensim


In [23]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

In [24]:
nltk.download('wordnet')

####Lemmatization

In [26]:
ps = nltk.PorterStemmer()
def lemmatize_stemming(text):
  word = WordNetLemmatizer().lemmatize(text, pos='v')
  #print(word)
  stemmed_word = nltk.PorterStemmer.stem(word)
  print(stemmed_word)
  return stemmed_word
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
      if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
        result.append(lemmatize_stemming(token))
        return result

In [27]:
#trumpTweet_groupedDate['preprocessed_text'] = trumpTweet_groupedDate['cleaned_content'].apply(lambda x: preprocess(x))

####Stemming

In [29]:
ps = nltk.snowball.EnglishStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

trumpTweet_groupedDate['stemmed_words'] = trumpTweet_groupedDate['removed_stopwords'].apply(lambda x: stemming(x))
trumpTweet_groupedDate.head()

Unnamed: 0,date,content,cleaned_content,Tweet_tokenized,removed_stopwords,stemmed_words
0,1/1/18,HAPPY NEW YEAR! We are MAKING AMERICA GREAT AG...,HAPPY NEW YEAR We are MAKING AMERICA GREAT AGA...,"[happy, new, year, we, are, making, america, g...","[happy, new, year, making, america, great, muc...","[happi, new, year, make, america, great, much,..."
1,1/1/19,HAPPY NEW YEAR!pic.twitter.com/bHoPDPQ7G6-MEXI...,HAPPY NEW YEARpictwittercombHoPDPQGMEXICO IS P...,"[happy, new, yearpictwittercombhopdpqgmexico, ...","[happy, new, yearpictwittercombhopdpqgmexico, ...","[happi, new, yearpictwittercombhopdpqgmexico, ..."
2,1/1/20,"Get this straightened out, Governor @ GavinNew...",Get this straightened out Governor GavinNewso...,"[get, this, straightened, out, governor, gavin...","[get, straightened, governor, gavinnewsomwonde...","[get, straighten, governor, gavinnewsomwond, a..."
3,1/10/18,"Today, it was my great honor to sign a new Exe...",Today it was my great honor to sign a new Exec...,"[today, it, was, my, great, honor, to, sign, a...","[today, great, honor, sign, new, executive, or...","[today, great, honor, sign, new, execut, order..."
4,1/10/19,The Mainstream Media has NEVER been more disho...,The Mainstream Media has NEVER been more disho...,"[the, mainstream, media, has, never, been, mor...","[mainstream, media, never, dishonest, nbc, msn...","[mainstream, media, never, dishonest, nbc, msn..."


In [30]:
wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text

trumpTweet_groupedDate['Tweet_lemmatized'] = trumpTweet_groupedDate['removed_stopwords'].apply(lambda x: lemmatizer(x))
trumpTweet_groupedDate.head()

Unnamed: 0,date,content,cleaned_content,Tweet_tokenized,removed_stopwords,stemmed_words,Tweet_lemmatized
0,1/1/18,HAPPY NEW YEAR! We are MAKING AMERICA GREAT AG...,HAPPY NEW YEAR We are MAKING AMERICA GREAT AGA...,"[happy, new, year, we, are, making, america, g...","[happy, new, year, making, america, great, muc...","[happi, new, year, make, america, great, much,...","[happy, new, year, making, america, great, muc..."
1,1/1/19,HAPPY NEW YEAR!pic.twitter.com/bHoPDPQ7G6-MEXI...,HAPPY NEW YEARpictwittercombHoPDPQGMEXICO IS P...,"[happy, new, yearpictwittercombhopdpqgmexico, ...","[happy, new, yearpictwittercombhopdpqgmexico, ...","[happi, new, yearpictwittercombhopdpqgmexico, ...","[happy, new, yearpictwittercombhopdpqgmexico, ..."
2,1/1/20,"Get this straightened out, Governor @ GavinNew...",Get this straightened out Governor GavinNewso...,"[get, this, straightened, out, governor, gavin...","[get, straightened, governor, gavinnewsomwonde...","[get, straighten, governor, gavinnewsomwond, a...","[get, straightened, governor, gavinnewsomwonde..."
3,1/10/18,"Today, it was my great honor to sign a new Exe...",Today it was my great honor to sign a new Exec...,"[today, it, was, my, great, honor, to, sign, a...","[today, great, honor, sign, new, executive, or...","[today, great, honor, sign, new, execut, order...","[today, great, honor, sign, new, executive, or..."
4,1/10/19,The Mainstream Media has NEVER been more disho...,The Mainstream Media has NEVER been more disho...,"[the, mainstream, media, has, never, been, mor...","[mainstream, media, never, dishonest, nbc, msn...","[mainstream, media, never, dishonest, nbc, msn...","[mainstream, medium, never, dishonest, nbc, ms..."


In [31]:
trumpTweet_groupedDate['Tweet_lemmatized_stemmed'] = trumpTweet_groupedDate['stemmed_words'].apply(lambda x: lemmatizer(x))

In [32]:
trumpTweet_groupedDate.head()

Unnamed: 0,date,content,cleaned_content,Tweet_tokenized,removed_stopwords,stemmed_words,Tweet_lemmatized,Tweet_lemmatized_stemmed
0,1/1/18,HAPPY NEW YEAR! We are MAKING AMERICA GREAT AG...,HAPPY NEW YEAR We are MAKING AMERICA GREAT AGA...,"[happy, new, year, we, are, making, america, g...","[happy, new, year, making, america, great, muc...","[happi, new, year, make, america, great, much,...","[happy, new, year, making, america, great, muc...","[happi, new, year, make, america, great, much,..."
1,1/1/19,HAPPY NEW YEAR!pic.twitter.com/bHoPDPQ7G6-MEXI...,HAPPY NEW YEARpictwittercombHoPDPQGMEXICO IS P...,"[happy, new, yearpictwittercombhopdpqgmexico, ...","[happy, new, yearpictwittercombhopdpqgmexico, ...","[happi, new, yearpictwittercombhopdpqgmexico, ...","[happy, new, yearpictwittercombhopdpqgmexico, ...","[happi, new, yearpictwittercombhopdpqgmexico, ..."
2,1/1/20,"Get this straightened out, Governor @ GavinNew...",Get this straightened out Governor GavinNewso...,"[get, this, straightened, out, governor, gavin...","[get, straightened, governor, gavinnewsomwonde...","[get, straighten, governor, gavinnewsomwond, a...","[get, straightened, governor, gavinnewsomwonde...","[get, straighten, governor, gavinnewsomwond, a..."
3,1/10/18,"Today, it was my great honor to sign a new Exe...",Today it was my great honor to sign a new Exec...,"[today, it, was, my, great, honor, to, sign, a...","[today, great, honor, sign, new, executive, or...","[today, great, honor, sign, new, execut, order...","[today, great, honor, sign, new, executive, or...","[today, great, honor, sign, new, execut, order..."
4,1/10/19,The Mainstream Media has NEVER been more disho...,The Mainstream Media has NEVER been more disho...,"[the, mainstream, media, has, never, been, mor...","[mainstream, media, never, dishonest, nbc, msn...","[mainstream, media, never, dishonest, nbc, msn...","[mainstream, medium, never, dishonest, nbc, ms...","[mainstream, medium, never, dishonest, nbc, ms..."


In [33]:
#Keeping all processed words in a dictionary which shall be later used for creating bag of words

dictionary = gensim.corpora.Dictionary(trumpTweet_groupedDate['Tweet_lemmatized_stemmed'])


In [34]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

In [35]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

###Creating Bag of word (words and their frequencies)

In [37]:
bow_corpus = [dictionary.doc2bow(doc) for doc in trumpTweet_groupedDate['Tweet_lemmatized_stemmed']]
len(bow_corpus)

In [38]:
bow_doc_1147 = bow_corpus[1147]
for i in range(len(bow_doc_1147)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1147[i][0], 
                                               dictionary[bow_doc_1147[i][0]], 
bow_doc_1147[i][1]))

In [39]:
%sh pip install wordcloud

## Visualization of Words (Exploratory Data Analysis of Texts)

#### Visualizing the word variety in the tweet content

In [42]:

from wordcloud import WordCloud
stopwords = set(STOPWORDS)

def show_wordcloud(data):
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=30,
        scale=3,
        random_state=1)
   
    wordcloud=wordcloud.generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')

    plt.imshow(wordcloud)
    plt.show()

show_wordcloud(trumpTweet_groupedDate['content'])

##Vectorizing and Visualizing texts

####Importing packages for topic modeling

In [45]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

####2) Visualizing Unigrams in the tweet contents before removing stop words

In [47]:
def get_top_ngram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) 
                  for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:10]
top_n_bigrams=get_top_ngram(trumpTweet_groupedDate['content'],1)[:10]
x,y=map(list,zip(*top_n_bigrams))
sns.barplot(x=y,y=x)

From the above Visualization we reckon that the frequency of the stopwords outcount the frequency of any potential significant word. Hence, defying the idea of direct proportionality between the number of words and its importance. In order to handle this situation we remove the stopwords and visualize the documents

####2) Visualizing Unigrams in the tweet contents after removing stop words

In [50]:
def get_top_ngram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(n, n), stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) 
                  for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:10]
top_n_bigrams=get_top_ngram(trumpTweet_groupedDate['content'],1)[:10]
x,y=map(list,zip(*top_n_bigrams))
sns.barplot(x=y,y=x)

####Approach 1 (Topic Model created using Bag of words i.e. Words and their frequencies)

In [52]:
#Topic Modeling algorithm

lda_model = gensim.models.LdaMulticore(bow_corpus, id2word=dictionary, passes=4, workers=2)

In [53]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [54]:
print('The nltk version is {}.'.format(nltk.__version__))

In [55]:
#One compact function for cleaning/preprocessing text for using in the vectorizer
def clean_text(text):
    text_tc = ''.join([w if ord(w) < 128 else ' ' for w in text])
    text_lc = "".join([word.lower() for word in text_tc if word not in string.punctuation]) # remove puntuation
    text_rc = re.sub('[0-9]+', '', text_lc)
    #text_nc = re.sub(r'[^\x00-\x7F]+','', text_rc)
    tokens = re.split('\W+', text_rc)    # tokenization
    text = [ps.stem(word) for word in tokens if word not in stopword]  # remove stopwords and stemming
    return text

####Approach 2 (Topic model created using vecorized words)

**Note**: The disadvantage of the previous approach is that the model is trained upon only the words that it finds within the document and based on their frequencies. On the other hand the advantage of the subsequent model is that it creates a word vector dataframe using various possibilities of words (n-grams). This helps the machine learn on the words that are present and the words that could have been there (as absent words)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [59]:
#Vectorizing the words
countVectorizer = CountVectorizer(analyzer=clean_text, token_pattern='[a-zA-Z0-9]{3,}', lowercase=True) 
countVector = countVectorizer.fit_transform(trumpTweet_groupedDate['content'])
print('{} Number of tweets has {} words'.format(countVector.shape[0], countVector.shape[1]))

In [60]:
countVector

In [61]:
type(corpus_tfidf)

### Word Vector

In [63]:
count_vect_df = pd.DataFrame(countVector.toarray(), columns=countVectorizer.get_feature_names())
count_vect_df

Unnamed: 0,Unnamed: 1,aaa,aap,aapour,ab,abandon,abba,abbaspictwittercomuzpivvrpgreat,abbott,abc,abcmr,abcnew,abcpoliticspictwittercomjixcsnzcpictwittercomyowcqlmad,abcwashington,abcworldnew,abduct,abdul,abdullah,abe,abedin,abepictwittercomeyyfwdqtoday,abepictwittercomlragojvdbplay,abeshinzo,abeshinzopictwittercomnwwxjlkxhthank,abeshinzopictwittercomqxmpcgrycfpastor,abeshinzopictwittercomuwejqnbexepictwittercomgraqvxrsmitch,abethank,abid,abig,abil,abl,ablepictwittercommctybguepi,aboard,abolish,abort,abound,aboutdepart,aboutth,aboutw,abraham,...,youtub,youv,youw,youwhat,youwhen,youwher,youwheth,youwhi,youwhil,youwho,youwil,yovanovitch,yrs,yrsdo,yucca,yulin,yuma,z,zaino,zapotoski,zealand,zealandpictwittercomtegqfkcmsjust,zeldin,zelenski,zell,zero,zfor,zika,zink,zito,zogbi,zoldan,zone,zoo,zoomappl,zoomsen,zte,zucker,zuckerberg,zuker
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [64]:
%sh pip install spacy

##Creating LDA model for topic modeling

In [66]:
import spacy as sp


In [67]:
# Materialize the sparse data
data_dense = countVector.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

In [68]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

In [69]:
lda_model = LatentDirichletAllocation(n_components = 20,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(countVector)

In [70]:
print(lda_model)

In [71]:
# Log Likelyhood: Higher the better (Likelyhood score tells us how good the model is performing. The more the magnitude the better the model)
print("Log Likelihood: ", lda_model.score(countVector))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(countVector))

# See model parameters
pprint(lda_model.get_params())

In [72]:
from sklearn.model_selection import GridSearchCV

####Running the Grid Search on LDA to find the best model

In [74]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(countVector)

In [75]:
# Best Model Parameters
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(countVector))

In [76]:
%sh pip install pyLDAvis

In [77]:
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [78]:
model.cv_results_

In [79]:
%sh pip install pyLDAvis

In [80]:
# Get Log Likelyhoods from Grid Search Output
n_components = [10, 15, 20, 25, 30]
log_likelyhoods_5 = [(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['params']) if gscore['learning_decay']==0.5]
log_likelyhoods_7 = [(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['params']) if gscore['learning_decay']==0.7]
log_likelyhoods_9 = [(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['params']) if gscore['learning_decay']==0.9]
# log_likelyhoods_5 = [round(gscore.mean_score_time) for gscore in model.cv_results_['params'] if gscore['learning_decay']==0.5]
# log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if gscore.parameters['learning_decay']==0.7]
# log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if gscore.parameters['learning_decay']==0.9]


In [81]:
comp_log_5 = {'n_components': n_components, 'log_likelyhoods_5': log_likelyhoods_5}

###Comparing models visually

In [83]:
# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_components, log_likelyhoods_5, label='0.5')
plt.plot(n_components, log_likelyhoods_7, label='0.7')
plt.plot(n_components, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

In [84]:
#Dominant topic in each document

# Create Document - Topic Matrix
lda_output = best_lda_model.transform(countVector)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(bow_corpus))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
#df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.99,9
1,0.29,0.00,0.00,0.00,0.00,0.00,0.00,0.16,0.00,0.55,9
2,0.61,0.00,0.00,0.00,0.00,0.38,0.00,0.00,0.00,0.00,0
3,0.00,0.00,0.73,0.00,0.00,0.00,0.00,0.00,0.00,0.26,2
4,0.00,0.00,0.00,0.00,0.13,0.00,0.00,0.00,0.00,0.86,9
5,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.99,0.00,8
6,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.22,0.00,0.77,9
7,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.99,9
8,0.99,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0
9,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,9


In [85]:
#Review topics distribution across documents

df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,9,484
1,0,256
2,2,148
3,1,120
4,8,101
5,7,15
6,6,15
7,4,4
8,5,3
9,3,2


###Visualizing word vectors through PCAs

In [87]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, countVector, countVectorizer, mds='tsne')
panel

**Note : ** The vectors created from the words are clustered and plotted on the graph. Each circle represents a cluster of words under one topic. Distance between each circle is measured between centers. We made an interactive visualization that represents the list of words on the right in each topic (i.e. circle) on the left.

In [89]:
#Topics keywords
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

# Assign Column and Index
df_topic_keywords.columns = countVectorizer.get_feature_names()
df_topic_keywords.index = topicnames

# View
df_topic_keywords

Unnamed: 0,Unnamed: 1,aaa,aap,aapour,ab,abandon,abba,abbaspictwittercomuzpivvrpgreat,abbott,abc,abcmr,abcnew,abcpoliticspictwittercomjixcsnzcpictwittercomyowcqlmad,abcwashington,abcworldnew,abduct,abdul,abdullah,abe,abedin,abepictwittercomeyyfwdqtoday,abepictwittercomlragojvdbplay,abeshinzo,abeshinzopictwittercomnwwxjlkxhthank,abeshinzopictwittercomqxmpcgrycfpastor,abeshinzopictwittercomuwejqnbexepictwittercomgraqvxrsmitch,abethank,abid,abig,abil,abl,ablepictwittercommctybguepi,aboard,abolish,abort,abound,aboutdepart,aboutth,aboutw,abraham,...,youtub,youv,youw,youwhat,youwhen,youwher,youwheth,youwhi,youwhil,youwho,youwil,yovanovitch,yrs,yrsdo,yucca,yulin,yuma,z,zaino,zapotoski,zealand,zealandpictwittercomtegqfkcmsjust,zeldin,zelenski,zell,zero,zfor,zika,zink,zito,zogbi,zoldan,zone,zoo,zoomappl,zoomsen,zte,zucker,zuckerberg,zuker
Topic0,76.597976,0.1,2.798184,0.1,0.100011,2.794244,0.1,0.1,0.1,3.545452,0.100009,1.1,0.100005,0.1,0.100002,0.1,0.1,0.1,5.224402,0.100008,0.1,0.1,0.100007,0.1,0.1,1.099979,0.100003,0.100016,0.10001,1.1,14.286271,0.1,0.100005,0.100013,0.100046,0.1,0.1,3.1,1.099974,1.1,...,0.1,0.1,1.099999,1.1,0.100022,1.1,1.099994,0.100026,0.10001,0.1,1.099997,2.099998,2.099989,0.1,0.100004,0.1,1.148269,0.100009,1.1,0.100015,0.1,0.1,2.010159,8.099986,1.1,37.437872,0.1,1.1,0.1,0.1,0.1,0.1,2.744538,1.099929,0.100002,1.1,0.1,0.100013,1.1,0.1
Topic1,34.426335,1.099998,0.1,1.1,0.1,0.100009,1.1,1.1,0.1,8.95125,0.1,0.1,0.1,0.7285,0.100009,1.099982,0.1,1.1,3.665837,0.651845,0.1,0.1,8.085204,1.1,0.1,0.1,0.1,0.1,0.1,0.100478,8.819019,0.1,3.099955,0.1,2.098198,0.1,0.1,0.1,0.1,1.09997,...,1.099985,0.100008,0.1,0.1,0.1,0.1,0.1,0.1,1.050687,0.1,0.1,0.1,1.1,0.1,0.1,0.1,0.100005,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,3.172772,0.1,0.1,0.1,0.1,0.1,0.1,0.100011,0.100071,0.1,0.1,0.100011,0.1,0.1,1.099985
Topic2,24.11882,0.1,1.401815,0.1,0.1,1.894418,0.1,0.1,0.1,0.100009,0.1,0.1,0.1,0.1,0.1,0.1,1.1,0.1,9.743061,0.1,0.100036,1.1,1.114795,0.1,0.1,0.1,0.100023,0.1,0.1,0.100018,1.912617,1.099972,0.100011,2.829119,0.792412,0.100015,0.1,0.1,0.1,0.1,...,0.1,1.099985,0.100018,0.1,1.099978,0.1,0.1,0.100006,0.1,1.099995,0.1,0.1,0.100027,0.1,0.1,1.1,1.09998,0.1,0.1,0.1,3.584816,1.099982,0.1,0.1,0.1,6.859825,0.1,0.1,0.1,0.1,0.100039,0.1,0.100009,0.1,0.1,0.1,0.10004,0.1,0.1,0.1
Topic3,0.100014,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100007,0.1,1.099991,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.051728,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100031,0.1,0.1,0.1,0.1,0.1,0.1,0.1
Topic4,1.854665,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100004,0.1,0.1,1.099979,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.10001,0.1,0.1,0.1,0.101781,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,1.099986,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100012,0.1,0.100005,0.1,0.1,0.1,1.099991,0.1,0.1,0.1,0.1,1.099988,0.1,0.10005,0.1,0.1,0.1
Topic5,1.160717,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100005,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.086891,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.10006,0.1,0.1,0.100009,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
Topic6,8.71343,0.1,0.1,0.1,0.1,0.135595,0.1,0.1,0.1,0.100034,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.099966,0.1,0.1,0.1,0.132046,0.100028,0.1,0.111082,1.066742,0.1,0.1,0.1,0.1,0.1,...,0.1,1.100001,0.1,0.1,0.1,0.1,0.1,1.035822,0.1,0.1,0.1,0.1,0.1,0.1,1.099993,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,2.099986,0.1,0.1
Topic7,2.573154,0.1,0.1,0.1,1.099985,0.1,0.1,0.1,0.1,0.100015,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100017,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.099984,0.1,0.1,0.10001,0.1,0.1,0.100023,0.941622,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100018,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.10002,0.1,0.1,0.1,0.1,0.1,0.1,2.100015,0.1,0.1,0.1,0.1,0.1,0.1,0.1
Topic8,22.682905,0.1,2.099999,0.1,0.1,0.491152,0.1,0.1,0.1,0.10006,1.099991,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100018,0.1,0.1,0.1,5.099992,0.1,1.099996,0.1,0.1,0.1,0.1,1.100002,4.873207,0.1,0.100037,0.100021,3.036827,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.099972,0.1,0.1,0.1,0.1,0.1,0.1,1.099978,0.1,0.1,0.1,0.1,0.1,4.109573,0.1,0.1,0.1,0.1,0.1,0.1,2.100002,0.1,0.1,0.1,0.100003,0.1,0.1,0.1
Topic9,82.771985,0.100002,0.100002,0.1,0.100004,7.184583,0.1,0.1,1.1,23.803176,0.1,1.1,0.100016,1.4715,2.099989,0.100018,0.1,0.1,8.766665,1.548147,1.099964,0.1,0.100002,0.1,0.100004,0.100021,0.100008,0.1,2.09999,2.099501,38.576808,0.1,0.100001,3.359742,1.662371,1.099985,1.1,0.1,0.100026,0.10003,...,0.100015,0.100006,1.099997,0.1,0.1,0.1,0.100006,1.177256,2.149303,0.100005,0.100003,0.100002,0.100011,1.1,0.100003,0.1,0.1,2.099991,0.1,0.100006,1.615184,0.100018,1.189781,0.100002,0.1,43.919924,1.1,0.1,1.1,0.100009,1.099961,1.1,6.455394,0.1,0.10001,0.1,4.099896,0.100001,1.1,0.100015


In [90]:
#Getting top 15 keywords

# Show top n keywords for each topic
def show_topics(vectorizer= countVectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=countVectorizer, lda_model=best_lda_model, n_words=15)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,great,democrat,presid,impeach,peopl,republican,noth,get,vote,countri,us,state,go,border,thank
Topic 1,great,us,presid,china,state,year,trade,get,mani,work,countri,go,make,american,honor
Topic 2,great,peopl,state,countri,job,big,america,thank,border,work,new,make,american,get,love
Topic 3,elijah,cum,baltimor,corker,convinc,million,fort,myer,hurricaneharvey,contribut,inspir,racist,tennesse,space,marsha
Topic 4,switzerland,davo,rememb,harbor,pearl,daytona,enjoy,meddl,america,wisconsin,dnc,interview,econom,collect,foxandfriend
Topic 5,first,epa,honor,w,bless,nick,enforc,l,enter,law,polic,california,god,antitrump,jurisdict
Topic 6,american,thank,great,honor,commit,night,see,,vote,florida,berni,state,everyon,forward,obamacar
Topic 7,great,tax,go,must,korea,countri,peopl,cut,today,bad,north,nation,news,test,everi
Topic 8,great,news,fake,get,us,state,job,time,never,american,border,presid,peopl,work,republican
Topic 9,great,peopl,democrat,fake,news,presid,countri,trump,year,time,want,get,us,border,big


In [92]:
#Predicting topic of a new piece of text (i.e. a new tweet made by Donald Trump)

#Tokenization
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

#data_words = list(sent_to_words(data))

#print(data_words[:1])



In [93]:
%sh python3 -m spacy download en

In [94]:
import spacy

In [95]:
#Lemmatization
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
#data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#print(data_lemmatized[:2])

###Implementation of the Topic Model

In [97]:
# Define function to predict topic for a given text document.
nlp = spacy.load('en', disable=['parser', 'ner'])

def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization

    # Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))

    # Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Step 3: Vectorize transform
    mytext_4 = countVectorizer.transform(mytext_3)

    # Step 4: LDA Transform
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()
    return topic, topic_probability_scores

# Predict the topic
mytext = ["I have instructed the United States Navy to shoot down and destroy any and all Iranian gunboats if they harass our ships at sea."]
topic, prob_scores = predict_topic(text = mytext)
print(prob_scores)

##Part 2: Getting stock prices of National Grid

In [99]:
national_grid_stock_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true').option("delimiter", ",").option("escape", "\"").load('/FileStore/tables/NGG.csv')

In [100]:
national_grid_stock_df.show()

###Data cleaning and rearrangement

In [102]:
#Retrieving the two columns from the existing tweet dataset (Data and tweet)
stock_tweet_df = trumpTweet_groupedDate[['date', 'content']]

In [103]:
stock_tweet_df['date'] =pd.to_datetime(stock_tweet_df.date)

In [104]:
#stock_tweet_df.sort_values(by = 'date')
stock_tweet_df.head()

Unnamed: 0,date,content
0,2018-01-01,HAPPY NEW YEAR! We are MAKING AMERICA GREAT AG...
1,2019-01-01,HAPPY NEW YEAR!pic.twitter.com/bHoPDPQ7G6-MEXI...
2,2020-01-01,"Get this straightened out, Governor @ GavinNew..."
3,2018-01-10,"Today, it was my great honor to sign a new Exe..."
4,2019-01-10,The Mainstream Media has NEVER been more disho...


In [105]:
Topics_df = df_document_topic[0:]

In [106]:
#Joining the two dataframes (tweets and topics)
Topic_stock_tweet_df = pd.concat([stock_tweet_df, df_document_topic], axis=1)

In [107]:
Topic_stock_tweet_df

Unnamed: 0,date,content,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
0,2018-01-01,HAPPY NEW YEAR! We are MAKING AMERICA GREAT AG...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.99,9
1,2019-01-01,HAPPY NEW YEAR!pic.twitter.com/bHoPDPQ7G6-MEXI...,0.29,0.00,0.00,0.00,0.00,0.00,0.00,0.16,0.00,0.55,9
2,2020-01-01,"Get this straightened out, Governor @ GavinNew...",0.61,0.00,0.00,0.00,0.00,0.38,0.00,0.00,0.00,0.00,0
3,2018-01-10,"Today, it was my great honor to sign a new Exe...",0.00,0.00,0.73,0.00,0.00,0.00,0.00,0.00,0.00,0.26,2
4,2019-01-10,The Mainstream Media has NEVER been more disho...,0.00,0.00,0.00,0.00,0.13,0.00,0.00,0.00,0.00,0.86,9
5,2020-01-10,"THANK YOU TOLEDO, OHIO!-Under my administratio...",0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.99,0.00,8
6,2018-01-11,The United States needs the security of the Wa...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.22,0.00,0.77,9
7,2019-01-11,Dear Diary...-Will be interviewed at the Borde...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.99,9
8,2020-01-11,Will be interviewed tonight by Laura @ Ingraha...,0.99,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0
9,2018-01-12,"Yesterday, I signed the # INTERDICTAct (H.R. 2...",0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,9


In [108]:
Topic_stock_tweet_df = Topic_stock_tweet_df.sort_values(by = 'date')

In [109]:
national_grid_pd = national_grid_stock_df.toPandas()

In [110]:
Topic_stock_tweet_df["Prices"] = ""

In [111]:
indx=0
for i in range (0,len(Topic_stock_tweet_df)):
    for j in range (0,len(national_grid_pd)):
        get_tweet_date=Topic_stock_tweet_df.date.iloc[i]
        get_stock_date=national_grid_pd.Date.iloc[j]
        if(get_stock_date==get_tweet_date):
            print(get_stock_date," ",get_tweet_date)
            Topic_stock_tweet_df.at(i,'Prices',national_grid_pd.Close[j])
            break

In [112]:
national_grid_pd['Date'] =pd.to_datetime(national_grid_pd.Date)
#type(national_grid_pd['Date'])

In [113]:
#Joining the Tweet_topic dataset with the stock price Dataset
Topic_stock_df = pd.merge(left=Topic_stock_tweet_df, left_on='date',
         right=national_grid_pd, right_on='Date', how = 'left')

In [114]:
Topic_stock_df.head()

Unnamed: 0,date,content,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic,Prices,Date,Open,High,Low,Close,Adj Close,Volume
0,2017-01-20,Thank you for joining us at the Lincoln Memori...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99,9,,NaT,,,,,,
1,2017-01-21,THANK YOU for another wonderful evening in Was...,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.87,9,,NaT,,,,,,
2,2017-01-22,Had a great meeting at CIA Headquarters yester...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,9,,NaT,,,,,,
3,2017-01-23,Busy week planned with a heavy focus on jobs a...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.94,9,,2017-01-23,63.810043,64.159386,63.722706,64.104805,54.674591,459000.0
4,2017-01-24,Will be meeting at 9:00 with top automobile ex...,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,,2017-01-24,63.799126,64.181221,63.777294,63.897381,54.497673,652000.0


In [115]:
Topic_stock_df_clean = Topic_stock_df.drop(Topic_stock_df.ix[:, 'Prices':'Low'].columns, axis = 1)

In [116]:
Topic_stock_df_clean = Topic_stock_df_clean.drop(columns = ['Adj Close', 'Volume'], axis = 1)

In [117]:
Topic_stock_df_clean.rename(columns={'Close': 'Prices'}, inplace=True)
Topic_stock_df_clean.head()

Unnamed: 0,date,content,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic,Prices
0,2017-01-20,Thank you for joining us at the Lincoln Memori...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99,9,
1,2017-01-21,THANK YOU for another wonderful evening in Was...,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.87,9,
2,2017-01-22,Had a great meeting at CIA Headquarters yester...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,9,
3,2017-01-23,Busy week planned with a heavy focus on jobs a...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.94,9,64.104805
4,2017-01-24,Will be meeting at 9:00 with top automobile ex...,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,63.897381


####Filling the empty stock price cells

In [119]:
#Forward filling the cells with stock price values i.e. Weekends are filled with Friday's values
Topic_stock_df_clean["Prices"].fillna( method ='ffill', inplace = True) 

In [120]:
Topic_stock_df_clean['Prices'] = Topic_stock_df_clean['Prices'].astype(float)

In [121]:
#Filling the rest of the empty cells with the mean value of the prices
Topic_stock_df_clean["Prices"].fillna(Topic_stock_df_clean.mean()['Prices'], inplace = True)

In [122]:
Topic_stock_df_clean["label"] = Topic_stock_df_clean["Prices"]

In [123]:
Topic_stock_df_clean.head()

Unnamed: 0,date,content,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic,Prices,label
0,2017-01-20,Thank you for joining us at the Lincoln Memori...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99,9,58.154025,58.154025
1,2017-01-21,THANK YOU for another wonderful evening in Was...,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.87,9,58.154025,58.154025
2,2017-01-22,Had a great meeting at CIA Headquarters yester...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,9,58.154025,58.154025
3,2017-01-23,Busy week planned with a heavy focus on jobs a...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.94,9,64.104805,64.104805
4,2017-01-24,Will be meeting at 9:00 with top automobile ex...,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,63.897381,63.897381


In [124]:
#Verifying skewness of the data
Topic_stock_df_clean.Prices.plot.hist(bins=15, alpha=0.5)

In [125]:
Topic_stock_df_clean.Prices.mean()

In [126]:
Topic_stock_df_clean.Prices.median()

In [127]:
Topic_stock_df_spark = spark.createDataFrame(Topic_stock_df_clean)

###Supervised Learning model for Price Prediction

In [129]:
#Random Forest Regressor

from pyspark.ml.feature import VectorAssembler

feature_list = []
for col in Topic_stock_df_clean.columns:
    if col == 'label' or col == 'Prices' or col == 'date' or col == 'content' or col == 'dominant_topic' or col == 'log_value' :
        continue
    else:
        feature_list.append(col)

assembler = VectorAssembler(inputCols=feature_list, outputCol="features")
print(feature_list)

In [130]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(labelCol="label", featuresCol="features")

In [131]:
#Pipline for assembler and randomforest regressor
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[assembler, rf])

In [132]:
#Configuring Grid Search Parameters
from pyspark.ml.tuning import ParamGridBuilder
import numpy as np

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [int(x) for x in np.linspace(start = 25, stop = 35, num = 5)]) \
    .addGrid(rf.maxDepth, [int(x) for x in np.linspace(start = 6, stop = 12, num = 5)]) \
    .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

In [133]:
#Configuring Crossvalidator with RandomForest Pipeline, Grid Searchm and Regression Performance evaluator to get the best model
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=3)

In [134]:
%sh pip install mlflow

In [135]:
(trainingData, testData) = Topic_stock_df_spark.randomSplit([0.8, 0.2])

In [136]:
#Training the model
cvModel = crossval.fit(trainingData)


In [137]:
#Error calculation on the dataset by the best model
predictions = cvModel.transform(testData)
rmse_testData = evaluator.evaluate(predictions)
print("The rmse after test is %f" %rmse_testData)

In [138]:
best_model = cvModel.bestModel.stages[-1]

In [139]:
type(best_model)

In [140]:
type(cvModel)

In [141]:
display(predictions.toPandas().head())

date,content,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic,Prices,label,features,prediction
2017-01-28T00:00:00.000+0000,"I promise that our administration will ALWAYS have your back. We will ALWAYS be with you!pic.twitter.com/D0aOWhOH4X-The failing @ nytimes has been wrong about me from the very beginning. Said I would lose the primaries, then the general election. FAKE NEWS!-Thr coverage about me in the @ nytimes and the @ washingtonpost gas been so false and angry that the times actually apologized to its.....-...dwindling subscribers and readers.They got me wrong right from the beginning and still have not changed course, and never will. DISHONEST-Today, we remember the crew of the Space Shuttle Challenger, 31 years later. # NeverForgetpic.twitter.com/OhshQsFRfl",0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.82,9,63.187775,63.187775,"List(0, 10, List(3, 9), List(0.17, 0.82))",56.866608537654045
2017-02-07T00:00:00.000+0000,"An extended interview from the Super Bowl with @ oreillyfactor airs tonight at 8:00 P.M. Enjoy!pic.twitter.com/kZdHqaNTVR-The failing @ nytimes was forced to apologize to its subscribers for the poor reporting it did on my election win. Now they are worse!-The threat from radical Islamic terrorism is very real, just look at what is happening in Europe and the Middle-East. Courts must act fast!-I don't know Putin, have no deals in Russia, and the haters are going crazy - yet Obama can make a deal with Iran, #1 in terror, no problem!-An honor having the National Sheriffs' Assoc. join me at the @ WhiteHouse. Incredible men & women who protect & serve 24/7/365. THANK YOU!!pic.twitter.com/9EMTnH0OrF",0.0,0.0,0.51,0.0,0.0,0.0,0.0,0.0,0.0,0.48,2,64.323143,64.323143,"List(0, 10, List(2, 9), List(0.51, 0.48))",56.85758360329523
2017-02-10T00:00:00.000+0000,"SEE YOU IN COURT, THE SECURITY OF OUR NATION IS AT STAKE!-LAWFARE: ""Remarkably, in the entire opinion, the panel did not bother even to cite this (the) statute."" A disgraceful decision!-The failing @ nytimes does major FAKE NEWS China story saying ""Mr.Xi has not spoken to Mr. Trump since Nov.14."" We spoke at length yesterday!",0.0,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.71,9,65.371178,65.371178,"List(0, 10, List(4, 9), List(0.27, 0.71))",58.953257715563865
2017-02-13T00:00:00.000+0000,Today I will meet with Canadian PM Trudeau and a group of leading business women to discuss women in the workforce. -Welcome to the @ WhiteHouse Prime Minister @ JustinTrudeau!pic.twitter.com/WKgF8Zo9ri-Wonderful meeting with Canadian PM @ JustinTrudeau and a group of leading CEO's & business women from Canadaand the United Statespic.twitter.com/Rxr31QpxMK,0.33,0.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,65.382095,65.382095,"List(0, 10, List(0, 1), List(0.33, 0.64))",58.30448101976918
2017-02-23T00:00:00.000+0000,'S&P 500 Edges Higher After Trump Renews Jobs Pledge',0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.89,0.01,8,66.539299,66.539299,"List(1, 10, List(), List(0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.89, 0.01))",63.25728396424427


In [142]:
rmse

##Visualizing the error and actual price vs predicted prices

In [144]:
import matplotlib.pyplot as plt

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)

rfPred = cvModel.transform(Topic_stock_df_spark)
rmse = evaluator.evaluate(rfPred)
rfResult = rfPred.toPandas()

plt.plot(rfResult.label, rfResult.prediction, 'bo')
plt.xlabel('Price')
plt.ylabel('Prediction')
plt.suptitle("Model Performance RMSE: %f" % rmse)
plt.show()

In [145]:
plt.figure(figsize=(10, 8))
plt.plot(rfResult['date'], rfResult['Prices'], 'b-', label = 'Actual Prices')
plt.plot(rfResult['date'], rfResult['prediction'], 'r-', label = 'Predicted Prices')
plt.xlabel('Date'); plt.ylabel('Stock Prices ($)'); plt.title('National Grid Energy Industry')
plt.legend();

##Final Implementation of Predicting Prices from Tweets

In [147]:
# Define function to predict topic for a given text document.
nlp = spacy.load('en', disable=['parser', 'ner'])

def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization

    # Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))

    # Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Step 3: Vectorize transform
    mytext_4 = countVectorizer.transform(mytext_3)

    # Step 4: LDA Transform
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()
    return topic, topic_probability_scores

# Predict the topic
mytext = ["In light of the attack from the Invisible Enemy, as well as the need to protect the jobs of our GREAT American Citizens, I will be signing an Executive Order to temporarily suspend immigration into the United States!"]
topic, prob_scores = predict_topic(text = mytext)
NewStock_df = pd.DataFrame(data=prob_scores, columns=["Topic0", "Topic1", "Topic2", "Topic3", "Topic4", "Topic5", "Topic6", "Topic7", "Topic8", "Topic9"])
print(topic)
print(prob_scores)
NewStock_df_spark = spark.createDataFrame(NewStock_df)
StockPred = cvModel.transform(NewStock_df_spark)
print("The predicted stock price of the Energy market is predicted to be : %f" %StockPred.toPandas().prediction)