# Data Description


Donald Trump, the 45th President of the United States, used Twitter like no one else. He primarily ran his government from a Twitter firehose. Twitter officially banned his account on January 8th, 2021, after the deadly riot at the Capitol on January 6th, 2021. Twitter cites its "World Leaders on Twitter: Principles and Approach" policy as the guide that public leaders are expected to adhere to.

Trump's tweets and policies have had more far-reaching effects than one might realize, or than he himself would admit. Since his Twitter account is suspended, there is no public way to read his past tweets and analyze them for public-policy outcomes or link them with global issues.

Here we present the complete treasure trove of President Trump's tweets, all 56,572 of them, for the public, data scientists and researchers.

The dataset contains 56,572 tweets, along with each tweet's ID, its date, and how many times it was liked and retweeted.

**Please upvote if you find this notebook helpful! 😊 Thank you! I would also be very happy to receive feedback on my work.**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#ignore warning messages
import warnings
warnings.filterwarnings('ignore')
import plotly
plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
from wordcloud import WordCloud,STOPWORDS

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

In [None]:
# Read the tweet archive into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../input/trumps-legacy/Trumps Legcy.csv")
df.head()

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Keep only the first occurrence of every (id, text) pair, then show the
# resulting (rows, columns) shape.
is_repeat = df.duplicated(subset=['id', 'text'], keep='first')
df = df[~is_repeat]
df.shape

In [None]:
# Number of whitespace-separated words in each tweet. The vectorised .str
# accessor leaves a missing text as NaN, whereas calling .split() on a float
# NaN inside a lambda raises AttributeError.
df['text_length'] = df['text'].str.split().str.len()

In [None]:
# Summary statistics of the text_length column.
df['text_length'].describe()

In [None]:
# Number of rows for each distinct value of the device column.
df['device'].value_counts()

In [None]:
# Histogram of the device column: one bar per device value.
px.histogram(
    df,
    x='device',
    title='Frequency of Tweets device',
    width=800,
    height=500,
)

In [None]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import pandas as pd
import gensim
import pyLDAvis.gensim
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')


# Fetch the WordNet corpus used by WordNetLemmatizer below.
nltk.download('wordnet')

# Token pattern: a run of characters that are neither digits (\d) nor
# non-word characters (\W), delimited by word boundaries.
pattern = r'\b[^\d\W]+\b'
tokenizer = RegexpTokenizer(pattern)
# English stop words from the stop_words package, and a shared lemmatizer.
en_stop = get_stop_words('en')
lemmatizer = WordNetLemmatizer()

import warnings
# Silence DeprecationWarning. NOTE(review): the first cell already calls
# warnings.filterwarnings('ignore') for every category, so this looks redundant.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Extra tokens to drop on top of the English stop words; they appear to be
# URL fragments and retweet markers (e.g. "https", "t", "co", "rt").
stop = set(["https", "t", "co", "u", "s", "rt"])

def converter(x):
    """Lower-case *x* and remove every word listed in ``en_stop``.

    The value is converted with ``str()`` first, so non-string input
    (numbers, NaN, None) is processed as its string form rather than
    rejected.  Words are lower-cased *before* the stop-word test so that
    capitalised stop words such as "The" are removed as well.

    Returns the remaining words joined by single spaces.
    """
    lowered_words = str(x).lower().split()
    return ' '.join(word for word in lowered_words if word not in en_stop)
    
    
def lematize(x):
    """Lemmatize each whitespace-separated word of *x*.

    Every word is passed to ``lemmatizer.lemmatize`` on its own; passing the
    whole string in one call treats a multi-word text as a single word.

    Returns the lemmatized words joined by single spaces, or ``None`` when
    *x* has no ``split`` method (for example ``None`` or a float NaN).
    """
    try:
        words = x.split()
    except AttributeError:
        return None  # not a string-like value
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

def converterCustom(x):
    """Lower-case *x* and remove every word contained in the ``stop`` set.

    The value is converted with ``str()`` first, so non-string input is
    processed as its string form.  Words are lower-cased *before* the
    membership test so that variants such as "RT" or "Https" are removed too.

    Returns the remaining words joined by single spaces.
    """
    lowered_words = str(x).lower().split()
    return ' '.join(word for word in lowered_words if word not in stop)

In [None]:
# Strip the English stop words first, then the extra custom tokens, and keep
# the cleaned text in its own column.
df['text_without_stopwords'] = df['text'].apply(converter).apply(converterCustom)

In [None]:
# Build one list of cleaned tokens per tweet for the topic model.
# A set gives constant-time membership tests for the per-token stop-word check.
en_stop_set = set(en_stop)

texts = []
# Iterate the Series directly: Series.iteritems() was removed in pandas 2.0,
# and only the values (not the index labels) are needed here.
for document in df['text_without_stopwords']:
    # lower-case and tokenize the document string
    words = tokenizer.tokenize(str(document).lower())

    # remove stop words from the tokens
    kept_words = [word for word in words if word not in en_stop_set]

    # lemmatize the remaining tokens
    lemmas = [lemmatizer.lemmatize(word) for word in kept_words]

    # drop single-character tokens and store the result
    texts.append([lemma for lemma in lemmas if len(lemma) != 1])

# sample data
# print(texts[0])

df.head()

# Topic Modeling
## Latent Dirichlet allocation (LDA)

In [None]:
# Build the id <-> term dictionary from the tokenized tweets, then encode
# every tweet as a bag-of-words vector to form the document-term matrix.
dictionary = corpora.Dictionary(documents=texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
import pprint

# Fit a 10-topic LDA model with 20 passes over the corpus, then pretty-print
# top_topics() limited to the 5 leading terms of each topic.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=20,
)
pprint.pprint(ldamodel.top_topics(corpus=corpus, topn=5))

In [None]:
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Prepare the pyLDAvis data for the fitted LDA model, its corpus and dictionary.
vis = pyLDAvis.gensim.prepare(topic_model=ldamodel, corpus=corpus, dictionary=dictionary)
# Enable notebook output for pyLDAvis and display the prepared visualisation.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

# Word Cloud

In [None]:
 def showWordCloud(data):
     """Draw a word cloud of all the texts in *data*.

     Parameters:
         data: iterable of strings; they are joined with spaces into one
             text before the cloud is generated.

     The cloud ignores wordcloud's default STOPWORDS plus a few extra
     tokens ("https", "t", "co", "u", "s", "rt").  The figure is shown
     with plt.show(); nothing is returned.
     """
     # Build a new set rather than calling STOPWORDS.update(): the update
     # would permanently modify the module-level set shared by every other
     # user of wordcloud.STOPWORDS.
     ignored_words = set(STOPWORDS).union(["https", "t", "co", "u", "s", "rt"])
     # split() followed by join() collapses every run of whitespace to one space.
     joined_text = " ".join(' '.join(data).split())
     wordcloud = WordCloud(
         stopwords=ignored_words,
         background_color='black',
         width=2500,
         height=2500,
     ).generate(joined_text)
     plt.figure(1, figsize=(13, 13))
     plt.imshow(wordcloud)
     plt.axis('off')
     plt.show()

# Draw the word cloud from the stop-word-filtered tweet texts.
showWordCloud(df['text_without_stopwords'])

# Sentiment Analysis

In [None]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier


def _polarity(text):
    """Return TextBlob's sentiment polarity for *text*."""
    return TextBlob(text).sentiment.polarity


# Score every stop-word-filtered tweet and preview the first five rows.
df['sentiment'] = df['text_without_stopwords'].map(_polarity)
df.head(5)

In [None]:
# sns.distplot(df.sentiment);
# plt.title("Distribution of sentiment polarity of comments");

In [None]:
# Histogram of the per-tweet polarity scores. px.histogram plots raw counts
# unless histnorm is set, so the y-axis is labelled "Count" rather than
# "Density"; the title says "tweets" because that is what the rows are.
fig = px.histogram(df, x="sentiment", width=800, height=500)
fig.update_layout(
    title_text='Distribution of sentiment polarity of tweets',
    xaxis_title_text='sentiment',
    yaxis_title_text='Count',
)
fig.show()

In [None]:
 def label(x):
     """Map a polarity score to a sentiment label.

     Parameters:
         x: numeric polarity score.

     Returns:
         'POSITIVE' for x > 0, 'NEGATIVE' for x < 0, 'NEUTRAL' for x == 0,
         and None when x is missing (None or NaN).
     """
     # NaN is the only value that is not equal to itself.
     if x is None or x != x:
         return None
     if x > 0.0:
         return 'POSITIVE'
     if x < 0.0:
         return 'NEGATIVE'
     return 'NEUTRAL'

# Attach the sentiment label for every polarity score and preview the result.
df['sentimentLabel'] = df['sentiment'].map(label)
df.head(5)

In [None]:
# Bar chart of how many tweets fall under each sentiment label. The bars are
# raw counts, so the y-axis is labelled "Count" rather than "Density"; the
# title says "tweets" because that is what the rows are.
fig = px.histogram(df, x="sentimentLabel", width=800, height=500)
fig.update_layout(
    title_text='Distribution of sentiment of tweets',
    xaxis_title_text='sentiment',
    yaxis_title_text='Count',
)
fig.show()

# t-SNE

Rebuild the LDA model with some extra inputs and a reduced number of topics.

In [None]:
# Refit LDA with 8 topics and 5 passes; minimum_probability=0 asks the model
# not to filter out low-probability topics when a document is queried.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=8,
    id2word=dictionary,
    passes=5,
    minimum_probability=0,
)
ldamodel.print_topics()

Refactoring the results of LDA into a NumPy matrix (number_of_tweets x number_of_topics).

In [None]:
# Dense (n_documents x n_topics) matrix of per-document topic probabilities.
# gensim floors minimum_probability at 1e-8, so a document can come back with
# fewer than num_topics entries. Filling a zero matrix by topic id keeps every
# row the same width instead of relying on each result listing every topic.
hm = np.zeros((len(corpus), ldamodel.num_topics), dtype=np.float32)
for row, bow in enumerate(corpus):
    for topic_id, probability in ldamodel[bow]:
        hm[row, topic_id] = probability
hm

In [None]:
from sklearn.manifold import TSNE

# Embed the topic matrix with t-SNE and store the two output coordinates as
# x / y; hue holds the column index of the largest value in each row of hm.
tsne = TSNE(early_exaggeration=120, perplexity=30)
embedding = pd.DataFrame(tsne.fit_transform(hm), columns=['x', 'y'])
embedding['hue'] = hm.argmax(axis=1)

In [None]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
output_notebook()

# One entry per embedded point: position, colour picked from the 8-colour
# Set1 palette by the hue index (so hue values must be below 8), plus the
# tweet text and date shown in the hover tooltip.
source = ColumnDataSource(
        data=dict(
            x = embedding.x,
            y = embedding.y,
            colors = [all_palettes['Set1'][8][i] for i in embedding.hue],
            title = df.text,
            year = df.date,
            alpha = [0.9] * embedding.shape[0],
            size = [7] * embedding.shape[0]
        )
    )
# The hover tool is bound to the scatter renderer further down: the `names`
# filter that HoverTool used to accept was removed in Bokeh 3.0.
hover_tsne = HoverTool(tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Text:</span>
            <span style="font-size: 12px">@title</span>
            <span style="font-size: 12px; font-weight: bold;">Date:</span>
            <span style="font-size: 12px">@year</span>
        </div>
    </div>
    """)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
# width/height replace plot_width/plot_height, which Bokeh 3.0 removed.
plot_tsne = figure(width=700, height=700, tools=tools_tsne, title='Tweets')
tweet_points = plot_tsne.circle('x', 'y', size='size', fill_color='colors',
                                alpha='alpha', line_alpha=0, line_width=0.01,
                                source=source, name="df")
# Limit the tooltip to the tweet points, as names=["df"] did before.
hover_tsne.renderers = [tweet_points]

layout = column(plot_tsne)
show(layout)



## Work in progress: trying to add more interesting stuff. If you like my work, please upvote.