In [None]:
import numpy as np 
import pandas as pd 
import re
import torch
import seaborn as sns
import warnings
import matplotlib.pyplot as plt 
%matplotlib inline

from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from transformers import (GPT2Config,GPT2LMHeadModel,GPT2Tokenizer)
from string import punctuation as pnc
from collections import Counter
from wordcloud import WordCloud

# Notebook-wide settings: dark-grid seaborn theme, no cap on the number of
# DataFrame columns displayed, and all Python warnings silenced.
sns.set_style(style='darkgrid')
pd.set_option('display.max_columns', None)
warnings.filterwarnings(action='ignore')

In [None]:
!pip install transformers

In [None]:
''' reading dataset '''
df = pd.read_csv('tweets.csv')

In [None]:
''' displaying 5 rows '''
df.head()

In [None]:
''' checking null values '''
df.isna().sum()

In [None]:
''' taking only 3 features '''
df = df[['handle','text','is_retweet']]

''' displaying rows '''
df.head()

In [None]:
# Number of tweets per account handle.
handle_label = df['handle'].value_counts()

# Bar plot of the tweet count for each handle.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally, so the positional form fails there.
plt.figure(figsize=(5, 5))
sns.barplot(x=handle_label.index, y=handle_label.values);
plt.xlabel('Handle', fontsize=20)
plt.ylabel('Count', fontsize=20);

In [None]:
# Number of tweets for each value of the `is_retweet` flag.
is_retweet_label = df.is_retweet.value_counts()

# Bar plot of the counts per flag value.
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally, so the positional form fails there.
plt.figure(figsize=(5, 5))
sns.barplot(x=is_retweet_label.index, y=is_retweet_label.values);
plt.xlabel('isretweet', fontsize=20)
plt.ylabel('Count', fontsize=20);

In [None]:
''' separrting donal and hillary tweets '''
realDonaldTrump = df[df.handle == 'realDonaldTrump']
hillaryClinton = df[df.handle == 'HillaryClinton']

In [None]:
''' donald tweet '''
realDonaldTrump.head()

In [None]:
''' hillary clinton tweets '''
hillaryClinton.head()

In [None]:
def get_word_cloud(df, c):
    """Draw a word cloud built from every value in column ``c`` of ``df``.

    Each value is converted to ``str``, lower-cased and split on whitespace;
    all resulting words are joined with single spaces into one corpus that is
    passed to ``WordCloud.generate``. The wordcloud ``STOPWORDS`` set is used
    as the stop-word list.

    Parameters
    ----------
    df : object indexable as ``df[c]`` whose result is iterable
        (a pandas DataFrame in this notebook).
    c : label of the column holding the text.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    s_word = set(STOPWORDS)

    # A single join over all words puts a space between consecutive tweets as
    # well, so the last word of one tweet is not fused with the first word of
    # the next one.
    cm = " ".join(
        word
        for sent in df[c]
        for word in str(sent).lower().split()
    )

    word_cloud = WordCloud(width=800, height=400, background_color='black', stopwords=s_word,
                           min_font_size=10).generate(cm)

    plt.figure(figsize = (10, 10), facecolor = None)
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()

In [None]:
''' hillary clinton '''
get_word_cloud(hillaryClinton,'text')

In [None]:
''' donal trump '''
get_word_cloud(realDonaldTrump, 'text')

In [None]:
# Extract the words that start with a given prefix ("@" mentions by default).
def extract_words(df, c, prefix='@', limit=10):
    """Print the first ``limit`` words in column ``c`` that start with ``prefix``.

    Every value of ``df[c]`` is converted to ``str`` (so non-string cells such
    as NaN do not raise) and split on whitespace; words beginning with
    ``prefix`` are collected in row order.

    Parameters
    ----------
    df : object indexable as ``df[c]`` whose result has ``.tolist()``
        (a pandas DataFrame in this notebook).
    c : label of the column holding the text.
    prefix : str or tuple of str, leading characters to match. Default ``'@'``.
    limit : int, number of collected words to print. Default ``10``.

    Returns
    -------
    None. The selected words are printed as a list.
    """
    words = [
        word
        for value in df[c].tolist()
        for word in str(value).split()
        if word.startswith(prefix)
    ]

    print(words[:limit])

In [None]:
extract_words(realDonaldTrump, 'text')

In [None]:
extract_words(hillaryClinton, 'text')

In [None]:
# Extract the words that start with "#".
def extract_words_(df, c):
    """Print the first ten whitespace-separated words in column ``c`` of
    ``df`` that begin with '#'. Returns None."""
    hashtags = []
    for tweet in df[c].tolist():
        hashtags.extend(piece for piece in tweet.split() if piece.startswith('#'))

    print(hashtags[:10])

In [None]:
extract_words_(realDonaldTrump, 'text')

In [None]:
extract_words_(hillaryClinton, 'text')

In [None]:
# Extract the words that start with an em dash ("—").
# NOTE: this reuses the name `extract_words_`, replacing the "#" version
# defined in an earlier cell.
def extract_words_(df, c):
    """Print the first ten whitespace-separated words in column ``c`` of
    ``df`` that begin with '—'. Returns None."""
    dashed = [
        word
        for entry in df[c].tolist()
        for word in entry.split()
        if word.startswith('—')
    ]

    print(dashed[:10])

In [None]:
extract_words_(realDonaldTrump, 'text')

In [None]:
extract_words_(hillaryClinton, 'text')

In [None]:
''' Let's see tweets where Hillary mentioned herself '''

''' converting every into small letter '''
hillaryClinton['tweet_lower'] = hillaryClinton['text'].str.lower()

''' getting tweets '''
hillaryClinton[hillaryClinton['tweet_lower'].str.contains('hillary')]['text'].head()

In [None]:
''' Let's see tweets where Trump mentioned himself '''

''' converting every into small letter '''
realDonaldTrump['tweet_lower'] = realDonaldTrump['text'].str.lower()

''' getting tweets '''
realDonaldTrump[realDonaldTrump['tweet_lower'].str.contains('trump')]['text'].head()

In [None]:
# Remove every word that starts with "@", "#" or an em dash ("—").
def remove_tags(t):
    """Return ``t`` without the space-separated words that begin with
    '@', '#' or '—'.

    The text is split on the single space character only, so empty pieces
    produced by consecutive spaces are kept and the remaining spacing is
    reproduced when the pieces are joined back together.
    """
    tag_prefixes = ("@", "#", "—")
    kept = []
    for piece in t.split(" "):
        if piece.startswith(tag_prefixes):
            continue
        kept.append(piece)
    return " ".join(kept)

In [None]:
# Text preprocessing: strip URLs, drop newline characters, then remove the
# words that `remove_tags` filters out.
def _clean_tweets(tweets):
    """Return a cleaned copy of the string Series ``tweets``.

    Steps, in order: delete URLs (case-insensitive match on ``http...`` or
    ``www....``), delete newline characters, apply ``remove_tags`` to each value.
    """
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave every URL in place.
    # The raw string keeps `\S` / `\.` as regex escapes.
    without_urls = tweets.str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
    without_newlines = without_urls.str.replace('\n', '', regex=False)
    return without_newlines.map(remove_tags)


# Hillary Clinton
hillaryClinton['text_prepro'] = _clean_tweets(hillaryClinton['text'])

# Donald Trump
realDonaldTrump['text_prepro'] = _clean_tweets(realDonaldTrump['text'])

In [None]:
hillaryClinton.head()

In [None]:
realDonaldTrump.head()

In [None]:
# Hold out 5% of Hillary Clinton's preprocessed tweets.
# random_state is fixed so the split is identical on every run.
X_train, X_test = train_test_split(hillaryClinton['text_prepro'], test_size=0.05, random_state=42)

In [None]:
!pip install simpletransformers==0.32.3

In [None]:
""" Training the Model. We will finetune GPT2 Model(Simple Transformer) using the Hillary's Tweets """
from simpletransformers.language_modeling import LanguageModelingModel

In [None]:
# Options passed to the simpletransformers language model.
args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 10,
    "train_batch_size": 32,
    "mlm": False,
    "dataset_type": "simple",
    "block_size": 24,
    "max_seq_length": 24,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 50,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "early_stopping_patience": 2,
    "use_early_stopping": True,
    # The key previously had a trailing space ("save_optimizer_and_scheduler "),
    # so it did not match the option name and the setting had no effect.
    "save_optimizer_and_scheduler": False,
    "fp16": False,
}

# GPT-2 architecture initialised from the pretrained 'gpt2' checkpoint.
# use_cuda follows GPU availability so the cell also runs on a CPU-only machine
# (the constructor otherwise expects CUDA to be present).
model = LanguageModelingModel(
    'gpt2',
    'gpt2',
    args=args,
    use_cuda=torch.cuda.is_available(),
)

In [None]:
config, model, tokenizer = GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

In [None]:
best_model  = model.from_pretrained('gpt2')

In [None]:
texts = ["I will reduce Gun violence.","Donald will build a wall","I will make our health care system better",
        "Come rally with us","America is in financial stress","We have to preserve secularism", "We will win the election"]

token = tokenizer.from_pretrained('gpt2')

for text in texts:
    enc_prompt = token.encode(texts, add_special_tokens=False, return_tensors="pt")
    gen = model.generate(encoded_prompt,max_length = 128, num_beams = 2, repetition_penalty = 5.0,verbose=False)
    gen = gen.tolist()[0]
    text = token.decode(gen, clean_up_tokenization_spaces=True)
    print(".".join(text.split(".")[:3]))