# Simple Cosine Similarity Analysis on Jair Bolsonaro tweets

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show which files are available in the ../input data directory
print(os.listdir("../input"))

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from IPython.display import Image

In [None]:
# Load the tweets CSV into a DataFrame (no date parsing is requested here)
df = pd.read_csv('../input/bolsonaro_tweets.csv')

In [None]:
# (rows, columns) of the DataFrame as loaded
df.shape

In [None]:
# Peek at the first rows of the loaded data
df.head()

In [None]:
# Keep only tweets dated 2018-10-28 or later, the day Bolsonaro was elected
# President of Brazil. .copy() detaches the result from the original frame.
# NOTE(review): the comparison is against a string literal, so this presumably
# relies on 'date' holding ISO-formatted (YYYY-MM-DD...) strings - confirm.
# The original index labels are kept (no reset_index).
df = df[df['date'] >= '2018-10-28'].copy()

In [None]:
# (rows, columns) of the DataFrame at this point
df.shape

In [None]:
# Peek at the first rows of the DataFrame at this point
df.head()

# Cleaning the text

Let's do some cleaning on the text before building word clouds and computing TF-IDF cosine similarities.

In [None]:
def clean_df(df_clean, *, remove_names=False, remove_usernames=False,
             stop_words=None):
    """Normalize the tweet text in ``df_clean['text']`` for bag-of-words analysis.

    The frame is modified in place: the untouched text is saved to a new
    ``original_text`` column and ``text`` is overwritten with the cleaned
    version. Cleaning steps, in order: lower-casing, optional @username
    removal, link removal, punctuation removal, stopword removal, optional
    removal of common names, digit removal, removal of words of 1-3
    characters, and stripping of accents / non-ASCII characters.

    Args:
        df_clean: DataFrame with a string column named ``text``.
        remove_names: If True, drop tokens listed (one per line) in a
            ``nomes.txt`` file located in the current directory.
        remove_usernames: If True, drop ``@username`` mentions entirely.
            Otherwise only the ``@`` is removed by the punctuation step.
        stop_words: Iterable of lower-case stopwords to drop. Defaults to
            NLTK's Portuguese stopword list (requires the NLTK corpus).

    Returns:
        The same ``df_clean`` object, for convenient chaining.
    """
    original = df_clean['text']

    # Lower case (this also collapses every whitespace run, including
    # newlines, into a single space)
    text = original.apply(
        lambda tweet: " ".join(token.lower() for token in tweet.split()))

    # Remove usernames
    if remove_usernames:
        text = text.str.replace(r'@\S+', '', regex=True)

    # Remove links. Only the URL token itself is dropped: a greedy '.*' would
    # also delete every word that follows the first link in the tweet.
    text = text.str.replace(r'https?://\S+', '', regex=True)

    # Remove punctuation. regex=True is explicit because pandas >= 2.0 treats
    # the pattern as a literal string by default.
    text = text.str.replace(r'[^\w\s]', '', regex=True)

    # Remove stopwords (a set gives O(1) membership tests per token)
    if stop_words is None:
        from nltk.corpus import stopwords
        stop_words = stopwords.words('portuguese')
    stop_set = set(stop_words)
    text = text.apply(
        lambda tweet: " ".join(
            token for token in tweet.split() if token not in stop_set))

    # Remove common brazilian names
    if remove_names:
        nomes = pd.read_csv('nomes.txt', encoding='latin', header=None)
        name_set = set(nomes[0].str.lower())
        text = text.apply(
            lambda tweet: " ".join(
                token for token in tweet.split() if token not in name_set))

    # Remove numbers
    text = text.str.replace(r'\d+', '', regex=True)

    # Remove words with 1-3 chars
    text = text.str.replace(r'\b\w{1,3}\b', '', regex=True)

    # Replace accents and ç: decompose to base letter + combining mark, then
    # drop everything that is not plain ASCII
    text = text.str.normalize('NFKD')\
        .str.encode('ascii', errors='ignore')\
        .str.decode('utf-8')

    # Write back only once every step has succeeded, so a failure midway
    # (e.g. a missing nomes.txt) does not leave the frame half-cleaned.
    df_clean['original_text'] = original
    df_clean['text'] = text

    return df_clean

In [None]:
# Clean the tweet text; clean_df writes to the passed frame and returns it,
# so the reassignment keeps the same name bound to the cleaned data
df = clean_df(df)
df.head()

We see that some tweets became empty because they contained only emoji. We won't bother removing these rows, as our libraries won't take them into consideration anyway. A future idea is to replace each emoji with a word that describes it.

# Word clouds

We're going to use [this](https://github.com/amueller/word_cloud) word cloud library to provide a beautiful visualization. I will keep the background _white_ in the _before_ dataframe and **dark** in the **after** dataframe just to help us visualize.

In [None]:
# Concatenate every cleaned tweet into a single corpus string
text = " ".join(df['text'])

# Build the cloud on a large white canvas
wordcloud = WordCloud(width=3000, height=2000, background_color='white').generate(text)

# Draw it on an oversized figure with no axes and no padding
fig = plt.figure(figsize=(40, 30))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

## Most common words

In [None]:
# Fit a bag-of-words vocabulary on the cleaned tweets and build the
# document-term count matrix from them
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['text'])

In [None]:
# Build a (word, count) table sorted from most to least frequent.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
word_count = pd.DataFrame(
    cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names(),
    columns=["word"])
# Column sums give the corpus-wide count per word; asarray + ravel flattens
# the result whether the sparse sum comes back as a 1 x n matrix or a 1-D array.
word_count["count"] = np.asarray(count_matrix.sum(axis=0)).ravel()
word_count = word_count.sort_values("count", ascending=False).reset_index(drop=True)

In [None]:
# Bar chart of the 10 most frequent words, with the count written above each bar.
# pd.Series.from_array was removed in pandas 1.0; the column slice is already
# a Series, so it can be plotted directly.
freq_series = word_count['count'].iloc[:10]

x_labels = word_count['word'].iloc[:10]

plt.figure(figsize=(12, 8))
ax = freq_series.plot(kind='bar')
ax.set_xticklabels(x_labels)

rects = ax.patches
labels = word_count['count'].iloc[:10]

for rect, label in zip(rects, labels):
    height = rect.get_height()
    # Centre the label horizontally on the bar, slightly above its top
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, str(label),
            ha='center', va='bottom')

In [None]:
# Peek at the first rows of the DataFrame at this point
df.head()

In [None]:
# TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary from the cleaned tweets and build the TF-IDF matrix;
# row i of X corresponds to the i-th row of df by position
X = tfidf_vectorizer.fit_transform(df['text'])

In [None]:
# Show the first 10 vocabulary terms. get_feature_names() was removed in
# scikit-learn 1.2; prefer get_feature_names_out() and fall back on old releases.
# str() keeps the printed list plain regardless of the returned element type.
print([str(term) for term in (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, "get_feature_names_out")
    else tfidf_vectorizer.get_feature_names())[0:10]])

In [None]:
# Dimensions of the TF-IDF matrix built from df['text']
print(X.shape)

In [None]:
# Similarity of the first tweet against every tweet; X[0:1] keeps the row 2-D
cosine_similarity(X[0:1], X).shape

In [None]:
#fig, ax = plt.subplots(figsize=(20, 20))
# Drop self-correlations
#dropSelf = np.zeros_like(cos_sim)
#dropSelf[np.triu_indices_from(dropSelf)] = True
# Generate Color Map
#colormap = sns.diverging_palette(220, 10, as_cmap=True)
#sns.heatmap(cos_sim,mask=dropSelf,cmap=colormap)

In [None]:
def find_similar(tfidf_matrix, index, top_n = 5):
    """Return the ``top_n`` rows of ``tfidf_matrix`` most similar to row ``index``.

    Similarity is the cosine similarity between the query row and every row
    of the matrix. The query row itself is excluded from the result.

    Returns:
        A list of ``(row_position, similarity)`` tuples ordered from most to
        least similar. Positions refer to rows of ``tfidf_matrix``.
    """
    # Slice (rather than index) so the query stays a 2-D single-row matrix
    query_row = tfidf_matrix[index:index+1]
    scores = cosine_similarity(query_row, tfidf_matrix).flatten()

    # Row positions ordered by descending score
    ranked_positions = scores.argsort()[::-1]

    neighbours = []
    for position in ranked_positions:
        if position == index:
            continue
        neighbours.append((position, scores[position]))
    return neighbours[0:top_n]

In [None]:
# Tweet at row position 100. .iloc is used because df keeps its original index
# labels after the date filter, while rows of X are addressed by position.
df['text'].iloc[100]

In [None]:
# find_similar returns row positions within X, so the matching tweets must be
# looked up positionally (.iloc); df's index labels were not reset after filtering.
for index, score in find_similar(X, 100):
    print("{} - {}".format(score, df['text'].iloc[index]))

In [None]:
# Tweet at row position 25. .iloc is used because df keeps its original index
# labels after the date filter, while rows of X are addressed by position.
df['text'].iloc[25]

In [None]:
# find_similar returns row positions within X, so the matching tweets must be
# looked up positionally (.iloc); df's index labels were not reset after filtering.
for index, score in find_similar(X, 25):
    print("{} - {}".format(score, df['text'].iloc[index]))