In [2]:
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Tokenizer, RegexTokenizer

import googletrans
from googletrans import Translator

import pandas as pd

In [3]:
import os
import sys
# Make PySpark use the same Python interpreter that is running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [4]:
import findspark
# Initialise findspark before importing the Spark entry points below.
# NOTE(review): pyspark modules are already imported in the first cell of this
# notebook, i.e. before this call runs — confirm the ordering is intended.
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [5]:
# Start the SparkContext for this notebook under the application name 'Projeto4'.
# NOTE(review): re-running this cell in a live kernel presumably fails because a
# context already exists — confirm.
sc2 = SparkContext(appName = 'Projeto4')

In [6]:
# Get (or create) the SparkSession, requesting a local master with one thread.
# NOTE(review): a SparkContext was already created as `sc2` in the previous cell;
# getOrCreate() presumably reuses it, so the master setting here may have no
# effect — confirm.
spSession2 = SparkSession.builder.master("local[1]").getOrCreate()

In [7]:
def removePattern(inputText, pattern):
    """Remove every substring of ``inputText`` that matches ``pattern``.

    Args:
        inputText: Text to clean.
        pattern: Regular expression (string or compiled pattern) describing
            what has to be dropped.

    Returns:
        ``inputText`` with all non-overlapping matches of ``pattern`` deleted.
    """
    # Substitute with the pattern itself. Feeding each matched *text* back into
    # re.sub as if it were a regex misreads metacharacters ('.', '+', '(' ...)
    # and also deletes a match that is only the prefix of a longer token
    # (e.g. '@foo' inside '@foobar').
    return re.sub(pattern, '', inputText)

def cleanTweet(txt):
    """Strip Twitter-specific noise from a tweet and keep only ASCII letters.

    The steps run in this order: retweet prefixes, user handles, URLs, and
    finally every run of non-letter characters is collapsed into one space.

    Args:
        txt: Raw tweet text.

    Returns:
        The cleaned text. A single leading/trailing space may remain where
        removed content used to be.
    """
    # Each pattern is applied directly with re.sub (raw strings, so '\w' is not
    # an invalid Python escape). Matched text is never re-used as a regex, so a
    # handle such as '@foo' cannot eat the beginning of '@foobar'.

    # Remove retweet prefixes (RT @xxx:)
    txt = re.sub(r'RT @[\w]*:', '', txt)
    # Remove twitter handles (@xxx)
    txt = re.sub(r'@[\w]*', '', txt)
    # Remove URL links (httpxxx)
    txt = re.sub(r'https?://[A-Za-z0-9./]*', '', txt)
    # Replace special characters, numbers and punctuation with a single space
    txt = re.sub(r'[^A-Za-z]+', ' ', txt)
    return txt

def getCleanTweetText(filteredTweetText):
    """Rebuild a single string from a sequence of tokens.

    Args:
        filteredTweetText: Iterable of string tokens.

    Returns:
        The tokens concatenated with one space between consecutive items.
    """
    separator = ' '
    cleanText = separator.join(filteredTweetText)
    return cleanText

# Analyzer shared by all calls made in the same Python process; created lazily.
_vaderAnalyzer = None


def getSentimentScore(tweetText):
    """Return the VADER compound polarity score of ``tweetText``.

    Args:
        tweetText: Text to score.

    Returns:
        The ``'compound'`` entry of the analyzer's polarity scores, as a float.
    """
    global _vaderAnalyzer
    if _vaderAnalyzer is None:
        # Build the analyzer only once: this function is used as a row-level
        # UDF, and constructing a new SentimentIntensityAnalyzer for every
        # tweet repeats the same setup work for each row.
        _vaderAnalyzer = SentimentIntensityAnalyzer()
    vs = _vaderAnalyzer.polarity_scores(tweetText)
    return float(vs['compound'])

def getSentiment(score):
    """Binarise a sentiment score.

    Args:
        score: Numeric sentiment score.

    Returns:
        1 when ``score`` is strictly greater than zero, otherwise 0.
    """
    if score > 0:
        return 1
    return 0


def getTweetArray(tweet):
    """Split a tweet into pieces on single space characters.

    Args:
        tweet: Text to split.

    Returns:
        List of the substrings found between spaces; consecutive spaces
        produce empty strings in the list.
    """
    delimiter = ' '
    pieces = tweet.split(delimiter)
    return pieces

In [8]:
# Load the raw tweets CSV into a Spark DataFrame.
dados2 = (spSession2.read
      .option("multiline", "true") # some records span more than one line
      .option("quote", '"') # text fields (which contain commas) are wrapped in double quotes so they do not clash with the delimiter
      .option("header", "true") # take the column names from the header row
      .option("escape", '"') # use the double quote as the escape character for special characters
      .csv('dados/chatgpt1.csv')
)

In [9]:
# Print the first rows with full, untruncated column values.
dados2.show(truncate = False)

+-------------------------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+--------------------------------------------------------------+-----------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------+----------+------------+---------+----------+-------------------+--------+------------------------------------------------------------------------

In [10]:
# Collect the Spark DataFrame into a pandas DataFrame for the language handling below.
pandasDF = dados2.toPandas()

In [25]:
# Row count of the collected frame.
# NOTE(review): this cell carries execution count 25, i.e. it was run out of
# order relative to its position in the notebook.
len(pandasDF)

50001

In [11]:
# Distinct language codes present in the data.
pandasDF['Language'].unique()

array(['ja', 'en', 'de', 'tr', 'pl', 'fr', 'es', 'pt', 'lo', 'no', 'ca',
       'zh', 'qme', 'th', 'ne', 'ko', 'und', 'nl', 'fa', 'it', 'da', 'fi',
       'eu', 'hi', 'ar', 'sv', 'in', 'ru', 'qht', 'tl', 'hu', 'cs', 'uk',
       'iw', 'et', 'cy', 'bg', 'ht', 'el', 'vi', 'sl', 'kn', 'ro', 'lt',
       'ur', 'zxx', 'ml', 'mr', 'lv', 'gu', 'qam', 'is', 'ta', 'te', 'pa',
       'sd', 'am', 'sr', 'hy', 'or', 'bn'], dtype=object)

In [12]:
# 32076 tweets are in English
len(pandasDF.loc[pandasDF['Language'] == 'en', ['Text', 'Language']])

32076

In [13]:
# Keep the non-English tweets (text and language only) in a separate dataframe
pandas_text = pandasDF.loc[pandasDF['Language'] != 'en', ['Text', 'Language']]
pandas_text

Unnamed: 0,Text,Language
0,ChatGPTで遊ぶの忘れてた！！\n書類作るコード書いてみてほしいのと、\nどこまで思考整...,ja
2,"Schaut Euch an, was @fobizz @DianaKnodel alles...",de
4,"Profilinde vatan, Türkiye falan yazan bireyler...",tr
5,ChatGPT’nin bilinmeyen arka planı: Ucuz iş güc...,tr
12,最新コメント15件（01/22 22:45）\n\n【1月17日 登録記事】AzureでCh...,ja
...,...,...
49994,@FedeMoctezuma 5 cosas que puedes hacer con C...,es
49996,"#ChatGPT ist ein #Chatbot, der durch künstlich...",de
49997,@r8r Ich hab mal die AI dazu befragt (ChatGPT)...,de
49998,5 minuti di #chatGPT e ho capito che apprende ...,it


In [14]:
# Confirm the type of the filtered selection.
type(pandas_text)

pandas.core.frame.DataFrame

In [15]:
# Language codes known to googletrans, used to check that every tweet language is supported
google_list = list(googletrans.LANGUAGES)

In [16]:
# Language code of every non-English tweet, read straight from the column.
# Walking the frame row by row with iloc[i][1] builds one Series per row and
# relies on positional Series indexing, which recent pandas versions deprecate.
language_list = pandas_text['Language'].tolist()
     

In [17]:
# Language codes found in the tweets that are missing from the googletrans list.
difference = list(set(language_list) - set(google_list))
difference

['qht', 'und', 'in', 'qam', 'zh', 'qme', 'zxx']

In [18]:
# How many tweets carry the 'zh' and 'in' tags, which are recoded further below.
quantidade_zh = int((pandas_text['Language'] == 'zh').sum())
quantidade_in = int((pandas_text['Language'] == 'in').sum())
print(f'Language "zh": {quantidade_zh}')
print(f'Language "in": {quantidade_in}')

Language "zh": 149
Language "in": 251


In [19]:
# Number of rows tagged with one of the codes that are going to be discarded.
len(pandasDF[pandasDF['Language'].isin(['qme', 'und', 'qht', 'zxx', 'qam'])])

917

In [20]:
# Discard, in place, the rows tagged with a code that has no googletrans entry.
pandasDF.drop(
    pandasDF[pandasDF['Language'].isin(['qme', 'und', 'qht', 'zxx', 'qam'])].index,
    inplace=True,
)

In [21]:
# Row count after discarding the unsupported language tags.
len(pandasDF)

49084

In [22]:
# Recode the 'zh' tag as 'zh-cn' so that it matches a googletrans language key.
# One .loc call on the frame itself: the chained form
# pandasDF['Language'].loc[...] = ... may write to a temporary object and leave
# pandasDF unchanged (always the case under pandas Copy-on-Write).
pandasDF.loc[pandasDF['Language'] == 'zh', 'Language'] = 'zh-cn'

In [23]:
# Recode the 'in' tag as 'ms' so that it matches a googletrans language key.
# One .loc call on the frame itself: the chained form
# pandasDF['Language'].loc[...] = ... may write to a temporary object and leave
# pandasDF unchanged (always the case under pandas Copy-on-Write).
# NOTE(review): 'in' is presumably the legacy code for Indonesian ('id'), while
# 'ms' is Malay — confirm that this mapping is intended.
pandasDF.loc[pandasDF['Language'] == 'in', 'Language'] = 'ms'

In [24]:
# Check that the recoded tags kept the same number of rows as before.
quantidade_zh_cn = int((pandasDF['Language'] == 'zh-cn').sum())
quantidade_ms = int((pandasDF['Language'] == 'ms').sum())
print(f'Language "zh_cn": {quantidade_zh_cn}')
print(f'Language "ms": {quantidade_ms}')

Language "zh_cn": 149
Language "ms": 251


In [25]:
# Rebuild the non-English selection so that it reflects the recoded language tags
pandas_text = pandasDF.loc[pandasDF['Language'] != 'en', ['Text', 'Language']]
pandas_text.head(n=10)

Unnamed: 0,Text,Language
0,ChatGPTで遊ぶの忘れてた！！\n書類作るコード書いてみてほしいのと、\nどこまで思考整...,ja
2,"Schaut Euch an, was @fobizz @DianaKnodel alles...",de
4,"Profilinde vatan, Türkiye falan yazan bireyler...",tr
5,ChatGPT’nin bilinmeyen arka planı: Ucuz iş güc...,tr
12,最新コメント15件（01/22 22:45）\n\n【1月17日 登録記事】AzureでCh...,ja
13,Są dobre i złe wieści na temat ChatGPT.\n\n✅ O...,pl
14,#ChatGPT wydaje na operację w chmurze 3 mln do...,pl
16,J'ai juste demander à l'IA chatGPT de faire un...,fr
17,"Google Presenta Sparrow, Su Inteligencia Artif...",es
24,"esse chatGPT é bonzão pra explicar código, tá ...",pt


In [26]:
# List the language code of every non-English tweet, read straight from the column.
# Walking the frame row by row with iloc[i][1] builds one Series per row and
# relies on positional Series indexing, which recent pandas versions deprecate.
language_list = pandas_text['Language'].tolist()
     

In [27]:
# Re-check for language codes missing from the googletrans list after the recoding.
difference = list(set(language_list) - set(google_list))
difference

[]

In [28]:
# Create an instance of the Translator class
# NOTE(review): `translator` is not referenced again in the visible cells; the
# translations are read from a CSV file in the next cell instead.
translator = Translator()

In [29]:
# Load the previously saved translations. The file's own first row is skipped
# (skiprows=[0]) and the column names given here are used instead; the first
# column of the file becomes the index.
# NOTE(review): that index is presumably the original row index of pandasDF,
# which the assignment of 'Traduzido' further below relies on — confirm.
arquivo_traducoes = pd.read_csv("dados/traducao_v2.csv", delimiter = ',', header = None, names = ["Text", "Language","Traduzido"], index_col = 0, skiprows = [0])

In [30]:
# Work on a copy of the pandas dataframe
pandasDF_V2 = pandasDF.copy()

In [31]:
# Replace the text of every non-English tweet with its translation; the
# assigned Series is matched to the rows through the index.
# One .loc call on the frame itself: the chained form
# pandasDF_V2['Text'].loc[...] = ... may write to a temporary object and leave
# pandasDF_V2 unchanged (always the case under pandas Copy-on-Write).
pandasDF_V2.loc[pandasDF_V2['Language'] != 'en', 'Text'] = arquivo_traducoes['Traduzido']

In [32]:
# Keep only the tweet texts, as a one-column dataframe
pandasDF_V2_text = pandasDF_V2['Text'].to_frame()
pandasDF_V2_text

Unnamed: 0,Text
0,I forgot to play ChatGPT! !\nI want you to wri...
1,@AlexandrovnaIng Prohibition of ChatGPT has be...
2,Check out what @fobizz @DianaKnodel brings to ...
3,Bow down to chatGPT 🫡..... https://t.co/ENTSzi...
4,The first thing that individuals who write hom...
...,...
49996,#ChatGPT is a #chatbot that uses artificial in...
49997,@r8r I asked the AI ​​about it (ChatGPT) https...
49998,5 minutes of #chatGPT and I realized that he l...
49999,Portland Shop Uses ChatGPT To Tell Family Stor...


In [33]:
# Turn the pandas frame of tweet texts back into a Spark DataFrame.
spark_df = spSession2.createDataFrame(pandasDF_V2_text)

In [49]:
import pyspark.sql.functions as F

In [52]:
# Clean the tweets with Spark's built-in regexp_replace instead of a Python UDF.
# The patterns are raw strings: "\w" inside a normal literal is an invalid
# Python escape sequence. The string values passed to Spark are unchanged.
# Remove retweet prefixes (RT @xxx:)
tweet_v1 = spark_df.withColumn("Clean", F.regexp_replace("Text", r"RT @[\w]*:", ""))
# Remove twitter handles (@xxx)
tweet_v2 = tweet_v1.withColumn("Clean", F.regexp_replace("Clean", r"@[\w]*", ""))
# Remove URL links (httpxxx)
tweet_v3 = tweet_v2.withColumn("Clean", F.regexp_replace("Clean", r"https?://[A-Za-z0-9./]*", ""))
# Replace every run of non-letter characters with a single space
dfCleanTweet = tweet_v3.withColumn("cleanTweetText", F.regexp_replace("Clean", r"[^A-Za-z]+", " "))

In [34]:
# Wrap cleanTweet as a string-returning UDF and add the cleaned text as a new column.
# NOTE(review): this rebinds dfCleanTweet, which the regexp_replace cell just
# above also assigns.
udfCleanTweet = udf(cleanTweet, returnType=StringType())
dfCleanTweet = spark_df.withColumn(
    'cleanTweetText',
    udfCleanTweet(col('text')),
)
dfCleanTweet.select('text', 'cleanTweetText').show(n=5)

+--------------------+--------------------+
|                text|      cleanTweetText|
+--------------------+--------------------+
|I forgot to play ...|I forgot to play ...|
|@AlexandrovnaIng ...| Prohibition of C...|
|Check out what @f...|Check out what br...|
|Bow down to chatG...|Bow down to chatGPT |
|The first thing t...|The first thing t...|
+--------------------+--------------------+
only showing top 5 rows



In [54]:
# Inspect the cleaned text in full, without truncation.
dfCleanTweet.select('cleanTweetText').show(truncate = False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|cleanTweetText                                                                                                                                                                                                                                                        |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|I forgot to play ChatGPT I want you to write a code to create a document How far can you organize your thoughts I want to try sea turtle soup Please do your research I messed around with it a bit at the o

In [53]:
# Split the cleaned text into lower-cased tokens stored in the 'words' column.
# A leading space in the cleaned text yields an empty first token (visible in
# the second row of the output below).
tokenizer = Tokenizer(inputCol='cleanTweetText', outputCol='words')
dfCleanTweetTokenized = tokenizer.transform(dfCleanTweet)
dfCleanTweetTokenized.select('text','cleanTweetText','words').show(5)

+--------------------+--------------------+--------------------+
|                text|      cleanTweetText|               words|
+--------------------+--------------------+--------------------+
|I forgot to play ...|I forgot to play ...|[i, forgot, to, p...|
|@AlexandrovnaIng ...| Prohibition of C...|[, prohibition, o...|
|Check out what @f...|Check out what br...|[check, out, what...|
|Bow down to chatG...|Bow down to chatGPT |[bow, down, to, c...|
|The first thing t...|The first thing t...|[the, first, thin...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [54]:
# Drop stop words from the token lists; the result goes to 'filteredTweetText'.
remover = StopWordsRemover(inputCol='words', outputCol='filteredTweetText')
dfStopwordRemoved=remover.transform(dfCleanTweetTokenized)
dfStopwordRemoved.select('text','cleanTweetText','words','filteredTweetText').show(truncate = False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------

In [57]:
# Inspect the token lists that remain after stop-word removal.
dfStopwordRemoved.select('filteredTweetText').show(truncate = False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|filteredTweetText                                                                                                                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[forgot, play, chatgpt, want, write, code, create, document, far, organize, thoughts, want, try, sea, turtle, soup, please, research, messed, around, bit, office, time, want, try, various, things]                               |
|[, prohibition, chatgpt, added, honor, code, daughter, school]                 

In [55]:
# Join the filtered tokens back into one space-separated string per tweet.
udfCleanTweetText = udf(getCleanTweetText, returnType=StringType())
dfFilteredCleanedTweet = dfStopwordRemoved.withColumn(
    'filteredCleanedTweetText',
    udfCleanTweetText(col('filteredTweetText')),
)
dfFilteredCleanedTweet.select('filteredCleanedTweetText').show(n=5)

+------------------------+
|filteredCleanedTweetText|
+------------------------+
|    forgot play chatg...|
|     prohibition chat...|
|    check brings slop...|
|             bow chatgpt|
|    first thing indiv...|
+------------------------+
only showing top 5 rows



In [38]:
# Inspect the re-joined text in full, without truncation.
dfFilteredCleanedTweet.select('filteredCleanedTweetText').show(truncate = False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|filteredCleanedTweetText                                                                                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|forgot play chatgpt want write code create document far organize thoughts want try sea turtle soup please research messed around bit office time want try various things                             |
| prohibition chatgpt added honor code daughter school                                                                                                                                                |


In [46]:
# Sanity check: list any row whose cleaned text still contains a digit.
# The pattern is a raw string because "\d" inside a normal literal is an
# invalid Python escape sequence; the value passed to Spark is unchanged.
dfFilteredCleanedTweet.filter(
    col("filteredCleanedTweetText").rlike(r"\d")
).show()


+----+--------------+-----+-----------------+------------------------+
|Text|cleanTweetText|words|filteredTweetText|filteredCleanedTweetText|
+----+--------------+-----+-----------------+------------------------+
+----+--------------+-----+-----------------+------------------------+



In [56]:
# Score every cleaned tweet with getSentimentScore and keep the result as a float column.
udfSentimentScore = udf(getSentimentScore, returnType=FloatType())
dfSentimentScore = dfFilteredCleanedTweet.withColumn(
    'sentimentScore',
    udfSentimentScore(col('filteredCleanedTweetText')),
)
dfSentimentScore.select('sentimentScore').show(truncate=False)

+--------------+
|sentimentScore|
+--------------+
|0.6486        |
|0.4939        |
|0.2023        |
|0.0           |
|0.5083        |
|0.0           |
|0.0           |
|-0.25         |
|0.25          |
|0.5106        |
|0.875         |
|0.5267        |
|0.0           |
|0.7184        |
|-0.5423       |
|0.4588        |
|0.0           |
|0.4767        |
|-0.1027       |
|0.6259        |
+--------------+
only showing top 20 rows



In [19]:
# Turn the score into a 0/1 label with getSentiment (1 only for scores above zero).
udfSentiment = udf(getSentiment, returnType=IntegerType())
dfSentiment = dfSentimentScore.withColumn(
    'sentiment',
    udfSentiment(col('sentimentScore')),
)
dfSentiment.select('filteredCleanedTweetText', 'sentimentScore', 'sentiment').show(n=5)

+------------------------+--------------+---------+
|filteredCleanedTweetText|sentimentScore|sentiment|
+------------------------+--------------+---------+
|    forgot play chatg...|        0.6486|        1|
|     prohibition chat...|        0.4939|        1|
|    check brings slop...|        0.2023|        1|
|             bow chatgpt|           0.0|        0|
|    first thing indiv...|        0.5083|        1|
+------------------------+--------------+---------+
only showing top 5 rows



In [None]:
# Count the tweets per sentiment label. The aggregation is defined once and
# cached, so printing it and converting it to pandas do not each re-run the
# whole UDF pipeline behind dfSentiment.
sentimentCounts = dfSentiment.groupBy('sentiment').count().cache()
sentimentCounts.show()

dfPlotVaderSentiment = sentimentCounts.toPandas()
dfPlotVaderSentiment

In [None]:
# Inspect the cleaned tweet text in full, without truncation.
dfCleanTweet.select('cleanTweetText').show(truncate =False)