In [16]:
import pandas as pd

The multi-line records in the CSV need to be processed manually:

In [17]:
# Load the raw tweets; the file is semicolon-separated and UTF-16 encoded.
df = pd.read_csv(
    "tweets.csv",
    encoding="utf-16",
    sep=";",
)

# Letter Casing
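Converting all letters to a single case (either upper or lower), so that the same word written with different casing is treated as one token. Here we use lower case.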

In [18]:
# Lower-case every tweet with the vectorised string accessor. Unlike a per-row
# lambda calling `s.lower()`, it leaves missing values (NaN) untouched instead
# of raising AttributeError on them.
df['full_text'] = df['full_text'].str.lower()

# Show the first tweet by position, so this works whatever the index labels are.
df['full_text'].iloc[0]

'participei do @canalmynews, ontem. falei sobre o comportamento de bolsonaro na assembleia  geral da onu: passamos um vexame terrível!conversamos, também, sobre mais temas da política. vale muito a pena ver. segue o link: https://t.co/mme7idvnnc https://t.co/ihuuvlb1ej'

# Tokenizing 
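Turning the tweets into tokens. Tokens are the individual units of a text: mostly words, but, as the output below shows, mentions, hashtags and punctuation marks also become tokens of their own.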

In [19]:
from nltk.tokenize import TweetTokenizer

# NLTK's tweet tokenizer, created with its default options.
tknzr = TweetTokenizer()

# Tokenize each tweet; every row of `tk_text` holds that tweet's list of tokens.
df['tk_text'] = df['full_text'].apply(tknzr.tokenize)
df["tk_text"]

0       [participei, do, @canalmynews, ,, ontem, ., fa...
1       [@dicabairro, complicado, né, ,, se, fosse, só...
2       [o, bate, boca, ,, os, xingamentos, ,, as, acu...
3       [@bozistas, @steveemagal, @sabrinapinage, @elo...
4       [as, coisas, no, seu, lugar, -, hhhuuummm, ......
                              ...                        
2863    [@flareis, @deltanmd, #acaboubolsonaro, #bolso...
2864    [esse, cara, é, um, completo, babaca, ., sem, ...
2865    [@castilho_ivete, o, bolsonaro, esta, perdido,...
2866    [@johnnytb_, @paulo_souza35, @uol, bom, ,, se,...
2867    [ipec, :, lula, aparece, na, liderança, nos, d...
Name: tk_text, Length: 2868, dtype: object

# Noise removal
Eliminating unwanted characters, such as HTML tags, punctuation marks, special characters, whitespace, etc.

In [20]:
import string

# Translation table that deletes every ASCII punctuation character. It is built
# once here instead of once per token.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Prefixes that identify a token as a URL.
_URL_PREFIXES = ("http://", "https://", "www.")


def _remove_noise(tokens):
    """Return the clean, purely alphabetic tokens from one tweet's token list.

    Steps, in order:
      1. Drop tokens that are URLs. If a URL reaches this step as a single
         token, stripping its punctuation would turn e.g. "https://t.co/abcdef"
         into "httpstcoabcdef", which is alphabetic and would otherwise be kept
         as a junk word.
      2. Strip ASCII punctuation from the remaining tokens (this also removes
         the "@", "#" and "_" in mentions and hashtags).
      3. Keep only tokens made entirely of letters; this discards empty
         strings, numbers and anything still containing digits or symbols.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    stripped = (
        tok.translate(_PUNCT_TABLE)
        for tok in tokens
        if not tok.lower().startswith(_URL_PREFIXES)
    )
    return [tok for tok in stripped if tok.isalpha()]


# remove URLs, punctuation and any remaining non-alphabetic noise
df["tk_text"] = df["tk_text"].apply(_remove_noise)
df["tk_text"]

0       [participei, do, canalmynews, ontem, falei, so...
1       [dicabairro, complicado, né, se, fosse, só, o,...
2       [o, bate, boca, os, xingamentos, as, acusações...
3       [bozistas, steveemagal, sabrinapinage, lauroja...
4       [as, coisas, no, seu, lugar, hhhuuummm, empres...
                              ...                        
2863    [flareis, deltanmd, acaboubolsonaro, bolsonaro...
2864    [esse, cara, é, um, completo, babaca, sem, ide...
2865    [castilhoivete, o, bolsonaro, esta, perdido, e...
2866    [johnnytb, uol, bom, se, vê, que, morreu, mais...
2867    [ipec, lula, aparece, na, liderança, nos, dois...
Name: tk_text, Length: 2868, dtype: object

# Stopword removal
Some words do not contribute much to the machine learning model, so it's good to remove them. A list of stopwords can be defined by the nltk library, or it can be business-specific.

In [21]:
import nltk
from nltk.corpus import stopwords

# Make sure NLTK's stopword corpus is available locally.
nltk.download('stopwords')

# Start from NLTK's Portuguese stopwords, then add the hand-picked extras.
swords = set(stopwords.words('portuguese'))
swords.update(["q", "né", "vc", "c", "n", "vcs", "eh", "pra"])

# Keep only the tokens that are not stopwords.
df["tk_text"] = df["tk_text"].apply(
    lambda tokens: [tok for tok in tokens if tok not in swords]
)
df["tk_text"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexande\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0       [participei, canalmynews, ontem, falei, sobre,...
1       [dicabairro, complicado, supremo, votamos, bol...
2       [bate, boca, xingamentos, acusações, revelaçõe...
3       [bozistas, steveemagal, sabrinapinage, lauroja...
4       [coisas, lugar, hhhuuummm, empresa, exesposa, ...
                              ...                        
2863    [flareis, deltanmd, acaboubolsonaro, bolsonaro...
2864    [cara, completo, babaca, ideias, nenhuma, melh...
2865    [castilhoivete, bolsonaro, perdido, sabe, lula...
2866    [johnnytb, uol, bom, vê, morreu, bolsonaristas...
2867    [ipec, lula, aparece, liderança, dois, cenário...
Name: tk_text, Length: 2868, dtype: object

# Stemming
Stemming is the process of reducing the word to its root word. For example, the word "running" is reduced to "run".

In [22]:
from nltk.stem import RSLPStemmer

# Fetch the 'rslp' package through NLTK's downloader before creating the stemmer.
nltk.download('rslp')
stemmer = RSLPStemmer()

# Stem token by token, storing one list of stems per tweet in a new column.
df["stemmed_text"] = df["tk_text"].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
df["stemmed_text"]

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\alexande\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


0       [particip, canalmynew, ont, fal, sobr, comport...
1       [dicabairr, complic, supr, vot, bolsonar, mud,...
2       [bat, boc, xing, acus, revel, crim, entr, sen,...
3       [bozist, steveemag, sabrinapinag, laurojardim,...
4       [cois, lug, hhhuuummm, empr, exesp, capitã, bo...
                              ...                        
2863    [flarel, deltanmd, acaboubolsonar, bolsonarove...
2864    [car, complet, babac, ide, nenhum, melhor, paí...
2865    [castilhoivet, bolsonar, perd, sab, lul, lid, ...
2866    [johnnytb, uol, bom, vê, morr, bolsonar, bolso...
2867    [ipec, lul, aparec, lideranç, doi, cen, corr, ...
Name: stemmed_text, Length: 2868, dtype: object

In [23]:
# Write the processed frame with the same separator and encoding used when
# reading the input (semicolon, UTF-16), leaving the index out of the file.
df.to_csv(
    "clean_tweets.csv",
    index=False,
    encoding="utf-16",
    sep=";",
)