In [None]:
import string
import pandas as pd
import nltk
import re

# Fetch the NLTK data packages used by clean_text: the stop-word lists read
# through nltk.corpus.stopwords, and the WordNet corpus (presumably what
# nltk.WordNetLemmatizer looks words up in -- TODO confirm no extra package
# is needed for the installed NLTK version).
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Patterns used by clean_text, compiled once. Raw strings keep the regex
# escapes (\w, \S) from being treated as invalid string escapes.
# Alternatives, in order: user mention (underscores are valid in handles),
# any character that is not an ASCII letter/digit/space/tab, and URLs.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")
_DIGITS_RE = re.compile(r"[0-9]+")

# Lazily filled cache for the NLTK objects, so they are built once instead of
# once per cleaned row.
_TEXT_TOOLS = {}


def _text_tools():
    """Return the cached stop-word set, stemmer and lemmatizer, building them on first use."""
    if not _TEXT_TOOLS:
        # Build everything first so a failure (e.g. a corpus that has not been
        # downloaded) does not leave a half-filled cache behind.
        tools = {
            # A frozenset gives O(1) membership tests instead of scanning a list.
            "stopwords": frozenset(nltk.corpus.stopwords.words('english')),
            "stemmer": nltk.PorterStemmer(),
            "lemmatizer": nltk.WordNetLemmatizer(),
        }
        _TEXT_TOOLS.update(tools)
    return _TEXT_TOOLS


def clean_text(text):
    """Normalize a raw tweet into a space-separated string of word stems.

    Steps, in order:
      1. replace user mentions, URLs and every character that is not an
         ASCII letter, digit, space or tab with a space;
      2. delete digits (so "2day" becomes "day");
      3. lower-case and split on whitespace;
      4. drop English stop words;
      5. apply the Porter stemmer, then the WordNet lemmatizer;
      6. drop tokens of length 0 or 1 and join the rest with single spaces.

    Args:
        text: the raw tweet. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text as a single string; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    tools = _text_tools()
    stopword = tools["stopwords"]
    stemmer = tools["stemmer"]
    lemmatizer = tools["lemmatizer"]

    # After this substitution only ASCII letters, digits, spaces and tabs
    # remain, so no separate punctuation pass is needed.
    text = _TWEET_NOISE_RE.sub(" ", text)
    text = _DIGITS_RE.sub("", text)

    # Lower-case before filtering: the stop-word list is lower-case, so
    # "The"/"My" would otherwise slip through and be lower-cased later by the
    # stemmer. str.split() also discards empty tokens.
    tokens = [word for word in text.lower().split() if word not in stopword]

    # NOTE(review): lemmatizing already-stemmed tokens rarely changes them;
    # the original stem-then-lemmatize order is kept so results stay comparable.
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]

    # One-letter leftovers carry no signal for the later feature extraction.
    return " ".join(word for word in tokens if len(word) > 1)

# Load and prepare the dataset

In [None]:
# Read only the sentiment label (column 0) and the tweet body (column 5) from
# the header-less, latin-1 encoded CSV, and give them readable names.
# The original note lists the file's columns as
# [0, 1, 2, 4, 5] = ['polarity', 'id', 'date', 'user', 'text'].
data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    header=None,
    encoding='latin-1',
    usecols=[0, 5],
).rename(columns={0: 'polarity', 5: 'text'})

# Downcast the label to a compact integer dtype, then recode it as a binary
# target: 0 stays 0 (negative, one-hot [1, 0]) and 4 becomes 1 (positive,
# one-hot [0, 1]). Values other than 0 and 4 are left untouched by replace().
data['polarity'] = (
    pd.to_numeric(data['polarity'], downcast='integer')
    .replace({0: 0, 4: 1})
)

# Cleanse the data

In [None]:
# Run every raw tweet through clean_text and store the result in a new column,
# leaving the original 'text' column in place.
data['clean_text'] = data['text'].apply(clean_text)

# Save dataframe to cleaned_data.csv

In [None]:
# Write the whole frame (label, raw text and cleaned text) to disk without the
# row index.
# NOTE(review): rows whose clean_text is an empty string may come back as NaN
# when this file is re-read with pd.read_csv -- confirm downstream handling.
data.to_csv(path_or_buf='cleaned_data.csv', index=False)