In [1]:
import spacy
import nltk
 
# Load spaCy's small English pipeline (model list: https://spacy.io/models).
# The dependency parser and the named-entity recogniser are disabled because
# the notebook only needs the pipeline for tokenization and lemmatization.
nlp = spacy.load(
  'en_core_web_sm',
  disable=['parser', 'ner'],
)

In [2]:
# Read the training reviews into a DataFrame and preview the first five rows
import pandas as pd

df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,index,review_title,review_text
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


In [3]:
# Take the text of the first review as a running example for the next cells
review_text = df['review_text'].iloc[0]
review_text

'Gave this to my dad for a gag gift after directing "Nunsense," he got a reall kick out of it!'

In [4]:
# Baseline: tokenize the example review with NLTK's word tokenizer
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the 'punkt' data package; download it if it is missing
nltk.download('punkt')

nltk_tokens = word_tokenize(review_text)
print(nltk_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/vhugobarnes/nltk_data...


['Gave', 'this', 'to', 'my', 'dad', 'for', 'a', 'gag', 'gift', 'after', 'directing', '``', 'Nunsense', ',', "''", 'he', 'got', 'a', 'reall', 'kick', 'out', 'of', 'it', '!']


[nltk_data]   Package punkt is already up-to-date!


In [5]:
import re

# Tokenize with spaCy, skipping punctuation and any token spaCy flags as a URL
# or an e-mail address. Websites have to be filtered on the raw token: once the
# dots are stripped "example.com" collapses to "examplecom", and a pattern such
# as r"\S+com" also eats ordinary words ("welcome" -> "e").
words = [
  token.text
  for token in nlp(review_text)
  if not (token.is_punct or token.like_url or token.like_email)
]

# Remove digits and other symbols except @ (kept so that any e-mail-like
# leftover can still be recognised below)
words = [re.sub(r"[^A-Za-z@]", "", word) for word in words]

# Blank out remaining e-mail-like tokens entirely (same pattern as text_preprocessing)
words = [re.sub(r"\S+@\S+", "", word) for word in words]

# Drop tokens that the substitutions above reduced to an empty string
words = [word for word in words if word]

words

['Gave',
 'this',
 'to',
 'my',
 'dad',
 'for',
 'a',
 'gag',
 'gift',
 'after',
 'directing',
 'Nunsense',
 'he',
 'got',
 'a',
 'reall',
 'kick',
 'out',
 'of',
 'it']

In [6]:
# Fetch NLTK's English stop-word list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# Normalise the stop words to lowercase so comparisons are case-insensitive
stopwords_lower = list(map(str.lower, stopwords))

# Lowercase every word of the review and keep only those that are not stop words
lowercased_words = (word.lower() for word in words)
words = [word for word in lowercased_words if word not in stopwords_lower]
words

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vhugobarnes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['gave', 'dad', 'gag', 'gift', 'directing', 'nunsense', 'got', 'reall', 'kick']

In [7]:
# Lemmatization demo: map every non-punctuation token to its lemma
example_phrase = "the cats are playing in the yard"
example_doc = nlp(example_phrase)
words = [token.lemma_ for token in example_doc if not token.is_punct]
words

['the', 'cat', 'be', 'play', 'in', 'the', 'yard']

In [8]:
def text_preprocessing(str_input):
  """Normalise a raw review string into a space-separated string of lemmas.

  Steps, in order:
    1. Tokenize with the notebook-level spaCy pipeline ``nlp``; skip
       punctuation and tokens spaCy flags as URLs or e-mail addresses.
    2. Take each remaining token's lemma and strip every character that is
       not an ASCII letter or ``@``.
    3. Blank out anything that still looks like an e-mail address and drop
       the strings that ended up empty.
    4. Lowercase the words and drop those found in the notebook-level
       ``stopwords_lower``.

  Args:
    str_input: Text to clean. ``None`` and ``""`` are accepted.

  Returns:
    The surviving lowercase words joined by single spaces, or ``""`` when
    nothing survives.
  """
  if not str_input:
    return ""

  # URLs and e-mail addresses are filtered on the raw token. After the dots are
  # stripped "example.com" collapses to "examplecom", and the previous
  # r"\S+com" pattern also mangled ordinary words ("welcome" -> "e").
  words = [
    token.lemma_
    for token in nlp(str_input)
    if not (token.is_punct or token.like_url or token.like_email)
  ]

  # Keep ASCII letters only; "@" survives so e-mail-like leftovers are caught next
  words = [re.sub(r"[^A-Za-z@]", "", word) for word in words]
  words = [re.sub(r"\S+@\S+", "", word) for word in words]

  # The substitutions can leave empty strings behind (e.g. purely numeric tokens)
  words = [word for word in words if word]

  # Remove stopwords; a set makes each membership test constant-time
  stopword_set = set(stopwords_lower)
  words = [word.lower() for word in words if word.lower() not in stopword_set]

  # Combine the list into one string
  return " ".join(words)

In [13]:
import csv
from itertools import islice

#! Only the first 10,000 reviews are processed because 600,000 are too much for a regular computer
MAX_TRAIN_ROWS = 10000

#? The output is opened with "w" (not "a") so that re-running the cell rewrites
#? the file instead of appending a second header and a duplicate set of rows.
#? newline="" is the mode the csv module expects for both reading and writing;
#? the `with` block closes both files even if preprocessing raises.
with open("train.csv", "r", newline="", encoding="utf-8") as train_csv, \
     open("preprocessed_train.csv", "w", newline="", encoding="utf-8") as preprocessed_train_file:
  # DictReader consumes the header line (so it is never preprocessed as if it
  # were a review) and exposes every field by column name. Looking fields up by
  # name keeps them aligned even though the file carries an extra leading
  # column (shown as "Unnamed: 0" in the df.head() preview).
  csv_reader = csv.DictReader(train_csv)
  csv_writer = csv.writer(preprocessed_train_file, lineterminator="\n")

  # Write down headers
  csv_writer.writerow(["index", "review_title", "review_text"])

  # Loop through the file and preprocess the first MAX_TRAIN_ROWS reviews
  for row in islice(csv_reader, MAX_TRAIN_ROWS):
    # `or ""` covers short rows, for which DictReader fills missing fields with None
    clean_title = text_preprocessing(row["review_title"] or "")
    clean_text = text_preprocessing(row["review_text"] or "")
    csv_writer.writerow([row["index"], clean_title, clean_text])

In [14]:
import csv
from itertools import islice

#! Only the first 10,000 reviews are processed because 600,000 are too much for a regular computer
MAX_TEST_ROWS = 10000

#? The output is opened with "w" (not "a") so that re-running the cell rewrites
#? the file instead of appending a second header and a duplicate set of rows.
#? newline="" is the mode the csv module expects for both reading and writing;
#? the `with` block closes both files even if preprocessing raises.
with open("test.csv", "r", newline="", encoding="utf-8") as test_csv, \
     open("preprocessed_test.csv", "w", newline="", encoding="utf-8") as preprocessed_test_file:
  # DictReader consumes the header line (so it is never preprocessed as if it
  # were a review) and exposes every field by column name.
  # NOTE(review): assumes test.csv has the same "index", "review_title" and
  # "review_text" header names as train.csv - TODO confirm.
  csv_reader = csv.DictReader(test_csv)
  csv_writer = csv.writer(preprocessed_test_file, lineterminator="\n")

  # Write down headers
  csv_writer.writerow(["index", "review_title", "review_text"])

  # Loop through the file and preprocess the first MAX_TEST_ROWS reviews
  for row in islice(csv_reader, MAX_TEST_ROWS):
    # `or ""` covers short rows, for which DictReader fills missing fields with None
    clean_title = text_preprocessing(row["review_title"] or "")
    clean_text = text_preprocessing(row["review_text"] or "")
    csv_writer.writerow([row["index"], clean_title, clean_text])