In [None]:
!pip install pyspark

In [None]:
from pyspark import SparkConf, SparkContext, SQLContext
import re

import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Configuration
conf = SparkConf()
# getOrCreate() returns the already-active SparkContext when there is one, so
# re-executing this cell does not fail because a context is already running.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

In [None]:
# Locations of the raw input files; replace the placeholders with real paths.
TRAIN_CSV_PATH = "path to train.csv"
TEST_CSV_PATH = "path to test.csv"

# Read each file as an RDD of text lines.
train_rdd = sc.textFile(TRAIN_CSV_PATH)
test_rdd = sc.textFile(TEST_CSV_PATH)

# Parse each raw line into a (label, sentence) tuple
def split(line):
  """Parse one raw text line into a ``(label, sentence)`` tuple.

  The label is the single character at index 1 converted to ``int``; the
  sentence is the slice from index 5 up to, but excluding, the last character.
  NOTE(review): this presumably targets rows shaped like ``"<digit>","<text>"``
  -- confirm against the actual CSV files.

  Raises:
    ValueError: if the character at index 1 is not an integer literal.
    IndexError: if ``line`` has fewer than two characters.
  """
  return int(line[1]), line[5:-1]

# Turn every raw line of both datasets into a (label, sentence) tuple.
train_rdd, test_rdd = train_rdd.map(split), test_rdd.map(split)

# Show the first 10 parsed training rows as a quick sanity check.
for row in train_rdd.take(10):
    print(row)

In [None]:
# Preprocess
# Fetch the NLTK "stopwords" corpus; stopwords.words('english') below reads it.
nltk.download('stopwords')

# Remove special characters
# Matches any single character that is not an ASCII letter, a digit or whitespace.
pattern = r'[^a-zA-Z0-9\s]'
def remove_special_characters(text):
    """Return ``text`` with special characters and newlines turned into spaces.

    Every character matched by ``pattern`` is replaced by one space, then each
    remaining ``'\\n'`` is replaced by a space as well. Other whitespace
    characters are left untouched.
    """
    return re.sub(pattern, ' ', text).replace('\n', ' ')

# Remove standalone numbers and stopwords
def is_number(s):
    """Return True when the string ``s`` is a numeric literal.

    A token counts as a number when it consists only of digits, or when it
    contains at least one digit and ``float()`` can parse it (for example
    ``"3.14"``, ``"-7"`` or ``"1e5"``).

    Args:
        s: The token to test; must be a ``str``.

    Returns:
        bool: True for numeric tokens, False otherwise (including ``""``).
    """
    if s.isdigit():
        return True
    # float() also parses the words "nan", "inf" and "infinity". Those are
    # ordinary text tokens here, so a number must contain at least one digit.
    if not any(ch.isdigit() for ch in s):
        return False
    try:
        float(s)
    except ValueError:
        # Only a failed parse means "not a number"; other errors must surface.
        return False
    return True
# English stopwords from NLTK. The empty string is kept in the set so any other
# code that filters tokens produced by split(' ') still drops empty tokens.
stop_words = set(stopwords.words('english'))
stop_words.add('')
def remove_numbers_and_stopwords(text):
    """Drop numeric tokens and stopwords from ``text``.

    The text is split on any run of whitespace, so tokens separated by tabs,
    carriage returns or other whitespace are checked individually rather than
    surviving as one combined token. Remaining tokens are joined with single
    spaces.

    Args:
        text: The string to filter (expected to be lower-cased already, since
            the stopword lookup is case-sensitive).

    Returns:
        str: The kept tokens joined by single spaces; ``""`` if none remain.
    """
    return " ".join(
        token
        for token in text.split()
        if not is_number(token) and token not in stop_words
    )

# Remove redundant spaces
def remove_extra_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as well.

    Args:
        text: The string to normalise.

    Returns:
        str: ``text`` with words separated by exactly one space.
    """
    # split() with no argument discards empty fields. split(' ') keeps them,
    # so joining its result with ' ' just rebuilt the input unchanged.
    return " ".join(text.split())

# Lemmatization
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')
def lemmatize(text):
    """Lemmatize each space-separated word of ``text``.

    A new ``WordNetLemmatizer`` is created on every call and applied, with its
    default arguments, to each piece obtained by splitting on a single space.
    The results are joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()
    return " ".join(map(wnl.lemmatize, text.split(" ")))

def preprocess(text):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: replace special characters, lower-case, drop numbers and
    stopwords, remove extra spaces, lemmatize. Returns the resulting string.
    """
    lowered = remove_special_characters(text).lower()
    filtered = remove_numbers_and_stopwords(lowered)
    return lemmatize(remove_extra_spaces(filtered))

# Remove rows whose text is None or empty
def filter_empty_and_none(row):
    """Return True when the second element of ``row`` is neither None nor empty.

    ``row`` is indexed at position 1; the element found there must support
    ``len()`` unless it is None.
    """
    value = row[1]
    if value is None:
        return False
    return len(value) > 0

def preprocess_rdd(rdd):
    """Clean the text of every ``(label, text)`` row and drop empty results.

    Args:
        rdd: An RDD whose elements are indexable pairs with the label at
            position 0 and the raw text at position 1.

    Returns:
        A new RDD of ``(label, preprocessed_text)`` tuples, keeping only the
        rows accepted by ``filter_empty_and_none``.
    """
    return (
        rdd
        .map(lambda x: (x[0], preprocess(x[1])))
        .filter(filter_empty_and_none)
    )

# Both datasets go through the same pipeline, so it is defined only once.
train_preprocessed_rdd = preprocess_rdd(train_rdd)
test_preprocessed_rdd = preprocess_rdd(test_rdd)

# Show the first 10 preprocessed test rows as a quick sanity check.
for row in test_preprocessed_rdd.take(10):
    print(row)

In [None]:
# Shut down the SparkContext created in the configuration cell.
sc.stop()