# Data Science for Good - Text classification using PySpark ML


This notebook uses machine learning with <code>PySpark</code> to classify disaster tweets. It was created for the [Kaggle competition](https://www.kaggle.com/c/nlp-getting-started).

In [None]:
!pip install pyspark

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.

import os
# Walk the Kaggle input directory and print the full path of every file found.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in kaggle_input_files:
    print(input_file)


### Importing necessary libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from pyspark.sql.functions import col, udf,regexp_replace,isnull
from pyspark.sql.types import StringType,IntegerType
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, LogisticRegression, DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

### Create a Spark session

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

## Exploratory Data Analysis
### Load the data files

In [None]:
# Location of the competition CSV files inside the Kaggle environment.
filepath = '/kaggle/input/nlp-getting-started'

# Reader options shared by all three files.
# By default Spark's CSV reader treats every physical line as one record and
# uses a backslash as the escape character. With multiLine=True and
# escape='"', a quoted field that contains line breaks or doubled quotes is
# kept inside a single record instead of being split into malformed rows.
# NOTE(review): assumes the tweet text is quoted in the usual CSV style
# (doubled quotes inside quoted fields) -- confirm against the raw files.
csv_options = dict(header=True, inferSchema=True, multiLine=True, escape='"')

sdf_train = spark.read.csv(f'{filepath}/train.csv', **csv_options)
sdf_test = spark.read.csv(f'{filepath}/test.csv', **csv_options)
sdf_sample_submission = spark.read.csv(f'{filepath}/sample_submission.csv', **csv_options)

# Print the inferred column names and types of the training data.
sdf_train.printSchema()

**Predictor variables:** id, keyword, location, text

**Outcome variable:** target

Let's take a look at how the data looks.  
A pandas DataFrame renders more readably in a notebook than the output of Spark's `DataFrame.show()`.

In [None]:
import pandas as pd
# Fetch the first five training rows and render them as a pandas DataFrame.
preview_rows = sdf_train.take(5)
pd.DataFrame(preview_rows, columns=sdf_train.columns)

In [None]:
# Report the number of rows in the training and test DataFrames.
for caption, frame in (
    ("Training Data Record Count:", sdf_train),
    ("Test Data Record Count:", sdf_test),
):
    print(caption, frame.count())

In [None]:
# Convert the training data to pandas and count the rows per target value.
train_pdf = sdf_train.toPandas()
train_pdf.groupby(['target']).size()

The data is well balanced.

## Data Pre-processing


In [None]:
# Keep only the columns used downstream: row id, tweet text and target.
model_columns = ["id", "text", "target"]
ml_df = sdf_train.select(*model_columns)
ml_df.show(n=5)

### Cleaning the dataset
#### Drop null values

In [None]:
# Discard every row that has a null in any selected column, then count what is left.
ml_df = ml_df.na.drop()
ml_df.count()

#### Removing numbers from the tweets

In [None]:
# Add an "only_str" column holding the tweet text with every run of digits removed.
# The pattern is a raw string: in a normal literal '\d' is an invalid escape
# sequence, which Python reports with a warning.
ml_df = ml_df.withColumn("only_str", regexp_replace(col('text'), r'\d+', ''))
ml_df.show(5)

#### Segregating the words from the tweet

In [None]:
# Tokenize the digit-free text into a "words" array, using \W as the tokenizer pattern.
regex_tokenizer = RegexTokenizer(
    inputCol="only_str",
    outputCol="words",
    pattern="\\W",
)
raw_words = regex_tokenizer.transform(ml_df)
raw_words.show(n=5)

#### Removing the stop words from raw words

In [None]:
# Filter stop words out of the "words" column into a new "filtered" column.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
words_df = remover.transform(raw_words)

# Show the tokens before and after filtering, without truncating long cells.
comparison_columns = ["id", "words", "target", "filtered"]
words_df.select(*comparison_columns).show(n=5, truncate=False)

#### Create a features column from the words

In [None]:
# Fit a CountVectorizer on the filtered training words.
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
)
model = cv.fit(words_df)

# Vectorize the training words and copy "target" into a "label" column.
countVectorizer_train = model.transform(words_df).withColumn("label", col('target'))
countVectorizer_train.show(n=5)

In [None]:
# Show each preprocessing stage side by side: raw text, tokens, filtered tokens, features, target.
stage_columns = ['text', 'words', 'filtered', 'features', 'target']
countVectorizer_train.select(*stage_columns).show()

### Separate the Train and Validation Data

In [None]:
# Randomly split the featurized data 80/20 into training and validation sets (fixed seed).
train, validate = countVectorizer_train.randomSplit(
    weights=[0.8, 0.2],
    seed=97435,
)

### Test Data

In [None]:
# The full featurized training set, used to fit the final models.
trainData = countVectorizer_train

# Clean and prepare the test data with the same steps that were applied to the
# training data. Rows are not dropped here (dropna was left disabled in the
# original) -- presumably every test id needs a prediction; TODO confirm
# against the submission format.
testData = sdf_test.select("id", "text")

# Remove digits; raw string so that '\d' is not an invalid escape sequence.
testData = testData.withColumn("only_str", regexp_replace(col('text'), r'\d+', ''))

# Extract raw words.
regex_tokenizer = RegexTokenizer(inputCol="only_str", outputCol="words", pattern="\\W")
testData = regex_tokenizer.transform(testData)

# Remove stop words.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
testData = remover.transform(testData)

# Vectorize with `model`, the CountVectorizer model fitted on the training
# words. Fitting a new vectorizer on the test set would build a different
# vocabulary, so the feature indices would no longer match the ones the
# classifiers were trained on.
countVectorizer_test = model.transform(testData)
testData = countVectorizer_test
testData.show(5)

# Machine Learning Prediction Models
## Naive Bayes Classifier

In [None]:
# Train a multinomial Naive Bayes model on the training split and score the validation split.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
)
nbModel = nb.fit(train)
nb_predictions = nbModel.transform(validate)

In [None]:
# Evaluate the Naive Bayes predictions with the evaluator's default metric and columns.
nbEval = BinaryClassificationEvaluator()
nb_roc = nbEval.evaluate(nb_predictions)
print('Test Area Under ROC', nb_roc)

In [None]:
# Accuracy of the Naive Bayes predictions on the validation split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Accuracy of NaiveBayes is = %g" % nb_accuracy)

## Logistic Regression Model


In [None]:
from pyspark.ml.classification import LogisticRegression
# Fit a logistic regression (10 iterations max) on the training split, using "target" as the label.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='target',
    maxIter=10,
)
lrModel = lr.fit(train)

We can obtain the coefficients by using <code>LogisticRegressionModel</code>’s attributes.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot the fitted logistic regression coefficients in ascending order.
sorted_coefficients = np.sort(lrModel.coefficients)
plt.plot(sorted_coefficients)
plt.ylabel('Beta Coefficients')
plt.show()

Summarize the model

In [None]:
# Training summary of the fitted logistic regression model.
trainingSummary = lrModel.summary
lrROC = trainingSummary.roc.toPandas()

# ROC curve: FPR is plotted on the x-axis and TPR on the y-axis, so the axis
# labels must follow that order (they were swapped before).
plt.plot(lrROC['FPR'], lrROC['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

### Precision and recall

In [None]:
# Precision-recall curve from the training summary: recall on x, precision on y.
pr = trainingSummary.pr.toPandas()
recall_values, precision_values = pr['recall'], pr['precision']
plt.plot(recall_values, precision_values)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

### Validate the model

In [None]:
# Score the validation split with the logistic regression model and preview the predictions.
lrPreds = lrModel.transform(validate)
lr_preview = lrPreds.select('id', 'prediction')
lr_preview.show(n=5)

### Evaluate the Logistic Regression model

In [None]:
# Evaluate the logistic regression predictions with the evaluator's default metric and columns.
lrEval = BinaryClassificationEvaluator()
lr_roc = lrEval.evaluate(lrPreds)
print('Test Area Under ROC', lr_roc)

In [None]:
# Accuracy of the logistic regression predictions on the validation split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
lr_accuracy = evaluator.evaluate(lrPreds)
print("Accuracy of Logistic Regression is = %g" % lr_accuracy)

## Decision Tree Classifier


In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Fit a depth-3 decision tree on the training split and score the validation split.
dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='target',
    maxDepth=3,
)
dtModel = dt.fit(train)
dtPreds = dtModel.transform(validate)
dtPreds.show(n=5)

### Evaluate the Decision Tree model

One simple decision tree performed poorly because it is too weak given the range of different features. The prediction accuracy of decision trees can be improved by Ensemble methods, such as Random Forest and Gradient-Boosted Tree.

In [None]:
# Area under the ROC curve for the decision tree predictions on the validation split.
dtEval = BinaryClassificationEvaluator()
dt_metric_override = {dtEval.metricName: "areaUnderROC"}
dtROC = dtEval.evaluate(dtPreds, dt_metric_override)
print("Test Area Under ROC: " + str(dtROC))

In [None]:
# Accuracy of the decision tree predictions on the validation split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
dt_accuracy = evaluator.evaluate(dtPreds)
print("Accuracy of Decision Trees is = %g" % dt_accuracy)

### Make Predictions based on Decision Tree Model

In [None]:
# Refit the depth-3 decision tree on the full training data and predict the test data.
dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='target',
    maxDepth=3,
)
dtModel = dt.fit(trainData)
dtPreds = dtModel.transform(testData)
dtPreds.show(n=5)

In [None]:
# Write the decision tree predictions to dt_Pred.csv with columns id and target.
dt_submission = dtPreds.select('id', 'prediction').withColumnRenamed('prediction', 'target')
dt_submission.toPandas().to_csv('dt_Pred.csv', index=False, header=True)

## Random Forest Classifier

from pyspark.ml.classification import RandomForestClassifier

# Fit a random forest on the training split and score the validation split.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='target',
)
rfModel = rf.fit(train)
rfPreds = rfModel.transform(validate)

# Preview the raw scores, predicted class and class probabilities.
rf_preview_columns = ['id', 'rawPrediction', 'prediction', 'probability']
rfPreds.select(*rf_preview_columns).show(n=10)

### Evaluate the Random Forest Classifier

# Area under the ROC curve for the random forest predictions on the validation split.
rfEval = BinaryClassificationEvaluator()
rf_metric_override = {rfEval.metricName: "areaUnderROC"}
rfROC = rfEval.evaluate(rfPreds, rf_metric_override)
print("Test Area Under ROC: " + str(rfROC))

# Accuracy of the same predictions.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
rf_accuracy = evaluator.evaluate(rfPreds)
print("Accuracy of Random Forests is = %g" % rf_accuracy)

### Make Predictions using the Model

# Refit the random forest on the full training data and predict the test data.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
)
rfModel = rf.fit(trainData)
rfPreds = rfModel.transform(testData)

In [None]:
#rfPreds.select('id', 'prediction').withColumnRenamed('prediction','target').toPandas()#.to_csv('rf_Preds.csv',index=False)

## Gradient Boosting Classifier

from pyspark.ml.classification import GBTClassifier

# Fit a gradient-boosted tree classifier (10 iterations) on the training split
# and score the validation split. No column names are passed, so the
# classifier's default feature/label columns are used.
gbt = GBTClassifier(
    maxIter=10,
)
gbtModel = gbt.fit(train)
gbtPreds = gbtModel.transform(validate)
gbtPreds.show(n=5)

### Evaluate the Gradient-Boosted Tree Classifier

# Area under the ROC curve for the GBT predictions on the validation split.
gbtEval = BinaryClassificationEvaluator()
gbt_metric_override = {gbtEval.metricName: "areaUnderROC"}
gbtROC = gbtEval.evaluate(gbtPreds, gbt_metric_override)
print("Test Area Under ROC: " + str(gbtROC))

# Accuracy of the same predictions.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
gb_accuracy = evaluator.evaluate(gbtPreds)
print("Accuracy of GBT is = %g" % gb_accuracy)

# Refit the GBT classifier on the full training data, predict the test data and preview the result.
gbt = GBTClassifier(
    maxIter=10,
)
gbtModel = gbt.fit(trainData)
gbtPreds = gbtModel.transform(testData)
gbt_preview = gbtPreds.select('id', 'prediction')
gbt_preview.show(n=5)

In [None]:
#gbtPreds.select('id', 'prediction').withColumnRenamed('prediction','target').toPandas().to_csv('gbt_Preds.csv',index=False)

**Thank you for reading. I appreciate your time! I hope you find this useful.  
If you have any suggestions, please add them in the comments section or reach out to me on [LinkedIn.](https://www.linkedin.com/in/suraj-malpani/)**