### Stack overflow tag classification

Importing packages ...

In [490]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import Row
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark import SparkContext
from pyspark.ml.feature import StringIndexer, VectorIndexer,IndexToString

In [491]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [492]:
import pandas as pd
from bs4 import BeautifulSoup as BS
import re

### Data Overview

In [493]:
data = pd.read_csv("/home/shreyas/GeminiData/seed.csv")

In [494]:
data.columns

Index(['_Id', '_PostTypeId', '_CreationDate', '_Score', '_ViewCount', '_Body',
       '_OwnerUserId', '_LastActivityDate', '_Title', '_Tags', '_AnswerCount',
       '_CommentCount', '_FavoriteCount', '_LastEditorUserId',
       '_AcceptedAnswerId', '_LastEditDate', '_ParentId', '_Category'],
      dtype='object')

In [495]:
data=data[['_Body','_Category']]

In [514]:
testdata = pd.read_csv("/home/shreyas/GeminiData/input_data.csv")
testdata.columns

Index(['_Id', '_PostTypeId', '_CreationDate', '_Score', '_ViewCount', '_Body',
       '_OwnerUserId', '_LastActivityDate', '_Title', '_Tags', '_AnswerCount',
       '_CommentCount', '_FavoriteCount', '_LastEditorUserId',
       '_AcceptedAnswerId', '_LastEditDate', '_ParentId', 'Unnamed: 17',
       'Unnamed: 18', 'Unnamed: 19'],
      dtype='object')

In [515]:
testdata=testdata[['_Body']]

In [498]:
data.head(5)

Unnamed: 0,_Body,_Category
0,"<p>Are questions related to <a href=""http://ww...",bricks
1,<p>What is a good tag for purchasing/acquiring...,bricks
2,"<p>I've asked one, so <a href=""https://bricks....",bricks
3,<p>Lego Mindstorms allows one to write embedde...,bricks
4,<p>I suspect that Mindstorms by itself is not ...,bricks


In [516]:
testdata.head(5)

Unnamed: 0,_Body
0,"<p>Are questions related to <a href=""http://ww..."
1,<p>What is a good tag for purchasing/acquiring...
2,"<p>I've asked one, so <a href=""https://bricks...."
3,<p>Lego Mindstorms allows one to write embedde...
4,<p>I suspect that Mindstorms by itself is not ...


In [500]:
data.isnull().values.any()

False

In [501]:
testdata.isnull().values.any()

True

Testdata contains missing values

In [502]:
data['_Category'].unique().shape

(10,)

In [503]:
data.groupby(['_Category']).count()

Unnamed: 0_level_0,_Body
_Category,Unnamed: 1_level_1
3dprinting,10
agur,10
agur.meta,10
ai,9
arabic,10
arabic.meta,11
avp,10
beer,9
bioinformatics,10
bricks,10


### Object oriented flow

Defining a parent class __categoryPrediction__ which performs data reading, cleaning and transformation to make them suitable for classification models

In [612]:
class categoryPrediction:


    # Method to read training data and extracting only useful columns (body and category) from it.
    # This method takes one argument, filename and returns spark dataframe.
    
    def readTrain(self,file):
        
        #Reading file as pandas dataframe.
        data = pd.read_csv(file)
        data=data[['_Body','_Category']]
        
        # This function is defined later in the same class.
        trainDf=self.readContent(data)
        
        # Converting pandas dataframe to spark dataframe.
        spark_df = sqlContext.createDataFrame(data[['body','_Category']])
        return spark_df
  



   # As testdata doesn't have labels, the same function can't be used for test file. So another function is
        # defined which extracts only body column. This function also returns spark dataframe.
    
    def readTest(self,file):
        testdata = pd.read_csv(file)
        testdata=testdata[['_Body']]
        
        # Testdata has some values missing. The rows which doesn't have data are removed.
        testdata=testdata.dropna()
        trainDf=self.readContent(testdata)
        spark_df_test = sqlContext.createDataFrame(testdata[['body']])
        return spark_df_test

    # This function is referenced in the previous functions. The column '_body' contains data in the html tag 
        # format. This function extracts the text data from the _body column. This function returns cleaned dataframe.
        
    def readContent(self,data):
        for j in range(data.shape[0]):
            soup=BS(data._Body.iloc[j])
            s=''
            
            # The text data is contained in the paragraphs only. So the text inside that tag is extracted using
            # Beautifulsoup library. There are multiple paragraph tags. The text from all of them is Concatenated.
                
            for i in range(len(soup.findAll(lambda tag: tag.name == 'p' ))):
                s+=(soup.find_all('p')[i].get_text())+' '
            s=s[:-1] 
            data.loc[data.index[j], 'body'] = s
        return data
    
    # This method is used to clean the data using spark. This function returns vectorized features.
    # This method can be used for both training and test data.
    
    def clean(self,df):
        
        # This function keeps only words and tokenize them. Other contents like symbols, digits are removed.
        
        regexTokenizer = RegexTokenizer(inputCol="body", outputCol="words", pattern="\\W")
        regexTokenized = regexTokenizer.transform(df)
        
        # This function removes stop words which are irrelevant for classification.
        
        remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        sw=remover.transform(regexTokenized)
        
        # TF-IDF is used to convert tokenized words into the integer features.
        
        hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
        featurizedData = hashingTF.transform(sw)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        return rescaledData
        
    
    # This method is used to convert string labels into integer labels. 
    
    def categoryToLabel(self,df):
        indexer = StringIndexer(inputCol="_Category", outputCol="categoryIndex")
        train = indexer.fit(df).transform(df)
        return train

    # This method is used to convert integer labels created using previous method back to the string category(Tag)
    
    def labelToCategory(self,df):
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=indexer.labels)
        output=labelConverter.transform(df)
        return output
    
    
    # This method is used to split labeled data into training and evaluation (validation) sets.
    # 70% of the total data is used for training and 30% is used for validation.
    # This function returns both the dataframes created.
    
    def split(self,df):
        (train, ev) = df.randomSplit([0.7, 0.3])
        return (train, ev)
        

Let's try to understand what exactly is happening after every operation performed on the training data (Following steps are not the part of class. We are just holding object oriented flow to understand the operations)

In [507]:
spark_df = sqlContext.createDataFrame(data[['body','_Category']])
spark_df.show(5)

+--------------------+---------+
|                body|_Category|
+--------------------+---------+
|Are questions rel...|   bricks|
|What is a good ta...|   bricks|
|I've asked one, s...|   bricks|
|Lego Mindstorms a...|   bricks|
|I suspect that Mi...|   bricks|
+--------------------+---------+
only showing top 5 rows



In [412]:
spark_df_test = sqlContext.createDataFrame(testdata[['body']])
spark_df_test.show(5)

+--------------------+
|                body|
+--------------------+
|Are questions rel...|
|What is a good ta...|
|I've asked one, s...|
|Lego Mindstorms a...|
|I suspect that Mi...|
+--------------------+
only showing top 5 rows



In [486]:
regexTokenizer = RegexTokenizer(inputCol="body", outputCol="words", pattern="\\W")
regexTokenized = regexTokenizer.transform(spark_df)
regexTokenized_test = regexTokenizer.transform(spark_df_test)
regexTokenized.show(2)

+--------------------+---------+--------------------+
|                body|_Category|               words|
+--------------------+---------+--------------------+
|Are questions rel...|   bricks|[are, questions, ...|
|What is a good ta...|   bricks|[what, is, a, goo...|
+--------------------+---------+--------------------+
only showing top 2 rows



In [487]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
sw=remover.transform(regexTokenized)
sw_test=remover.transform(regexTokenized_test)
sw.show(2)

+--------------------+---------+--------------------+--------------------+
|                body|_Category|               words|            filtered|
+--------------------+---------+--------------------+--------------------+
|Are questions rel...|   bricks|[are, questions, ...|[questions, relat...|
|What is a good ta...|   bricks|[what, is, a, goo...|[good, tag, purch...|
+--------------------+---------+--------------------+--------------------+
only showing top 2 rows



#### TF-IDF - Term frequency - inverse document frequency

TF - number of times term is appearing in document <br>
IDF - log of inverse fraction of documents having term t with total number of document.

In [488]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=30)
featurizedData = hashingTF.transform(sw)
featurizedData_test = hashingTF.transform(sw_test)

# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
idfModel_test = idf.fit(featurizedData_test)

rescaledData = idfModel.transform(featurizedData)
rescaledData_test = idfModel_test.transform(featurizedData_test)

rescaledData.show(2)

+--------------------+---------+--------------------+--------------------+--------------------+--------------------+
|                body|_Category|               words|            filtered|         rawFeatures|            features|
+--------------------+---------+--------------------+--------------------+--------------------+--------------------+
|Are questions rel...|   bricks|[are, questions, ...|[questions, relat...|(30,[1,2,4,7,14,2...|(30,[1,2,4,7,14,2...|
|What is a good ta...|   bricks|[what, is, a, goo...|[good, tag, purch...|(30,[1,7,8,9,13,1...|(30,[1,7,8,9,13,1...|
+--------------------+---------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



Other options for feature engineering: <br>
    1) n-grams <br>
    2) bag of words <br>
    3) Word2Vec

In [419]:
final=rescaledData.select("_Category","features")
final.show(2)

+---------+--------------------+
|_Category|            features|
+---------+--------------------+
|   bricks|(20,[1,2,4,7,10,1...|
|   bricks|(20,[3,5,7,8,9,11...|
+---------+--------------------+
only showing top 2 rows



In [534]:
indexer = StringIndexer(inputCol="_Category", outputCol="categoryIndex").fit(final)
train = indexer.transform(final)
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=indexer.labels)

In [535]:
train.show(5)

+---------+--------------------+-------------+
|_Category|            features|categoryIndex|
+---------+--------------------+-------------+
|   bricks|(20,[1,2,4,7,10,1...|          6.0|
|   bricks|(20,[3,5,7,8,9,11...|          6.0|
|   bricks|(20,[2,4,7,8,10,1...|          6.0|
|   bricks|(20,[0,1,3,4,7,9,...|          6.0|
|   bricks|(20,[0,1,2,4,5,10...|          6.0|
+---------+--------------------+-------------+
only showing top 5 rows



Let's continue with the object oriented flow where we had left it, by defining children classes for classification.

### Random Forest

Ensemble method - Uses multiple decision trees. Makes decision by majority voting (Mode, Average)

Defining a child class __randomForest__ which trains a classification model, evaluate performance of model on validation set and predict labels for test set

In [625]:
class randomForest(categoryPrediction):
    
    # This method is used to train classification model using random forest algorithm.
    
    def training(self,df):
        rf = RandomForestClassifier(labelCol="categoryIndex", featuresCol="features", numTrees=10)
        #pipelinerf = Pipeline(stages=[rf, labelConverter])
        rfmodel = rf.fit(df)
        return rfmodel

    # This method is used to evaluate validation set created using method in the parent class. This method takes
    # trained model, validation set as the inputs and returns accuracy ranging from 0 to 1.
    
    def evaluation(self,df,rfmodel):
        predictions1 = rfmodel.transform(df)
        evaluator = MulticlassClassificationEvaluator(labelCol="categoryIndex", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions1)
        return accuracy
        
    # This method is used to predict tags based on trained random forest model. It takes trained model and test
    # data as the argument and returns pandas dataframe having single column which contains predicted tags.
    
    def prediction(self,df,rfmodel):
        rfPred=rfmodel.transform(df)
        rfPred=categoryPrediction.labelToCategory(self,rfPred)
        rfres=(rfPred.select('predictedLabel').toPandas())
        rfres.rename(columns={'predictedLabel':'rf_prediction'}, inplace=True)
        return rfres


__Other hyperparameters:__ <br>
Maxdepth <br>
Maxbins 

### Naive Bayes

Assumes that all the features are independant and are having equal significance

Defining a child class __naiveBayes__ which trains a classification model using Naive Bayes algorithm, evaluate performance of model on validation set and predict labels for test set

In [624]:
class naiveBayes(categoryPrediction):
    
    def training(self,df):
        nb = NaiveBayes(smoothing=1.0, modelType="multinomial",labelCol="categoryIndex")
        nbmodel = nb.fit(df)
        return nbmodel

    def evaluation(self,df,nbmodel):
        predictions2 = nbmodel.transform(df)
        evaluator = MulticlassClassificationEvaluator(labelCol="categoryIndex", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions2)
        return accuracy
        
    def prediction(self,df,nbmodel):
        nbPred=nbmodel.transform(df)
        nbPred=categoryPrediction.labelToCategory(self,nbPred)
        nbres=(nbPred.select('predictedLabel').toPandas())
        nbres.rename(columns={'predictedLabel':'nb_prediction'}, inplace=True)
        return nbres
    

__Other model types:__ <br>
    Gaussian model - Continuous data <br>
    Bernoulli's model - Binary features

__Smoothing__:
Some Probabilities might be 0, So Laplace's smoothing is used. The number is added to the total count.

### Logistic regression

Gives probability of every class. The class having maximum probability is the selected class

Defining a child class __logisticRegression__ which trains a classification model using logistic regression algorithm, evaluate performance of model on validation set and predict labels for test set

In [623]:
class logisticRegression(categoryPrediction):
    
    def training(self,df):
        lr = LogisticRegression(labelCol="categoryIndex", featuresCol="features", maxIter=20)
        #pipelinerf = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])
        lrmodel = lr.fit(df)
        return lrmodel

    def evaluation(self,df,lrmodel):
        predictions3 = lrmodel.transform(df)
        evaluator = MulticlassClassificationEvaluator(labelCol="categoryIndex", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions3)
        return accuracy
        
    def prediction(self,df,lrmodel):
        lrPred=lrmodel.transform(df)
        lrPred=categoryPrediction.labelToCategory(self,lrPred)
        lrres=(lrPred.select('predictedLabel').toPandas())
        lrres.rename(columns={'predictedLabel':'lr_prediction'}, inplace=True)
        return lrres
    

#### Note: Hyperparameter tuning - Gridsearch
Enables to vary parameters and best parameters are suggested based on cross-validation score.

Creating objects for children and parent classes

In [616]:
obj=categoryPrediction()
rfobj=randomForest()
nbobj=naiveBayes()
lrobj=logisticRegression()

Data reading, cleaning and transformation

In [617]:
train=obj.readTrain("/home/shreyas/GeminiData/seed.csv")
testSet=obj.readTest("/home/shreyas/GeminiData/input_data.csv")
cleanTrain=obj.clean(train)
cleantestSet=obj.clean(testSet)
indexedTrain=obj.categoryToLabel(cleanTrain)
trainSet,evalSet=obj.split(indexedTrain)

Working with logistic regression model

In [620]:
lrmodel=lrobj.training(trainSet)
lrAcc=lrobj.evaluation(evalSet,lrmodel)
lrres=lrobj.prediction(cleantestSet,lrmodel)
lrAccTrain=nbobj.evaluation(trainSet,nbmodel)
print('Logistic regression train set Accuracy: ',lrAccTrain*100,'%')
print('Logistic regression Accuracy: ',lrAcc*100,'%')

Logistic regression train set Accuracy:  61.76470588235294 %
Logistic regression Accuracy:  19.35483870967742 %


Working with Naive Bayes model

In [621]:
nbmodel=nbobj.training(trainSet)
nbAcc=nbobj.evaluation(evalSet,nbmodel)
nbres=nbobj.prediction(cleantestSet,nbmodel)
nbAccTrain=nbobj.evaluation(trainSet,nbmodel)
print('Naive Bayes train set Accuracy: ',nbAccTrain*100,'%')
print('Naive Bayes validation set Accuracy: ',nbAcc*100,'%')

Naive Bayes train set Accuracy:  61.76470588235294 %
Naive Bayes validation set Accuracy:  12.903225806451612 %


Working with random forest model

In [622]:
rfmodel=rfobj.training(trainSet)
rfAcc=rfobj.evaluation(evalSet,rfmodel)
rfres=rfobj.prediction(cleantestSet,rfmodel)
rfAccTrain=nbobj.evaluation(trainSet,rfmodel)
print('Random forest train set Accuracy: ',rfAccTrain*100,'%')
print('Random Forest Accuracy: ',rfAcc*100,'%')

Random forest train set Accuracy:  89.70588235294117 %
Random Forest Accuracy:  25.806451612903224 %


All the models are over-fitted and performing very bad due to insufficient data.

In RF, trees are splitted in random subset of features which avoids overfitting and also weak features are improved by multiple decision trees

__Other metrics to evaluate classification algorithms:__
1) Precision <br>
2) Recall <br>
3) Confussion matrix <br>
4) F-1 Score <br>
5) Roc Curve <br>
    

Creating a pandas dataframe which contains text data and predicted tags using all the classification mentioned above

In [553]:
question=testSet.toPandas()
pred=pd.concat([question,rfres,nbres,lrres],axis=1)

Writing dataframe into a csv file

In [571]:
pred.to_csv('Stack Overflow Tag Classification.csv')