In [None]:
#NAIVE BAYES

In [26]:
from pyspark import SparkContext
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.regression import LabeledPoint

def labelData(line):
        probability = [float(row) for row in line.split(' ')]
        return LabeledPoint(probability[0], probability[1:])

if __name__ == "__main__":

    #sc = SparkContext(appName="Naive-Bayes")

    # Load and parse the data file.
    NYT_data = sc.textFile("Desktop/SHEF/known_cleaned.txt")
    labeledData = NYT_data.map(labelData)
    NYT_data_unknown = sc.textFile("Desktop/SHEF/unknown_cleaned.txt")
    unknown_labeledData = NYT_data_unknown.map(labelData)

    # Dividing data to training and test data with a ration of 8/2.
    training, test = labeledData.randomSplit([0.6, 0.4])

    naive_model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    prediction = test.map(lambda a: (naive_model.predict(a.features), a.label))
    efficiency = 1.0 * prediction.filter(lambda b: b[0] == b[1]).count() / test.count()
    error = 1 - efficiency
    accuracy_percent = efficiency*100
    error_percent = error*100
    print('Accuracy on Known Data {}'.format(accuracy_percent))
    
    prediction = unknown_labeledData.map(lambda a: (naive_model.predict(a.features), a.label))
    efficiency = 1.0 * prediction.filter(lambda b: b[0] == b[1]).count() / unknown_labeledData.count()
    error = 1 - efficiency
    accuracy_percent = efficiency*100
    error_percent = error*100
    print('Accuracy {} on Unknown Data'.format(accuracy_percent))

Accuracy on Known Data 40.666666666666664
Accuracy 40.909090909090914 on Unknown Data


In [18]:
# RANDOM FOREST

In [28]:
from __future__ import print_function

from pyspark import SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel

def labelData(line):
        probability = [float(row) for row in line.split(' ')]
        return LabeledPoint(probability[0], probability[1:])

if __name__ == "__main__":
    #sc = SparkContext(appName="PythonRandomForestClassificationExample")
   
    NYT_data = sc.textFile("Desktop/SHEF/known_cleaned.txt")
    labeledData = NYT_data.map(labelData)
    NYT_data_unknown = sc.textFile("Desktop/SHEF/unknown_cleaned.txt")
    unknown_labeledData = NYT_data_unknown.map(labelData)
    
    training, test = labeledData.randomSplit([0.6, 0.4])
    
    # Train a RandomForest model.
    RandomForestmodel = RandomForest.trainClassifier(training, numClasses=4, categoricalFeaturesInfo={},
                                         numTrees=1, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=2, maxBins=25)

    # Evaluate model on test instances and compute test error
    prediction = RandomForestmodel.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(prediction)
    #print(labelsAndPredictions.collect())
    Error = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(test.count())
    print('Error on Known data = ' + str(Error))
    
    prediction = RandomForestmodel.predict(unknown_labeledData.map(lambda x: x.features))
    labelsAndPredictions = unknown_labeledData.map(lambda lp: lp.label).zip(prediction)
    #print(labelsAndPredictions.collect())
    Error = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(unknown_labeledData.count())
    print('Error on unknown data = ' + str(Error))
    
    
    


Error on Known data = 0.3105022831050228
Error on unknown data = 0.31105169340463457


In [None]:
# LOGISTIC REGRESSION

In [30]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# $example off$

from pyspark import SparkContext

def labelData(line):
        probability = [float(row) for row in line.split(' ')]
        return LabeledPoint(probability[0], probability[1:])

if __name__ == "__main__":
    #sc = SparkContext(appName="MultiClassMetricsExample")

    # Several of the methods available in scala are currently missing from pyspark
    # $example on$
    # Load training data in LIBSVM format
    NYT_data = sc.textFile("Desktop/SHEF/known_cleaned.txt")
    labeledData = NYT_data.map(labelData)
    NYT_data_unknown = sc.textFile("Desktop/SHEF/unknown_cleaned.txt")
    unknown_labeledData = NYT_data_unknown.map(labelData)
    
    training, test = labeledData.randomSplit([0.6, 0.4])
    
    training.cache()
    # Run training algorithm to build the model
    LogisticRegmodel = LogisticRegressionWithLBFGS.train(training, numClasses=4)

    # Compute raw scores on the test set
    predictions = test.map(lambda lp: (float(LogisticRegmodel.predict(lp.features)), lp.label))
    
    predictions_unknown = unknown_labeledData.map(lambda lp: (float(LogisticRegmodel.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictions)
    metrics2 = MulticlassMetrics(predictions_unknown)

    # Overall statistics
    accuracy = metrics.precision()
    accuracy_unknown = metrics2.precision()
    print("Accuracy on known data = %s" % accuracy)
    print("Accuracy on unknown data = %s" % accuracy_unknown)
    

Accuracy on known data = 0.8611111111111112
Accuracy on unknown data = 0.946524064171123


In [None]:
# KMEANS

In [31]:
from __future__ import print_function

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

def ConvertToVector(line):
    return np.array([float(x) for x in line.split(' ')])

if __name__ == "__main__":

    #sc = SparkContext(appName="PythonLogisticRegressionWithLBFGSExample")
    data = sc.textFile("Desktop/SHEF/known_cleaned.txt")
    data_unknown = sc.textFile("Desktop/SHEF/unknown_cleaned.txt")
    
    convertedVectorData = data.map(ConvertToVector)
    convertedVectorData_unknown = data_unknown.map(ConvertToVector)
    
    Kmeansmodel = KMeans.train(convertedVectorData, 4)
    Kmeansmodel_unknown = KMeans.train(convertedVectorData_unknown, 4)
    
    print("Accuracy on Known data: " + str(Kmeansmodel.computeCost(convertedVectorData)))
    print("Accuracy on unKnown data: " + str(Kmeansmodel.computeCost(convertedVectorData_unknown)))
    

Accuracy on Known data: 0.8962966822359193
Accuracy on unKnown data: 0.8962966822359193


In [None]:
# DECISION TREE

In [32]:
from __future__ import print_function

from pyspark import SparkContext
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

def labelData(line):
        probability = [float(row) for row in line.split(' ')]
        return LabeledPoint(probability[0], probability[1:])

if __name__ == "__main__":

    #sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

    NYT_data = sc.textFile("Desktop/SHEF/known_cleaned.txt")
    labeledData = NYT_data.map(labelData)
    NYT_data_unknown = sc.textFile("Desktop/SHEF/unknown_cleaned.txt")
    unknown_labeledData = NYT_data_unknown.map(labelData)
    training, test = labeledData.randomSplit([0.6, 0.4])    

    # Train a DecisionTree model.
    DecisionTreemodel = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = DecisionTreemodel.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(test.count())
    print('Mean Squared Error on known data = ' + str(testMSE))
    
    predictions = DecisionTreemodel.predict(unknown_labeledData.map(lambda x: x.features))
    labelsAndPredictions = unknown_labeledData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(unknown_labeledData.count())
    print('Mean Squared Error on unknown data = ' + str(testMSE))
    
    

Mean Squared Error on known data = 0.5691367631349948
Mean Squared Error on unknown data = 0.31517259185832314
