# Input Data

In [2]:
inputDF = spark.read.load("/data/students/bigdata_internet/lab4/log_tcp_complete_classes.txt",
                        format="csv",
                        header=True,
                        inferSchema=True,
                        sep=' ')

print("number of columns in the file = ",len(inputDF.columns))
print("Number of connection log = " , inputDF.count())
# inputDF.printSchema()

number of columns in the file =  207
Number of connection log =  100000


# Classify TCP Connection

In [2]:
classesDF = inputDF.select("class:207")
print("Number of classes = ",classesDF.distinct().count())
classesDF.groupBy("class:207").count().show()

Number of classes =  10
+---------------+-----+
|      class:207|count|
+---------------+-----+
|   class:google|10000|
|   class:amazon|10000|
|class:instagram|10000|
| class:facebook|10000|
|  class:netflix|10000|
|     class:ebay|10000|
|  class:spotify|10000|
| class:linkedin|10000|
|  class:youtube|10000|
|     class:bing|10000|
+---------------+-----+



## Split the data 

In [3]:
trainDF,testDF = inputDF.randomSplit([0.75,0.25])
print("Training set size: ",trainDF.count())
print("Testing set size: ",testDF.count())

Training set size:  75072
Testing set size:  24928


## Pre-process the dataset

In [4]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoderEstimator
# transform the labels to indexs
labelIndexer = StringIndexer(inputCol="class:207", outputCol="service",handleInvalid="keep")
labelIndexerModel = labelIndexer.fit(trainDF)
processedTrainDF=labelIndexerModel.transform(trainDF)
processedTestDF = labelIndexerModel.transform(testDF)

#trans form categorical features to indexs
feature1Indexer = StringIndexer(inputCol="con_t:42", outputCol="connectionType",handleInvalid="keep")
feature1IndexerModel = feature1Indexer.fit(processedTrainDF)
processedTrainDF=feature1IndexerModel.transform(processedTrainDF)
processedTestDF=feature1IndexerModel.transform(processedTestDF)
feature2Indexer = StringIndexer(inputCol="http_t:44", outputCol="httpType",handleInvalid="keep")
feature2IndexerModel = feature2Indexer.fit(processedTrainDF)
processedTrainDF=feature2IndexerModel.transform(processedTrainDF)
processedTestDF=feature2IndexerModel.transform(processedTestDF)

va=VectorAssembler(inputCols=["s_bytes_uniq:21","durat:31","connectionType","httpType","c_rtt_avg:45","s_rtt_avg:52","s_pkts_data_avg:197",
                             "c_pkts_push:114","s_pkts_push:115","c_msgsize_count:131","s_msgsize_count:142","c_pkts_data_std:195",
                             "s_pkts_data_std:198","c_sit_std:201","s_sit_std:204"],
                    outputCol="features")

processedTrainDF=va.transform(processedTrainDF)
processedTestDF=va.transform(processedTestDF)

## Train at least two different models

### Train the random forest classifier

In [5]:
from pyspark.ml.classification import RandomForestClassifier
import timeit
rf = RandomForestClassifier(labelCol="service",
                            featuresCol="features",numTrees=20)

start = timeit.default_timer()
rfModel=rf.fit(processedTrainDF)
stop = timeit.default_timer()
print('Random forest training time(s): ', stop - start)  

rfFinalTrainDF=rfModel.transform(processedTrainDF)
rfFinalTestDF=rfModel.transform(processedTestDF)

Random forest training time(s):  4.722044946043752


### Train the decision tree classifier

In [12]:
from pyspark.ml.classification import DecisionTreeClassifier
# Train a DecisionTree model
dt = DecisionTreeClassifier(labelCol="service",featuresCol="features")
start = timeit.default_timer()
dtModel=dt.fit(processedTrainDF)
stop = timeit.default_timer()
print('Random forest training time(s): ', stop - start)  

dtFinalTrainDF=dtModel.transform(processedTrainDF)
dtFinalTestDF=dtModel.transform(processedTestDF)

Random forest training time(s):  2.5945384330116212


## Evaluate the performance of the models

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
myEvaluator1 =MulticlassClassificationEvaluator(labelCol="service",predictionCol="prediction",metricName='accuracy')
myEvaluator2 =MulticlassClassificationEvaluator(labelCol="service",predictionCol="prediction",metricName='f1')
print("Evalutation of random forest:")
print("Accuracy on training is ", myEvaluator1.evaluate(rfFinalTrainDF))
print("F1 measure on training is ", myEvaluator2.evaluate(rfFinalTrainDF))
print("Accuracy on test is ", myEvaluator1.evaluate(rfFinalTestDF))
print("F1 measure on test is ", myEvaluator2.evaluate(rfFinalTestDF))
print("Evaluation of decision tree")
print("Accuracy on training is ", myEvaluator1.evaluate(dtFinalTrainDF))
print("F1 measure on training is ", myEvaluator2.evaluate(dtFinalTrainDF))
print("Accuracy on test is ", myEvaluator1.evaluate(dtFinalTestDF))
print("F1 measure on test is ", myEvaluator2.evaluate(dtFinalTestDF))

Evalutation of random forest:
Accuracy on training is  0.6558637041773231
F1 measure on training is  0.6541723859860844
Accuracy on test is  0.6573732349165597
F1 measure on test is  0.6557234069448274
Evaluation of decision tree
Accuracy on training is  0.558490515771526
F1 measure on training is  0.5540048518808488
Accuracy on test is  0.5568838254172015
F1 measure on test is  0.5528345011507971


## Tune the parameters of the models

### Tuning of random forest

In [9]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
import numpy
rf = RandomForestClassifier(labelCol="service",featuresCol="features")
paramGrid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [10,2,20]) \
    .addGrid(rf.impurity, ["Gini","Entropy"])\
    .addGrid(rf.minInstancesPerNode,[5,5,30])\
    .addGrid(rf.numTrees, [1,10,30])\
    .build()
myEvaluator =MulticlassClassificationEvaluator(labelCol="service",predictionCol="prediction",
                                               metricName="accuracy")
cv=CrossValidator(estimator=rf,evaluator=myEvaluator,estimatorParamMaps=paramGrid, numFolds=3)
cvModel=cv.fit(processedTrainDF)
rfFinalTrainDF=cvModel.transform(processedTrainDF)
rfFinalTestDF=cvModel.transform(processedTestDF)
cvModel.getEstimatorParamMaps()[numpy.argmax(cvModel.avgMetrics)]

{Param(parent='RandomForestClassifier_6796dcbc3599', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 20,
 Param(parent='RandomForestClassifier_6796dcbc3599', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'Entropy',
 Param(parent='RandomForestClassifier_6796dcbc3599', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 5,
 Param(parent='RandomForestClassifier_6796dcbc3599', name='numTrees', doc='Number of trees to train (>= 1).'): 30}

### Tuning of decision tree

In [21]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
import numpy
dt = DecisionTreeClassifier(labelCol="service",featuresCol="features")
paramGriddt = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [10,15,20]) \
    .addGrid(rf.impurity, ["Gini","Entropy"])\
    .addGrid(rf.minInstancesPerNode,[5,8,10])\
    .build()
myEvaluator =MulticlassClassificationEvaluator(labelCol="service",predictionCol="prediction",
                                               metricName="accuracy")
cv2=CrossValidator(estimator=dt,evaluator=myEvaluator,estimatorParamMaps=paramGriddt, numFolds=3)
cv2Model=cv2.fit(processedTrainDF)
dtFinalTrainDF=cv2Model.transform(processedTrainDF)
dtFinalTestDF=cv2Model.transform(processedTestDF)
cv2Model.getEstimatorParamMaps()[numpy.argmax(cv2Model.avgMetrics)]

{Param(parent='RandomForestClassifier_cd916a51ea2f', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,
 Param(parent='RandomForestClassifier_cd916a51ea2f', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'Gini',
 Param(parent='RandomForestClassifier_cd916a51ea2f', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 5}

## Return the best possible model and estimate its performance on new data

### Performance of the besst possible model of random forest classifier 

In [10]:
rfOptimal = RandomForestClassifier(labelCol="service",featuresCol="features",numTrees=30,maxDepth=20,
                                   impurity="Entropy",minInstancesPerNode=5)
rfOptimalModel =rfOptimal.fit(processedTrainDF)
rfFinalTrainDF=rfOptimalModel.transform(processedTrainDF)
rfFinalTestDF=rfOptimalModel.transform(processedTestDF)
print("Evalutation of random forest:")
print("Accuracy on training is ", myEvaluator1.evaluate(rfFinalTrainDF))
print("F1 measure on training is ", myEvaluator2.evaluate(rfFinalTrainDF))
print("Accuracy on test is ", myEvaluator1.evaluate(rfFinalTestDF))
print("F1 measure on test is ", myEvaluator2.evaluate(rfFinalTestDF))

Evalutation of random forest:
Accuracy on training is  0.9499712708954127
F1 measure on training is  0.9499347149912925
Accuracy on test is  0.9126097842069706
F1 measure on test is  0.9125492365870005


### Performance of the besst possible model of decision tree classifier 

In [22]:
dtOptimal = DecisionTreeClassifier(labelCol="service",featuresCol="features",maxDepth=10,
                                   impurity="Gini",minInstancesPerNode=5)
dtOptimalModel =dtOptimal.fit(processedTrainDF)
dtFinalTrainDF=dtOptimalModel.transform(processedTrainDF)
dtFinalTestDF=dtOptimalModel.transform(processedTestDF)
print("Evalutation of neural network:")
print("Accuracy on training is ", myEvaluator1.evaluate(dtFinalTrainDF))
print("F1 measure on training is ", myEvaluator2.evaluate(dtFinalTrainDF))
print("Accuracy on test is ", myEvaluator1.evaluate(dtFinalTestDF))
print("F1 measure on test is ", myEvaluator2.evaluate(dtFinalTestDF))

Evalutation of neural network:
Accuracy on training is  0.8009111253196931
F1 measure on training is  0.8026785345361586
Accuracy on test is  0.7901957637997432
F1 measure on test is  0.7922922855780302


# Cluster users(Bonus Task)

In [24]:
ipAllDF = inputDF.select("#31#c_ip:1")
clientNum = ipAllDF.distinct().count()
print("Answer to question: ")
print("Total number of clients in the file is: ",clientNum)
print("Average number of connections: ",ipAllDF.count()/clientNum)

Answer to question: 
Total number of clients in the file is:  3844
Average number of connections:  26.014568158168576


In [60]:
# compute statics of features
TCPNumDF = inputDF.selectExpr("`#31#c_ip:1` AS key").groupBy("key").count()
sumBytesDF = inputDF.selectExpr("`#31#c_ip:1` AS key" ,"`c_bytes_all:9`AS c_bytes_all","`s_bytes_all:23` AS s_bytes_all","`s_bytes_retx:25` AS s_bytes_retx")\
            .groupBy("key").sum("c_bytes_all","s_bytes_all","s_bytes_retx")
avgBytesDF = inputDF.selectExpr("`#31#c_ip:1` AS key","`s_rtt_avg:52` AS s_rtt_avg","`s_first:33` AS s_first").groupBy("key")\
                .avg("s_rtt_avg","s_first") 
featuresByIPDF = TCPNumDF.join(sumBytesDF,"key").join(avgBytesDF,"key")     
featuresByIPDF.show(5)

+---------------+-----+----------------+----------------+-----------------+------------------+------------------+
|            key|count|sum(c_bytes_all)|sum(s_bytes_all)|sum(s_bytes_retx)|    avg(s_rtt_avg)|      avg(s_first)|
+---------------+-----+----------------+----------------+-----------------+------------------+------------------+
|  156.60.18.189|   95|           90794|          561767|            18017| 68.93080024210525|111.40607368421053|
|   246.25.87.44|    1|            1996|            6395|                0|         29.023549|            47.214|
|254.222.227.249|   20|          100315|           93481|                5| 31.46155785000001|         727.81405|
|180.102.208.155|   36|          435819|        98356251|           468531|20.410008222222224| 135.3493611111111|
|  180.102.5.237|   81|          136791|         1153915|            66095| 75.14003027160494|246.04407407407408|
+---------------+-----+----------------+----------------+-----------------+-------------

In [61]:
# prerpocess the data
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
va=VectorAssembler(inputCols=["count","sum(c_bytes_all)","sum(s_bytes_all)","sum(s_bytes_retx)",
                              "avg(s_rtt_avg)","avg(s_first)"],
                    outputCol="features")
assembledDF=va.transform(featuresByIPDF)
scaler = StandardScaler(inputCol="features",
                    outputCol="scaledFeatures", withStd=True, withMean=True)
scalerModel = scaler.fit(assembledDF)
scaledDF=scalerModel.transform(assembledDF)
scaledDF.show(5)

+---------------+-----+----------------+----------------+-----------------+------------------+------------------+--------------------+--------------------+
|            key|count|sum(c_bytes_all)|sum(s_bytes_all)|sum(s_bytes_retx)|    avg(s_rtt_avg)|      avg(s_first)|            features|      scaledFeatures|
+---------------+-----+----------------+----------------+-----------------+------------------+------------------+--------------------+--------------------+
|  156.60.18.189|   95|           90794|          561767|            18017| 68.93080024210525|111.40607368421053|[95.0,90794.0,561...|[1.49221510349479...|
|   246.25.87.44|    1|            1996|            6395|                0|         29.023549|            47.214|[1.0,1996.0,6395....|[-0.5410869427997...|
|254.222.227.249|   20|          100315|           93481|                5| 31.46155785000001|         727.81405|[20.0,100315.0,93...|[-0.1301003589742...|
|180.102.208.155|   36|          435819|        98356251|       

In [66]:
# Cluster with K-means and Evaluate
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
kmeans = KMeans(k=10,featuresCol="scaledFeatures",initMode="k-means||")
model = kmeans.fit(scaledDF)
predictionsDF = model.transform(scaledDF)

centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
print("Size of the clusters: ", model.summary.clusterSizes)
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictionsDF)
print("Silhouette with squared euclidean distance = " + str(silhouette))
print("SSE: ",model.computeCost(predictionsDF))

Cluster Centers: 
[-0.30493536 -0.03539213 -0.13581139 -0.12737719 -0.09785649 -0.13895952]
[ 5.92068567  0.17778297  0.1846758   0.30958307 -0.02271687  0.16099715]
[-0.0840172  -0.03619209 -0.14202217 -0.07738945  0.15869617  6.325802  ]
[-0.2598856  -0.04104792 -0.15716115 -0.15292926 37.53320366  5.64464997]
[ 1.27590638  0.096142   21.75941001  8.81227512 -0.17932665 -0.20520205]
[ 1.55092747  0.1442749   8.76731777 18.71750646  0.06336384  0.18616852]
[ 0.08491052  0.13849213 -0.09556905  0.01399882  4.74066284  1.02361115]
[ 1.08973903  0.0581222   4.23515868  2.73132747  0.03676071 -0.05599182]
[1.09326835 0.17180427 0.07242403 0.11079759 0.03765219 0.09164826]
[-0.51945607 -0.04294102 -0.16679048 -0.15299766  0.18195395 28.9720216 ]
Size of the clusters:  [3101, 48, 46, 1, 3, 7, 47, 61, 529, 1]
Silhouette with squared euclidean distance = -0.9327856315897602
SSE:  8621.659083725388


In [68]:
# Cluster with GMM and Evaluate
from pyspark.ml.clustering import GaussianMixture
# Trains a GMM model.
gmm = GaussianMixture(k=10,featuresCol="scaledFeatures")
modelGMM = gmm.fit(scaledDF)
# Make predictions
predictionsDFGMM = modelGMM.transform(scaledDF)

print("Size of the clusters: ", modelGMM.summary.clusterSizes)
# Evaluate clustering by computing Silhouette score
evaluatorGMM = ClusteringEvaluator()
silhouette = evaluatorGMM.evaluate(predictionsDFGMM)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Size of the clusters:  [48, 11, 370, 431, 1663, 77, 16, 135, 986, 107]
Silhouette with squared euclidean distance = 0.13511967145979883
