In [137]:
import findspark
import pandas as pd
findspark.init("/home/vbhamidipati1/spark")
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import lit
from pyspark import SparkConf, SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, CountVectorizer
from pyspark.ml import Pipeline
from sklearn.model_selection import train_test_split
from pyspark.ml.classification import NaiveBayes

In [6]:
spark = SparkSession \
        .builder \
        .appName("Machine Learning on Analysed YouTube Comments") \
        .getOrCreate()

In [7]:
schema = StructType([
    StructField("video_id", StringType(), True),
    StructField("positive_comments", IntegerType(), True),
    StructField("neutral_comments", IntegerType(), True),
    StructField("negative_comments", IntegerType(), True),
    StructField("total_views", IntegerType(), True),
    StructField("total_likes", IntegerType(), True),
    StructField("total_dislikes", IntegerType(), True)
])

In [8]:
# cols = ['Col1','Col2','Col1','Col2','Col1','Col2','Col1']
sentimentsDF = spark.read.option("delimiter", ",").load('/home/vbhamidipati1/spark/workspace/Project_YouTube_Sentiment_Analysis/Data/sentiment_scores/part-00000-a909e545-1199-4831-a5a9-6c2875ff8fc5-c000.csv', 
                      format='com.databricks.spark.csv', 
                      header=False, 
                      schema = schema,
                      inferSchema='true')

In [9]:
sentimentsDF.show()

+-----------+-----------------+----------------+-----------------+-----------+-----------+--------------+
|   video_id|positive_comments|neutral_comments|negative_comments|total_views|total_likes|total_dislikes|
+-----------+-----------------+----------------+-----------------+-----------+-----------+--------------+
|xPS7bqBePSs|              177|             117|                6|      76775|       5682|            37|
|dInwVhRtN4E|              171|             330|              198|   15660374|     392718|         17901|
|rn5Xgak1zzA|               67|              69|               64|     529514|      40548|           952|
|TzyraAp3jaY|               85|             139|               26|     228909|       2644|             4|
|eHq6ZA6uKOg|               63|              71|               37|     135189|       1156|           374|
|_r5eTelhpmQ|              160|             114|               15|     182342|       7712|           191|
|JkqTeQHFoBY|               64|              5

In [10]:
sentiments_polarity = sentimentsDF.withColumn("pol", lit(0))
sentiments_polarity.show()

+-----------+-----------------+----------------+-----------------+-----------+-----------+--------------+---+
|   video_id|positive_comments|neutral_comments|negative_comments|total_views|total_likes|total_dislikes|pol|
+-----------+-----------------+----------------+-----------------+-----------+-----------+--------------+---+
|xPS7bqBePSs|              177|             117|                6|      76775|       5682|            37|  0|
|dInwVhRtN4E|              171|             330|              198|   15660374|     392718|         17901|  0|
|rn5Xgak1zzA|               67|              69|               64|     529514|      40548|           952|  0|
|TzyraAp3jaY|               85|             139|               26|     228909|       2644|             4|  0|
|eHq6ZA6uKOg|               63|              71|               37|     135189|       1156|           374|  0|
|_r5eTelhpmQ|              160|             114|               15|     182342|       7712|           191|  0|
|JkqTeQHFo

In [11]:
def polarityDetermination(video_id, positive_comments, neutral_comments, negative_comments, total_views, total_likes, total_dislikes, pol):
    if positive_comments == max(positive_comments, neutral_comments, negative_comments):
        pol = 1
    elif neutral_comments == max(positive_comments, neutral_comments, negative_comments):
        pol = 0
    else:
        pol = -1
    yield (video_id, positive_comments, neutral_comments, negative_comments, total_views, total_likes, total_dislikes, pol)
        

In [12]:
# for x, y, z in zip(sentiments_with_polarity.positive_comments, sentiments_with_polarity.neutral_comments, sentiments_with_polarity.negative_comments):
#     if x == max(x, y, z):
#         sentiments_with_polarity['pol'] = 1
#     elif y == max(x, y, z):
#         sentiments_with_polarity['pol'] = 0
#     else:
#         sentiments_with_polarity['pol'] = -1
        
# sentiments_with_polarity.show()
sentiments_with_polarity = sentiments_polarity.rdd.flatMap(lambda row: 
                                                                polarityDetermination(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))


In [13]:
sentiments_with_polarity.collect()

[('xPS7bqBePSs', 177, 117, 6, 76775, 5682, 37, 1),
 ('dInwVhRtN4E', 171, 330, 198, 15660374, 392718, 17901, 0),
 ('rn5Xgak1zzA', 67, 69, 64, 529514, 40548, 952, 0),
 ('TzyraAp3jaY', 85, 139, 26, 228909, 2644, 4, 0),
 ('eHq6ZA6uKOg', 63, 71, 37, 135189, 1156, 374, 0),
 ('_r5eTelhpmQ', 160, 114, 15, 182342, 7712, 191, 1),
 ('JkqTeQHFoBY', 64, 50, 20, 431082, 4694, 43, 1),
 ('Bo-qp-Zu0OY', 11, 10, 1, 21654, 179, 1, 1),
 ('K7pQsR8WFSo', 181, 161, 58, 2300663, 110553, 1060, 1),
 ('g_ekn1gjBq0', 25, 5, 1, 16427, 395, 17, 1),
 ('4yCkkOvIkUI', 6, 11, 2, 12905, 59, 43, 0),
 ('7TN09IP5JuI', 207, 253, 40, 20785005, 869092, 8855, 0),
 ('RE-far-FvRs', 200, 235, 65, 1355210, 63437, 1225, 0),
 ('WQjO1mMCPg4', 107, 322, 71, 10417128, 307533, 29960, 0),
 ('aRgTLb5EbiQ', 337, 51, 12, 72810, 9292, 48, 1),
 ('xNddRhpx5tA', 96, 265, 139, 4393208, 109168, 2060, 0),
 ('tUXLO8Dtvq4', 47, 83, 70, 688657, 13542, 1104, 0),
 ('a7Sf_H2cFdM', 126, 155, 18, 3823167, 104584, 2979, 0),
 ('mGqR9sgMIyA', 5, 13, 7, 25652

In [14]:
polarity_scores_DF = sentiments_with_polarity.toDF(schema=['video_id', 'positive_comments', 'neutral_comments', 'negative_comments', 'total_views', 'total_likes', 'total_dislikes', 'polarity'])

In [15]:
polarity_scores_DF.show()

+-----------+-----------------+----------------+-----------------+-----------+-----------+--------------+--------+
|   video_id|positive_comments|neutral_comments|negative_comments|total_views|total_likes|total_dislikes|polarity|
+-----------+-----------------+----------------+-----------------+-----------+-----------+--------------+--------+
|xPS7bqBePSs|              177|             117|                6|      76775|       5682|            37|       1|
|dInwVhRtN4E|              171|             330|              198|   15660374|     392718|         17901|       0|
|rn5Xgak1zzA|               67|              69|               64|     529514|      40548|           952|       0|
|TzyraAp3jaY|               85|             139|               26|     228909|       2644|             4|       0|
|eHq6ZA6uKOg|               63|              71|               37|     135189|       1156|           374|       0|
|_r5eTelhpmQ|              160|             114|               15|     182342|  

In [16]:
polarity_scores_DF.toPandas()

Unnamed: 0,video_id,positive_comments,neutral_comments,negative_comments,total_views,total_likes,total_dislikes,polarity
0,xPS7bqBePSs,177,117,6,76775,5682,37,1
1,dInwVhRtN4E,171,330,198,15660374,392718,17901,0
2,rn5Xgak1zzA,67,69,64,529514,40548,952,0
3,TzyraAp3jaY,85,139,26,228909,2644,4,0
4,eHq6ZA6uKOg,63,71,37,135189,1156,374,0
...,...,...,...,...,...,...,...,...
2261,yDDDJC3AYtA,112,143,45,2508375,133946,2202,0
2262,evdxH50J3rs,60,83,17,190062,2957,114,0
2263,Y61Q5w8qdSw,111,111,78,2863622,221270,30772,1
2264,TV-_Yuc228s,93,86,21,1727464,179497,963,1


In [33]:
train_df, test_df = train_test_split(polarity_scores_DF.toPandas(), test_size=0.2)

In [34]:
train_df

Unnamed: 0,video_id,positive_comments,neutral_comments,negative_comments,total_views,total_likes,total_dislikes,polarity
2030,nFxUU519P7U,35,114,51,395189,8701,798,0
2234,dE9-DgGYZRU,109,407,84,10918069,336625,13245,0
612,k9VmfRBkUuM,72,157,27,390087,2144,318,0
1586,vOkzPTAZ8-E,100,310,90,8203797,201843,38456,0
1147,cXQUObfBXYk,94,92,14,676590,24781,276,1
...,...,...,...,...,...,...,...,...
1825,hOFRbjjjwCE,97,215,88,12246688,307694,18539,0
1557,GuEQtn2nm9A,122,116,62,246984,7631,528,1
371,3VSa-oARk-w,4,14,1,24075,189,2,0
1114,AH0f3hesKjI,99,155,46,418836,42041,559,0


In [24]:
categorical_variables = ['positive_comments', 'neutral_comments', 'negative_comments', 'total_views', 'total_likes', 'total_dislikes', 'polarity']
indexers = [StringIndexer(inputCol=column, outputCol=column+"-index") for column in categorical_variables]

In [26]:
encoder = OneHotEncoder(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
)

In [28]:
assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="categorical-features"
)

In [37]:
train_df = spark.createDataFrame(train_df)
test_df = spark.createDataFrame(test_df)

In [38]:
pipeline = Pipeline(stages=indexers + [encoder, assembler])
train_df = pipeline.fit(train_df).transform(train_df)
test_df = pipeline.fit(test_df).transform(test_df)

In [47]:
train_df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- positive_comments: long (nullable = true)
 |-- neutral_comments: long (nullable = true)
 |-- negative_comments: long (nullable = true)
 |-- total_views: long (nullable = true)
 |-- total_likes: long (nullable = true)
 |-- total_dislikes: long (nullable = true)
 |-- polarity: long (nullable = true)
 |-- positive_comments-index: double (nullable = false)
 |-- neutral_comments-index: double (nullable = false)
 |-- negative_comments-index: double (nullable = false)
 |-- total_views-index: double (nullable = false)
 |-- total_likes-index: double (nullable = false)
 |-- total_dislikes-index: double (nullable = false)
 |-- polarity-index: double (nullable = false)
 |-- polarity-index-encoded: vector (nullable = true)
 |-- total_views-index-encoded: vector (nullable = true)
 |-- total_likes-index-encoded: vector (nullable = true)
 |-- neutral_comments-index-encoded: vector (nullable = true)
 |-- negative_comments-index-encoded: vector (nullable

In [49]:
training_data = train_df.toPandas()
training_data['categorical-features'][1]

SparseVector(5689, {122: 1.0, 642: 1.0, 794: 1.0, 941: 1.0, 3613: 1.0, 4798: 1.0, 5687: 1.0})

In [53]:
indexer = StringIndexer(inputCol='polarity', outputCol='pred')
train_df = indexer.fit(train_df).transform(train_df)
test_df = indexer.fit(test_df).transform(test_df)
train_df.toPandas()['pred']

0       0.0
1       0.0
2       0.0
3       0.0
4       1.0
       ... 
1807    0.0
1808    1.0
1809    0.0
1810    0.0
1811    1.0
Name: pred, Length: 1812, dtype: float64

In [90]:
train_df = train_df.dropna()
lr = LogisticRegression(featuresCol='categorical-features', labelCol='pred')
model = lr.fit(train_df)

In [100]:
train_df.toPandas()

Unnamed: 0,video_id,positive_comments,neutral_comments,negative_comments,total_views,total_likes,total_dislikes,polarity,positive_comments-index,neutral_comments-index,...,polarity-index-encoded,total_views-index-encoded,total_likes-index-encoded,neutral_comments-index-encoded,negative_comments-index-encoded,positive_comments-index-encoded,total_dislikes-index-encoded,categorical-features,label,pred
0,nFxUU519P7U,35,114,51,395189,8701,798,0,34.0,7.0,...,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1440.0,0.0
1,dE9-DgGYZRU,109,407,84,10918069,336625,13245,0,122.0,311.0,...,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1174.0,0.0
2,k9VmfRBkUuM,72,157,27,390087,2144,318,0,41.0,188.0,...,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1353.0,0.0
3,vOkzPTAZ8-E,100,310,90,8203797,201843,38456,0,146.0,207.0,...,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1684.0,0.0
4,cXQUObfBXYk,94,92,14,676590,24781,276,1,184.0,139.0,...,"(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1153.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,hOFRbjjjwCE,97,215,88,12246688,307694,18539,0,25.0,120.0,...,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1281.0,0.0
1808,GuEQtn2nm9A,122,116,62,246984,7631,528,1,68.0,40.0,...,"(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",513.0,1.0
1809,3VSa-oARk-w,4,14,1,24075,189,2,0,1.0,78.0,...,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",120.0,0.0
1810,AH0f3hesKjI,99,155,46,418836,42041,559,0,121.0,41.0,...,"(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",326.0,0.0


In [105]:
pred = model.transform(train_df)

In [106]:
pred

DataFrame[video_id: string, positive_comments: bigint, neutral_comments: bigint, negative_comments: bigint, total_views: bigint, total_likes: bigint, total_dislikes: bigint, polarity: bigint, positive_comments-index: double, neutral_comments-index: double, negative_comments-index: double, total_views-index: double, total_likes-index: double, total_dislikes-index: double, polarity-index: double, polarity-index-encoded: vector, total_views-index-encoded: vector, total_likes-index-encoded: vector, neutral_comments-index-encoded: vector, negative_comments-index-encoded: vector, positive_comments-index-encoded: vector, total_dislikes-index-encoded: vector, categorical-features: vector, label: double, pred: double, rawPrediction: vector, probability: vector, prediction: double]

In [110]:
pred = pred.toPandas()[['video_id', 'polarity', 'prediction']]

In [129]:
pred


Unnamed: 0,video_id,polarity,prediction
0,nFxUU519P7U,0,0.0
1,dE9-DgGYZRU,0,0.0
2,k9VmfRBkUuM,0,0.0
3,vOkzPTAZ8-E,0,0.0
4,cXQUObfBXYk,1,1.0
...,...,...,...
1807,hOFRbjjjwCE,0,0.0
1808,GuEQtn2nm9A,1,1.0
1809,3VSa-oARk-w,0,0.0
1810,AH0f3hesKjI,0,0.0


In [114]:
count = 0
for polarity, prediction in zip(pred['polarity'], pred['prediction']):
    if polarity == int(prediction):
        count += 1

In [143]:
count

1772

In [145]:
print("The Accuracy of training Logistic regression Model is: ", (count/ len(polarity_scores_DF.toPandas()) * 100), "%")

The Accuracy of training Logistic regression Model is:  78.19947043248014 %
