In [205]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col, to_timestamp, udf
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pymongo import MongoClient 
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import LSTM, Dense
from statsmodels.tsa.arima.model import ARIMA
import time
import nltk


In [206]:
spark = SparkSession.builder \
    .appName("Assignment") \
    .getOrCreate()

In [207]:
schema = StructType([
    StructField("serial_no", LongType(), True),
    StructField("ids", LongType(), True),
    StructField("date", StringType(), True),
    StructField("flag", StringType(), True),
    StructField("user", StringType(), True),
    StructField("text", StringType(), True)
])

In [208]:
dataset = "ProjectTweets.csv"
df = spark.read.csv(dataset, schema=schema, header=False)

In [209]:
df.show()

+---------+----------+--------------------+--------+---------------+--------------------+
|serial_no|       ids|                date|    flag|           user|                text|
+---------+----------+--------------------+--------+---------------+--------------------+
|        0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|        1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|        2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|        3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|        4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|        5|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|        6|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|        7|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|        8

In [210]:
df.head(5)

[Row(serial_no=0, ids=1467810369, date='Mon Apr 06 22:19:45 PDT 2009', flag='NO_QUERY', user='_TheSpecialOne_', text="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"),
 Row(serial_no=1, ids=1467810672, date='Mon Apr 06 22:19:49 PDT 2009', flag='NO_QUERY', user='scotthamilton', text="is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"),
 Row(serial_no=2, ids=1467810917, date='Mon Apr 06 22:19:53 PDT 2009', flag='NO_QUERY', user='mattycus', text='@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'),
 Row(serial_no=3, ids=1467811184, date='Mon Apr 06 22:19:57 PDT 2009', flag='NO_QUERY', user='ElleCTF', text='my whole body feels itchy and like its on fire '),
 Row(serial_no=4, ids=1467811193, date='Mon Apr 06 22:19:57 PDT 2009', flag='NO_QUERY', user='Karoli', text="@nationwideclass no, it's not behaving at all. i'm mad. wh

In [211]:
df = df.drop("serial_no")

In [212]:
df.printSchema()

root
 |-- ids: long (nullable = true)
 |-- date: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



In [213]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")


In [214]:
df = df.withColumn("date", to_timestamp(col("date"), "EEE MMM dd HH:mm:ss"))

In [215]:
df.show()

+----------+-------------------+--------+---------------+--------------------+
|       ids|               date|    flag|           user|                text|
+----------+-------------------+--------+---------------+--------------------+
|1467810369|1970-04-06 22:19:45|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|1467810672|1970-04-06 22:19:49|NO_QUERY|  scotthamilton|is upset that he ...|
|1467810917|1970-04-06 22:19:53|NO_QUERY|       mattycus|@Kenichan I dived...|
|1467811184|1970-04-06 22:19:57|NO_QUERY|        ElleCTF|my whole body fee...|
|1467811193|1970-04-06 22:19:57|NO_QUERY|         Karoli|@nationwideclass ...|
|1467811372|1970-04-06 22:20:00|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|1467811592|1970-04-06 22:20:03|NO_QUERY|        mybirch|         Need a hug |
|1467811594|1970-04-06 22:20:03|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|1467811795|1970-04-06 22:20:05|NO_QUERY|2Hood4Hollywood|@Tatiana_K nope t...|
|1467812025|1970-04-06 22:20:09|NO_QUERY|        mim

In [216]:
df = df.drop("ids", "flag", "user")

In [217]:
df.show()

+-------------------+--------------------+
|               date|                text|
+-------------------+--------------------+
|1970-04-06 22:19:45|@switchfoot http:...|
|1970-04-06 22:19:49|is upset that he ...|
|1970-04-06 22:19:53|@Kenichan I dived...|
|1970-04-06 22:19:57|my whole body fee...|
|1970-04-06 22:19:57|@nationwideclass ...|
|1970-04-06 22:20:00|@Kwesidei not the...|
|1970-04-06 22:20:03|         Need a hug |
|1970-04-06 22:20:03|@LOLTrish hey  lo...|
|1970-04-06 22:20:05|@Tatiana_K nope t...|
|1970-04-06 22:20:09|@twittera que me ...|
|1970-04-06 22:20:16|spring break in p...|
|1970-04-06 22:20:17|I just re-pierced...|
|1970-04-06 22:20:19|@caregiving I cou...|
|1970-04-06 22:20:19|@octolinz16 It it...|
|1970-04-06 22:20:20|@smarrison i woul...|
|1970-04-06 22:20:20|@iamjazzyfizzle I...|
|1970-04-06 22:20:22|Hollis' death sce...|
|1970-04-06 22:20:25|about to file taxes |
|1970-04-06 22:20:31|@LettyA ahh ive a...|
|1970-04-06 22:20:34|@FakerPattyPattz ...|
+----------

In [218]:
df.head(5)

[Row(date=datetime.datetime(1970, 4, 6, 22, 19, 45), text="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"),
 Row(date=datetime.datetime(1970, 4, 6, 22, 19, 49), text="is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"),
 Row(date=datetime.datetime(1970, 4, 6, 22, 19, 53), text='@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'),
 Row(date=datetime.datetime(1970, 4, 6, 22, 19, 57), text='my whole body feels itchy and like its on fire '),
 Row(date=datetime.datetime(1970, 4, 6, 22, 19, 57), text="@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. ")]

In [219]:
# Defining functions for storage time comparison
def store_data_in_spark_sql(df):
    start_time_sql = time.time()
    df.createOrReplaceTempView("tweets")
    end_time_sql = time.time()
    return end_time_sql - start_time_sql

In [220]:
try:
    client = MongoClient('localhost', 27017)
    db = client.admin
    server_status = db.command("serverStatus")
    print("MongoDB is running.")
except Exception as e:
    print("MongoDB is not running or cannot be accessed:", e)

MongoDB is running.


In [221]:
def store_data_in_mongodb(df):
    start_time_mongodb = time.time()
    client = MongoClient('localhost', 27017)
    db = client['twitter_db']
    collection = db['twitter_collection']
    pandas_df = df.toPandas()
    data_dict = pandas_df.to_dict(orient='records')
    
    collection.insert_many(data_dict)
    end_time_mongodb = time.time()
    return end_time_mongodb - start_time_mongodb

In [222]:
time_spark_sql = store_data_in_spark_sql(df)

In [223]:
time_mongodb = store_data_in_mongodb(df)

In [224]:
print("Time taken to store data in Spark SQL:", time_spark_sql, "seconds")

Time taken to store data in Spark SQL: 0.005994558334350586 seconds


In [225]:
print("Time taken to store data in MongoDB:", time_mongodb, "seconds")

Time taken to store data in MongoDB: 80.25701999664307 seconds


In [226]:
df1 = df

In [227]:
df.show()

+-------------------+--------------------+
|               date|                text|
+-------------------+--------------------+
|1970-04-06 22:19:45|@switchfoot http:...|
|1970-04-06 22:19:49|is upset that he ...|
|1970-04-06 22:19:53|@Kenichan I dived...|
|1970-04-06 22:19:57|my whole body fee...|
|1970-04-06 22:19:57|@nationwideclass ...|
|1970-04-06 22:20:00|@Kwesidei not the...|
|1970-04-06 22:20:03|         Need a hug |
|1970-04-06 22:20:03|@LOLTrish hey  lo...|
|1970-04-06 22:20:05|@Tatiana_K nope t...|
|1970-04-06 22:20:09|@twittera que me ...|
|1970-04-06 22:20:16|spring break in p...|
|1970-04-06 22:20:17|I just re-pierced...|
|1970-04-06 22:20:19|@caregiving I cou...|
|1970-04-06 22:20:19|@octolinz16 It it...|
|1970-04-06 22:20:20|@smarrison i woul...|
|1970-04-06 22:20:20|@iamjazzyfizzle I...|
|1970-04-06 22:20:22|Hollis' death sce...|
|1970-04-06 22:20:25|about to file taxes |
|1970-04-06 22:20:31|@LettyA ahh ive a...|
|1970-04-06 22:20:34|@FakerPattyPattz ...|
+----------

In [228]:
analyzer = SentimentIntensityAnalyzer()

In [229]:
def get_sentiment(text):
    sentiment_score = analyzer.polarity_scores(text)
    return 'positive' if sentiment_score['compound'] >= 0 else 'negative'


In [230]:
sentiment_udf = udf(lambda text: get_sentiment(text), StringType())
df = df.withColumn("sentiment", sentiment_udf(df["text"]))
