## Install Packages

In [1]:
# pyspark

!pip install pyspark



In [2]:
!pip install -U textblob



In [3]:
!pip install tensorflow



In [4]:
!pip install gensim



In [5]:
!pip install wordcloud



In [6]:
!pip install plotly



In [7]:
!pip install statsmodels



In [8]:
!pip install scikit-learn



In [9]:
!pip install statsmodels



## Import Libraries

In [10]:
################################################### spark #################################################################

from pyspark.sql import SparkSession

################################################### base #################################################################

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

################################################### nlp #################################################################

import numpy as np

import gensim
import nltk
import re
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

################################################### time series #################################################################



from statsmodels.tsa.seasonal import seasonal_decompose
from nltk.corpus import stopwords as nltk_stopwords

###################################################  ml #################################################################

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional,Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

################################################# confusion matrix #################################################################

from sklearn.metrics import confusion_matrix

## Viewing Data

In [73]:
def load_and_process_data(spark, hdfs_path):
    """Read the tweets CSV into a Spark DataFrame and apply the project column names.

    The first six columns are renamed *positionally* to ``number``,
    ``id_tweet``, ``date``, ``query``, ``user_id`` and ``tweet``; any further
    columns keep their original names. The renamed data is registered as the
    temporary view ``CA2_ProjectTweets``, the first five rows are displayed,
    and the DataFrame selected from that view is returned.

    Parameters
    ----------
    spark : SparkSession
        Session used both to read the file and to run the SQL query.
    hdfs_path : str
        Path handed to the CSV reader.

    Returns
    -------
    DataFrame or None
        The renamed DataFrame, or ``None`` if the file has fewer than six
        columns or any exception was raised (a message is printed in both
        cases, so callers must check for ``None``).
    """
    new_column_names = ["number", "id_tweet", "date", "query", "user_id", "tweet"]

    try:
        # NOTE(review): header=true consumes the first line of the file as
        # column names, which are then overwritten below. If the CSV has no
        # header row, its first record is silently lost -- confirm against
        # the raw file.
        data = spark.read.option("header", "true").csv(hdfs_path)

        existing_names = list(data.columns)
        if len(existing_names) < len(new_column_names):
            print(
                "Data not loaded successfully: expected at least",
                len(new_column_names),
                "columns but found",
                len(existing_names),
            )
            return None

        # Rename by position in a single step. Renaming one column at a time
        # by *name* goes wrong when an incoming column already carries one of
        # the target names, because the later rename then matches both.
        extra_names = existing_names[len(new_column_names):]
        data = data.toDF(*new_column_names, *extra_names)

        # Register the view so the data can also be queried with Spark SQL
        data.createOrReplaceTempView("CA2_ProjectTweets")

        # Run the query through the same SparkSession
        result = spark.sql("SELECT * FROM CA2_ProjectTweets")

        # Display the first 5 rows
        result.show(5)

        return result

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None
        print("An error occurred:", str(e))
        return None




## Create Spark

In [74]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName('ProjectTweets').getOrCreate()

# Path to the tweets CSV. Despite the variable name, this looks like a local
# filesystem path rather than an HDFS URI.
hdfs_path = "/content/ProjectTweets.csv"

# Load the CSV, rename its columns and preview the first rows
tweets = load_and_process_data(spark, hdfs_path)

# Check if the result is not None before attempting further operations
if tweets is not None:
    # Display the schema of the DataFrame
    tweets.printSchema()

    # NOTE: the Spark session is deliberately left running. `tweets` is a
    # lazily evaluated Spark DataFrame that later cells still use (for
    # example `tweets.toPandas()`), and those calls fail once the session has
    # been stopped. Call `spark.stop()` only after the last Spark operation.
else:
    print("Data loading and processing failed.")


+------+----------+--------------------+--------+-------------+--------------------+
|number|  id_tweet|                date|   query|      user_id|               tweet|
+------+----------+--------------------+--------+-------------+--------------------+
|     1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|scotthamilton|is upset that he ...|
|     2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|     mattycus|@Kenichan I dived...|
|     3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|      ElleCTF|my whole body fee...|
|     4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|       Karoli|@nationwideclass ...|
|     5|1467811372|Mon Apr 06 22:20:...|NO_QUERY|     joy_wolf|@Kwesidei not the...|
+------+----------+--------------------+--------+-------------+--------------------+
only showing top 5 rows

root
 |-- number: string (nullable = true)
 |-- id_tweet: string (nullable = true)
 |-- date: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- tweet: string (nullable = true)

## Show Column Names

In [75]:
# List the column names of the loaded DataFrame
tweets.columns

['number', 'id_tweet', 'date', 'query', 'user_id', 'tweet']

## Show Schema

In [76]:
# Print the schema: each column's name, type and nullability
tweets.printSchema()

root
 |-- number: string (nullable = true)
 |-- id_tweet: string (nullable = true)
 |-- date: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- tweet: string (nullable = true)



## Show Dtypes

In [77]:
# Show each column's name together with its data type
tweets.dtypes

[('number', 'string'),
 ('id_tweet', 'string'),
 ('date', 'string'),
 ('query', 'string'),
 ('user_id', 'string'),
 ('tweet', 'string')]

## Add column names and convert to pandas

In [37]:
# Convert the Spark DataFrame to pandas only once: `tweets` is rebound here,
# so without this guard re-running the cell would call .toPandas() on a
# pandas DataFrame and fail.
if not isinstance(tweets, pd.DataFrame):
    tweets = tweets.toPandas()

# Column names used by the rest of the notebook
tweets.columns = ['target', 'id', 'date', 'query', 'username', 'content']

# Spark read every CSV column as a string, so restore the numeric dtypes
# explicitly instead of carrying numeric columns around as text.
tweets = tweets.astype({'target': 'int32', 'id': 'int64'})
tweets.head()

Unnamed: 0,target,id,date,query,username,content
0,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,5,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


## Dtypes

In [39]:
# Per-column dtypes of the pandas DataFrame
tweets.dtypes

target       int32
id           int64
date        object
query       object
username    object
content     object
dtype: object

## Checking Missing Values

In [40]:
# Count the missing values in each column
tweets.isna().sum()

target      0
id          0
date        0
query       0
username    0
content     0
dtype: int64