<a href="https://colab.research.google.com/github/aiscience-22/UA_War/blob/machine_learning_0.03/Twitter_KMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import os
import pandas as pd
import datetime as dt
import plotly.express as px
!pip install hvplot
import hvplot.pandas
from sklearn.cluster import KMeans

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
spark_version = 'spark-3.2.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Reading package lists... Done


In [3]:
# Add drivers to Spark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [4]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://uaresources.s3.us-west-1.amazonaws.com/joined.csv"

spark.sparkContext.addFile(url)
joined_data_df = spark.read.csv(SparkFiles.get("joined.csv"), sep=",", header=True, inferSchema=True)


# Show DataFrame
joined_data_df.show(5)

+----------+--------------------+--------+------+
|      date|                text|negative|weight|
+----------+--------------------+--------+------+
|2022-08-06| the catastrophic...|    null|  null|
|2022-08-06| once again hits ...|    null|  null|
|2022-08-06|on the night of m...|    null|  null|
|2022-08-06|global food crisi...|    null|  null|
|2022-08-06| the catastrophic...|    null|  null|
+----------+--------------------+--------+------+
only showing top 5 rows



# KMeans clustering model

In [6]:
# convert spark dataframe to pandas
joined_data_df = joined_data_df.toPandas()


In [7]:
# Clean the data
# convert text to text lenght 
joined_data_df['text_length']  = joined_data_df['text'].str.len()
# Drop the column with text values
joined_data_df.drop("text", axis=1, inplace=True)
joined_data_df.head()


Unnamed: 0,date,negative,weight,text_length
0,2022-08-06,,,100
1,2022-08-06,,,49
2,2022-08-06,,,199
3,2022-08-06,,,45
4,2022-08-06,,,100


In [19]:
# Drop the null rows
joined_data_df = joined_data_df.dropna()

In [20]:
# Check data types
joined_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124484 entries, 531025 to 655708
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   date         124484 non-null  int64  
 1   negative     124484 non-null  float64
 2   weight       124484 non-null  float64
 3   text_length  124484 non-null  int64  
dtypes: float64(2), int64(2)
memory usage: 4.7 MB


In [11]:
# import datetime as dt
joined_data_df['date'] = pd.to_datetime(joined_data_df['date'])

In [16]:
# convert datetime to numerical

joined_data_df['date'] = joined_data_df['date'].map(dt.datetime.toordinal)

In [21]:
# Initializing model with K=3 (since we already know that rhere are 3 classes of iris plants)
model = KMeans(n_clusters=3, random_state=5)
model

KMeans(n_clusters=3, random_state=5)

In [22]:
# Fitting model
model.fit(joined_data_df)

KMeans(n_clusters=3, random_state=5)

In [23]:
# Get predictions
predictions = model.predict(joined_data_df)
#print(predictions)

In [24]:
#  Add a new column to the DataFrame with the predicted classes:
joined_data_df["class"] = model.labels_
joined_data_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,date,negative,weight,text_length,class
531025,738368,0.602712,0.4,73,1
531026,738368,0.21225,0.4,108,2
531027,738368,0.49013,0.4,112,2
531028,738368,0.876023,0.4,130,2
531029,738368,0.044509,0.4,128,2


In [25]:
# Create a scatterplot of joined_data_df
joined_data_df.hvplot.scatter(x="date", y="negative", by="class")
# Plotting the clusters with three features
fig = px.scatter_3d(joined_data_df, x="text_length", y="date", z="weight", color="class", symbol="class", size="negative",width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()