In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder \
     .appName("Test") \
     .getOrCreate()

In [2]:
import urllib.request  # lib that handles URLs
import io
import pandas as pd
pd.options.display.max_rows=250
import numpy as np
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
plt.style.use('seaborn')
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter, FuncFormatter

import pyspark.sql.functions as func

In [3]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline, Model
from pyspark.sql.functions import col

In [4]:
df_events = spark.read.parquet("s3://bigdataproject-pr/filtered_events.parquet")
df_events.cache()

DataFrame[GlobalEventID: int, SQLDATE: int, MonthYear: int, Actor1Name: string, Actor2Name: string, EventCode: string, GoldsteinScale: double, NumMentions: int, NumSources: int, NumArticles: int, AvgTone: double, ActionGeo_CountryCode: string, ActionGeo_Fullname: string, ActionGeo_FeatureID: string, SOURCEURL: string]

In [49]:
relevant_events = df_events.where(df_events.SOURCEURL.rlike('trump'))
relevant_events = relevant_events.withColumn("AvgTone", col("AvgTone").cast("double"))

In [7]:
continuousCols = ["GoldsteinScale", "NumMentions", "NumSources", "NumArticles"]

for col in continuousCols:
    relevant_events = df_events.withColumn(col, func.col(col).cast("double"))

In [50]:
relevant_events = relevant_events.withColumn("MonthYear", func.col("MonthYear").cast("string"))
# df_events_2017 = relevant_events.where(df_events.MonthYear.like("2017%"))
# df_events_2018 = relevant_events.where(df_events.MonthYear.like("2018%"))


In [26]:
stringInputs = ["Actor1Name", "Actor2Name", "EventCode", "Actor1Geo_FullName", "Actor2Geo_FullName", 
                "ActionGeo_CountryCode","SOURCEURL"]


indexers = [ StringIndexer(inputCol=c, outputCol=c + "_IX", handleInvalid= "skip") 
            for c in stringInputs ]

encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
                 for indexer in indexers]

assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")



In [17]:
vector_assembler= VectorAssembler(
    inputCols=df_events.columns[0:-1]
    , outputCol='features'
)

In [11]:
type(df_events_2017)

pyspark.sql.dataframe.DataFrame

In [58]:
from pyspark.ml.feature import RFormula

formula = RFormula(formula = "AvgTone ~ .", featuresCol = 'features', labelCol = 'AvgTone', handleInvalid = "skip")
fittedRF = formula.fit(relevant_events)
preparedDF = fittedRF.transform(relevant_events)

KeyboardInterrupt: 

In [52]:
preparedDF.show(5)

+-------------+--------+---------+----------+----------+---------+--------------+-----------+----------+-----------+------------------+---------------------+--------------------+-------------------+--------------------+--------------------+
|GlobalEventID| SQLDATE|MonthYear|Actor1Name|Actor2Name|EventCode|GoldsteinScale|NumMentions|NumSources|NumArticles|           AvgTone|ActionGeo_CountryCode|  ActionGeo_Fullname|ActionGeo_FeatureID|           SOURCEURL|            features|
+-------------+--------+---------+----------+----------+---------+--------------+-----------+----------+-----------+------------------+---------------------+--------------------+-------------------+--------------------+--------------------+
|    672294855|20170612|   201706|     PARIS|  AMERICAN|      190|         -10.0|         72|        23|         72|-0.516946614432981|                   US|White House, Dist...|             531871|http://muscatinej...|(635245,[0,1,9,90...|
|    672294943|20170705|   201707|  

In [56]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(labelCol="AvgTone",featuresCol="features", numTrees = 5)

In [None]:
train, test = preparedDF.randomSplit([0.7, 0.3])

In [57]:
rf_model = rf.fit(train)

KeyboardInterrupt: 

In [59]:
spark.stop()