In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
import numpy as np
import pandas as pd

import nltk
from nltk import tokenize

In [2]:
# --- Spark application settings -------------------------------------------
# Name under which the application is registered with Spark.
application_name = "TFE"

# Master URL. "local[*]" runs Spark locally on every available core;
# use "local[n]" instead to restrict it to n cores.
master = "local[*]"

# Parallelism: how many executors to request, and how many child
# processes each executor runs.
num_executors, num_processes = 8, 2

# Overall number of parallel processes (executors x processes per executor).
num_workers = num_processes * num_executors

In [3]:
# Build the Spark configuration from the settings chosen in the previous cell.
# Every SparkConf.set() call returns the conf itself, so the calls are chained;
# the chain is left as the cell's final expression so the conf repr is displayed.
# NOTE(review): with a "local[*]" master the executor.* settings below are
# presumably not honoured (everything runs inside the driver JVM) - confirm.
conf = SparkConf()
(
    conf
    .set("spark.app.name", application_name)
    .set("spark.master", master)
    .set("spark.executor.cores", str(num_processes))
    .set("spark.executor.instances", str(num_executors))
    .set("spark.executor.memory", "42g")  # Adjust according to your requirements.
    .set("spark.locality.wait", "0")
)

<pyspark.conf.SparkConf at 0x7f3c0adcbe48>

In [4]:
# Start a SparkSession with the configuration above, or reuse one that is
# already running in this process (getOrCreate).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName(application_name)
    .getOrCreate()
)

In [5]:
# Load the cleaned dataset: the first line holds the column names and Spark
# is asked to infer the column types from the data.
# NOTE(review): the printed schema reports `_c0` as string even though
# inferSchema is enabled - possibly some quoted / multi-line fields are being
# split into extra rows; verify the row count against the source file.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../Data/cleaned_data.csv")
)

In [6]:
# Print the column names and inferred types to check how the CSV was parsed.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- preprocess: string (nullable = true)



In [7]:
# Preview the label column; the show() defaults are spelled out explicitly
# (first 20 rows, long values truncated).
df.select('type').show(n=20, truncate=True)

+--------+
|    type|
+--------+
|    fake|
|    fake|
|    fake|
|    fake|
|    fake|
|    fake|
|    fake|
|    fake|
|reliable|
|    fake|
|    fake|
|reliable|
|reliable|
|reliable|
|reliable|
|reliable|
|reliable|
|reliable|
|reliable|
|    fake|
+--------+
only showing top 20 rows



In [9]:
# Keep only the rows whose `type` label equals "fake"
# (DataFrame.filter is the method that `where` aliases).
fake = df.filter('type = "fake"')

In [10]:
# Preview the filtered rows; long cell values are truncated in the printed table.
fake.show()

+---+----+--------------------+--------------------+
|_c0|type|               title|          preprocess|
+---+----+--------------------+--------------------+
|  0|fake|Surprise: Sociali...|['headline', 'bit...|
|  1|fake|Water Cooler 1/25...|['water', 'cooler...|
|  2|fake|Veteran Commentat...|['veteran', 'comm...|
|  3|fake|Lost Words, Hidde...|['lost', 'words',...|
|  4|fake|Red Alert: Bond Y...|['red', 'alert', ...|
|  5|fake|Scientists move D...|['scientists', 'd...|
|  6|fake|Why Sandwiches Mu...|['sandwiches', 'b...|
|  7|fake|Poll: Calls for W...|['poll', 'calls',...|
|  9|fake|College Basketbal...|['college', 'bask...|
| 10|fake|Conservative Figh...|['war', 'begun', ...|
| 19|fake|Celtics vs. Laker...|['view', 'gallery...|
| 20|fake|3 Republican Wome...|['gallup', 'relea...|
| 21|fake|Sarah Sanders Is ...|['president', 'do...|
| 22|fake|CNN Kept Shouting...|['donald', 'trump...|
| 23|fake|What The WH Docto...|['physician', 'pr...|
| 24|fake|Largest Turnover ...|['obama’s', 'gr