In [1]:
import findspark
findspark.init()
import pyspark
import random

In [None]:
sc = pyspark.SparkContext(appName="Pi")

In [None]:
num_samples = 100000000
def inside(p):     
    x, y = random.random(), random.random()
    return x*x + y*y < 1

In [None]:
count = sc.parallelize(range(0, num_samples)).filter(inside).count()
pi = 4 * count / num_samples
print(pi)
sc.stop()

Have a look at this: [link](https://stackoverflow.com/questions/32362783/how-to-change-sparkcontext-properties-in-interactive-pyspark-session):

```python
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])
```

or 


```python
SparkContext.setSystemProperty('spark.driver.maxResultSize', '10g')
```

In [2]:
from pyspark.sql import functions as F
from pyspark.context import SparkContext
SparkContext.setSystemProperty('spark.executor.memory', '16g')
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)

In [None]:
from pyspark.sql.window import Window

In [None]:


days = lambda i: i * 86400


# cSchema = StructType([StructField("WordList", ArrayType(StringType()))])

# notice extra square brackets around each element of list 
# test_list = [['Hello', 'world']], [['I', 'am', 'fine']]

# df = spark.createDataFrame(data=test_list, schema=cSchema) 

df = sqlContext.createDataFrame([(17, "2017-03-10T15:27:18+00:00"),
                        (13, "2017-03-15T12:27:18+00:00"),
                        (25, "2017-03-18T11:27:18+00:00")],
                        ["dollars", "timestampGMT"])

df = df.withColumn('timestampGMT', df.timestampGMT.cast('timestamp'))

#create window by casting timestamp to long (number of seconds)
w = (Window.orderBy(F.col("timestampGMT").cast('long')).rangeBetween(-days(7), 0))

df = df.withColumn('rolling_average', F.avg("dollars").over(w))

In [None]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD
data = [
    LabeledPoint(0.0, [0.0]),
    LabeledPoint(1.0, [1.0]),
    LabeledPoint(1.0, [2.0]),
    LabeledPoint(1.0, [3.0])
]

svm = SVMWithSGD.train(sc.parallelize(data))

In [None]:
import csv
csv_file = '/Users/sergey/Dev/Kaggle/LANL-Earthquake-Prediction/train.csv'
txt_file = '/Users/sergey/Dev/Kaggle/LANL-Earthquake-Prediction/train.txt'
with open(txt_file, "w") as my_output_file:
    with open(csv_file, "r") as my_input_file:
        [ my_output_file.write(" ".join(row)+'\n') for row in csv.reader(my_input_file)]
    my_output_file.close()

In [44]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[1], [values[0]])

In [45]:
data = sc.textFile("/Users/sergey/Dev/Kaggle/LANL-Earthquake-Prediction/train.txt")
content = data.zipWithIndex().filter(lambda kv: kv[1] > 1).keys()
parsedData = content.map(parsePoint)

In [48]:
parsedData.take(3)

[LabeledPoint(1.4690999821, [6.0]),
 LabeledPoint(1.469099981, [8.0]),
 LabeledPoint(1.4690999799, [5.0])]

In [49]:
# Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)

# Evaluate the model on training data
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds \
    .map(lambda vp: (vp[0] - vp[1])**2) \
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "/Users/sergey/Dev/Kaggle/LANL-Earthquake-Prediction/models/pythonLinearRegressionWithSGDModel")
sameModel = LinearRegressionModel.load(sc, "/Users/sergey/Dev/Kaggle/LANL-Earthquake-Prediction/models/pythonLinearRegressionWithSGDModel")

Mean Squared Error = 45.73167411668977


In [None]:
df = sqlContext.read.format("csv").options(header="true", inferschema='true').load("/Users/sergey/Dev/Kaggle/LANL-Earthquake-Prediction/train.csv")

In [None]:
df.printSchema()

In [None]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['acoustic_data'], outputCol = 'features')
v_df = vectorAssembler.transform(df)
v_df = v_df.select(['features', 'time_to_failure'])
v_df.show(3)

In [None]:
splits = v_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [None]:
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.ml.regression import LinearRegression
# lr = LinearRegressionWithSGD.train(sc.parallelize(v_df), iterations=10)
lr = LinearRegression(featuresCol = 'features', labelCol='time_to_failure', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

In [None]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD

In [None]:
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

In [None]:
lrm = LinearRegressionWithSGD.train(sc.parallelize(rdd_df.collect()), iterations=10)