In [4]:
from pyspark.sql import SparkSession, SQLContext, functions
from creds import USERNAME as UNAME
from creds import PASSWORD as PASS
import datetime
from bs4 import BeautifulSoup
import numpy as np
from random import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from joblib import dump, load
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

SCHEMA = "main_v2"
URL = "jdbc:mysql://localhost:3306/" + SCHEMA

spark = SparkSession \
    .builder \
    .appName("Database access example") \
    .config('spark.driver.extraClassPath', './mysql-connector-java-8.0.16.jar') \
    .getOrCreate()

sc = spark.sparkContext
sql_context = SQLContext(sc)
sc.setLogLevel("WARN")

def execute_query(sql_query):
  """
  Executes an arbitrary sql statement and returns the dataframe that is loaded into memory
  """
  # TODO: make generic for each type of clause in a query?
  return sql_context.read.format("jdbc").options(
      url=URL,
      user=UNAME,
      password=PASS,
      query=sql_query).load()

def get_posts(siteId, start, end):
    query = "SELECT body FROM post WHERE dateCreated BETWEEN \"{0}\" AND \"{1}\" AND siteId = {2} LIMIT 5000"
    sql_query = query.format(start, end, siteId)
    #print(sql_query)
    return execute_query(sql_query)

def format_post_data(start_date, end_date):
    print("Start Date: " + start_date.strftime("%Y-%m-%d"))
    print("End Date: " + end_date.strftime("%Y-%m-%d"))
    print("reading in data from stack overflow")
     # posts from 3d printing
    print_posts = [BeautifulSoup(p, "lxml").get_text() for p in map(lambda p: p.body, get_posts(156, start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")).collect())]
    # posts from academia
    print("reading in data from software engineering")
    academia_posts = [BeautifulSoup(p, "lxml").get_text() for p in map(lambda p: p.body, get_posts(127, start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")).collect())]
    train_data = print_posts + academia_posts
    # shuffle data so no ordering bias
    shuffle(train_data)
    print("Data size: " + str(len(train_data)))
    # labels for the data
    targets = np.array([1 if x in print_posts else 2 for x in train_data])
    return (train_data, targets)

def train_bayes():
    # pipeline for ML architecture
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),])
    # the date range to train with
    start_date = datetime.date(2019, 1, 1)
    end_date = datetime.date(2019, 4, 30)
    train_data, targets = format_post_data(start_date, end_date)
    print("Fitting model...")
    text_clf.fit(train_data, targets)
    print("Saving model...")
    dump(text_clf, "10k_so_se_bayes_model.joblib")

def test_model_bayes():
    text_clf = load("10k_so_se_bayes_model.joblib") 
    # the date range to test with
    test_start_date = datetime.date(2019, 1, 1)
    test_end_date = datetime.date(2019, 1, 31)
    test_data, test_targets = format_post_data(test_start_date, test_end_date)
    predictions = text_clf.predict(test_data)
    accuracy = np.mean(predictions == test_targets)
    print("accuracy of model: " + str(accuracy))
    print(metrics.classification_report(test_targets, predictions, target_names=["stackoverflow.com", "softwareengineering.stackexchange.com"]))
    print(metrics.confusion_matrix(test_targets, predictions))

def bayes():
    train_bayes()
    test_model_bayes()

def train_SVM():
    text_clf = Pipeline([ ('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
    start_date = datetime.date(2019, 1, 1)
    end_date = datetime.date(2019, 4, 30)
    train_data, targets = format_post_data(start_date, end_date)
    print("Fitting model...")
    text_clf.fit(train_data, targets)
    print("Saving model...")
    dump(text_clf, "10k_so_se_SVM_model.joblib")

def test_model_SVM():
    text_clf = load("10k_so_se_SVM_model.joblib") 
    # the date range to test with
    test_start_date = datetime.date(2019, 1, 1)
    test_end_date = datetime.date(2019, 1, 31)
    test_data, test_targets = format_post_data(test_start_date, test_end_date)
    predictions = text_clf.predict(test_data)
    print(metrics.classification_report(test_targets, predictions, target_names=["stackoverflow.com", "softwareengineering.stackexchange.com"]))
    print(metrics.confusion_matrix(test_targets, predictions))

def SVM():
    train_SVM()
    test_model_SVM()

bayes()
SVM()


Start Date: 2019-01-01
End Date: 2019-04-30
reading in data from stack overflow
reading in data from software engineering
Data size: 9296
Fitting model...
Saving model...
Start Date: 2019-01-01
End Date: 2019-01-31
reading in data from stack overflow
reading in data from software engineering
Data size: 6197
accuracy of model: 0.8638050669678877
                                       precision    recall  f1-score   support

                    stackoverflow.com       0.99      0.84      0.91      5001
softwareengineering.stackexchange.com       0.59      0.98      0.74      1196

                          avg / total       0.92      0.86      0.87      6197

[[4181  820]
 [  24 1172]]
Start Date: 2019-01-01
End Date: 2019-04-30
reading in data from stack overflow
reading in data from software engineering
Data size: 9296
Fitting model...
Saving model...
Start Date: 2019-01-01
End Date: 2019-01-31
reading in data from stack overflow
reading in data from software engineering
Data size: 619