In [1]:
import os

import pyspark
from nltk.sentiment import SentimentIntensityAnalyzer
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF, CountVectorizer, Normalizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, DoubleType, StringType, StructType, StructField

# Data Retrieval

In [2]:
# Location of the scraped tweet CSV. Set the TWITTER_DATA_FILE environment
# variable to read a copy stored elsewhere; without it the original path is used.
data_file = os.environ.get(
    "TWITTER_DATA_FILE",
    r"/home/jovyan/repos/distributed-sentiment-analysis-on-twitter-data/twitter_scraper/twitter_data_final.csv",
)
# Single analyzer instance shared by the nltk_prediction / nltk_score functions.
vader_analyzer = SentimentIntensityAnalyzer()

In [3]:
# Spark resource settings, applied to the session built below.
conf = pyspark.SparkConf().setAll([
    ('spark.executor.memory', '2g'),
    ('spark.executor.cores', '3'),  # 4
    ('spark.cores.max', '3'),  # 4
    ('spark.driver.memory', '4g'),
])

# Create the Spark session, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("SentimentAnalysis")
    .config(conf=conf)
    .getOrCreate()
)

# Handle on the underlying SparkContext
sc = spark.sparkContext

In [4]:
# Display the effective Spark configuration as (key, value) pairs.
sc.getConf().getAll()

[('spark.executor.memory', '2g'),
 ('spark.driver.memory', '4g'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'local-1524036857806'),
 ('spark.cores.max', '3'),
 ('spark.driver.port', '39319'),
 ('spark.driver.host', 'de4f1c03e850'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.cores', '3'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.name', 'SentimentAnalysis'),
 ('spark.ui.showConsoleProgress', 'true')]

In [5]:
# Schema (format/structure) of the twitter CSV file: the columns below, in file
# order, each read as a nullable string.
twitter_data_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "date_str",
        "tweet_id",
        "text",
        "location",
        "user_id",
        "user_name",
        "user_location",
        "user_url",
        "user_description",
        "place_id",
        "place_url",
        "place_type",
        "place_countrycode",
        "place_country",
        "place_boundingboxtype",
        "entities_hashtags",
        "entities_urls",
        "entities_mentions",
        "entities_symbols",
        "entities_media",
        "entities_polls",
    )
])

In [6]:
# Read the tweet CSV with the explicit all-string schema instead of inferring one.
df_raw = spark.read.schema(twitter_data_schema).csv(data_file)

In [7]:
# Only the tweet text and its location are used from here on.
text_and_loc_only = df_raw.select(["text", "location"])

In [8]:
# Mark the frame for caching, then count its rows; the count is the action
# that actually evaluates (and so caches) the data.
text_and_loc_only.cache().count()

156011

# Tweet Cleaning Function

In [9]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
# Tokenizer instance used by tweet_cleaner to split the cleaned text into words.
tok = WordPunctTokenizer()

# Regular expressions and lookup table used by tweet_cleaner.
at_user_pat = r'@[A-Za-z0-9_]+'  # r'@[\w]+'
url_pat = r'https?://[^ ]+'  # r'https?:\/\/[^\s]+'
# The dot is escaped so that only a literal "www." prefix is treated as a URL.
# Unescaped, "." matched any character, so text such as "awwww yes" was
# rewritten to "awURL" and the following word was lost.
www_pat = r'www\.[^ ]+'
# A letter repeated two or more times; tweet_cleaner collapses the run to two.
repeating_chars_pat = r'([A-Za-z])\1+'
# Contracted negations and their expanded forms (keys are lower case).
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# Matches any key of negations_dic as a whole word.
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner(text):
    """Normalize one raw tweet into a lower-case, letters-only string.

    Steps, in order: strip markup with BeautifulSoup, replace @mentions with
    ``USERNAME`` and links with ``URL``, collapse letter runs to two letters,
    lower-case, expand the contractions in ``negations_dic``, blank out every
    non-letter, and drop one-character tokens.

    Args:
        text: Raw tweet text, or None.

    Returns:
        The cleaned tweet as space-separated words; "" when ``text`` is None.
    """
    # A missing tweet has nothing to clean, and the regex calls below would
    # raise TypeError on None.
    if text is None:
        return ""
    souped = BeautifulSoup(text, 'lxml').get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except (AttributeError, UnicodeError):
        # str has no .decode() on Python 3 (AttributeError) and the call can
        # fail on undecodable text (UnicodeError); keep the text as it is.
        bom_removed = souped
    stripped = re.sub(at_user_pat, 'USERNAME', bom_removed)
    stripped = re.sub(url_pat, 'URL', stripped)
    stripped = re.sub(www_pat, 'URL', stripped)
    stripped = re.sub(repeating_chars_pat, r'\1\1', stripped)

    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # Replacing non-letters with spaces leaves unnecessary whitespace behind;
    # tokenizing and re-joining removes it (and drops one-character tokens).
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

# Data Pre-Processing

In [10]:
# Keep rows whose location ends with a comma followed by exactly three
# characters (each "_" in a SQL LIKE pattern matches one character),
# which is the shape of values such as "Waco, TX".
df_filtered = text_and_loc_only.where(col("location").like("%,___"))
df_filtered.show(truncate=True)

+--------------------+------------------+
|                text|          location|
+--------------------+------------------+
|:thumbs_up::red_h...|     San Diego, CA|
|USER_NAME Thank y...|          Waco, TX|
|I am fucking cryi...|         Savoy, IL|
|USER_NAME Get rid...|  Gaithersburg, MD|
|My dog made me so...|       Baytown, TX|
| Good kill USER_NAME|   Los Angeles, CA|
|USER_NAME BEAUTIF...|    Valparaiso, IN|
|I wanna play bask...|       Houston, TX|
|My comfort zone :...|         Hobbs, NM|
|people in flint, ...|Pembroke Pines, FL|
|Looking through o...|    San Marcos, TX|
|USER_NAME does ha...|       Kenosha, WI|
|USER_NAME USER_NA...|     Livermore, CA|
|Ryan was wonderin...|       Benicia, CA|
|you truly are a p...|         Boise, ID|
|So if anyone’s wa...|    Marysville, WA|
|Why am I dying to...|       Hampton, GA|
|      I miss him URL|      Pasadena, TX|
|And now I truly f...|   Hattiesburg, MS|
|YOU Inspire ME :h...| Jeffersontown, KY|
+--------------------+------------

In [11]:
# Number of tweets that pass the location filter.
df_filtered.count()

96958

In [12]:
# Wrap the cleaner as a string-returning Spark UDF and replace the "text"
# column with its cleaned version.
udf_tweet_cleaner = udf(tweet_cleaner, StringType())
df_preprocessed = df_filtered.withColumn("text", udf_tweet_cleaner(df_filtered["text"]))
df_preprocessed.show(truncate=True)

+--------------------+------------------+
|                text|          location|
+--------------------+------------------+
|thumbs up red hea...|     San Diego, CA|
|user name thank y...|          Waco, TX|
|am fucking crying...|         Savoy, IL|
|user name get rid...|  Gaithersburg, MD|
|my dog made me so...|       Baytown, TX|
| good kill user name|   Los Angeles, CA|
|user name beautif...|    Valparaiso, IN|
|wanna play basket...|       Houston, TX|
|my comfort zone s...|         Hobbs, NM|
|people in flint m...|Pembroke Pines, FL|
|looking through o...|    San Marcos, TX|
|user name does ha...|       Kenosha, WI|
|user name user na...|     Livermore, CA|
|ryan was wonderin...|       Benicia, CA|
|you truly are pie...|         Boise, ID|
|so if anyone want...|    Marysville, WA|
|why am dying to l...|       Hampton, GA|
|        miss him url|      Pasadena, TX|
|and now truly fee...|   Hattiesburg, MS|
|you inspire me ho...| Jeffersontown, KY|
+--------------------+------------

In [14]:
# Cache the cleaned tweets (they feed every model below) and count the rows
# to force evaluation.
df_preprocessed.cache().count()

96958

In [13]:
def count(text):
    """Return the constant 1, whatever ``text`` is.

    Applied per row so that summing the resulting column yields a row count.
    """
    one_per_row = 1
    return one_per_row

# Declare the integer return type: udf() defaults to StringType, which stored
# the constant as a string and made sum(count) come out as a double (398.0).
udf_count = udf(count, IntegerType())

# Ranking Result of the NLTK (VADER) Analyzer

In [70]:
def nltk_prediction(text, threshold=0.0):
    """Label ``text`` as positive (1) or negative (0) from its compound score.

    Args:
        text: Tweet text; it is passed through ``str()`` before scoring.
        threshold: Smallest compound score that counts as positive. With the
            default 0.0 a text scoring exactly 0 is labelled positive; pass a
            higher value to require a positive score.

    Returns:
        1 if the compound score is >= ``threshold``, otherwise 0.
    """
    compound = vader_analyzer.polarity_scores(str(text))['compound']
    return 1 if compound >= threshold else 0

# nltk_prediction returns the ints 1/0; declare that instead of relying on
# udf()'s default StringType, which would store the labels as strings.
udf_nltk_prediction = udf(nltk_prediction, IntegerType())

In [69]:
def nltk_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``.

    ``text`` is converted with ``str()`` before it is scored.
    """
    return vader_analyzer.polarity_scores(str(text))['compound']

# The compound score is a float; declare DoubleType so the column is numeric
# rather than udf()'s default StringType.
udf_nltk_score = udf(nltk_score, DoubleType())

In [71]:
# Add the 1/0 label, the compound score and a per-row marker in one chain.
df_scored = (
    df_preprocessed
    .withColumn("prediction", udf_nltk_prediction(col("text")))
    .withColumn("score", udf_nltk_score(col("text")))
    .withColumn("count", udf_count(col("text")))
)

# Cache the scored frame and count it to force evaluation.
df_scored.cache()
df_scored.count()

96958

In [72]:
# Ranking by averaging 1/0 values (locations with more than 200 tweets only)
(
    df_scored
    .groupBy('location')
    .agg({'prediction': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(prediction)', ascending=False)
    .show(n=1000)
)

+-----------------+------------------+----------+
|         location|   avg(prediction)|sum(count)|
+-----------------+------------------+----------+
|      Garrett, IN|0.9623115577889447|     398.0|
|        Indio, CA|0.8706624605678234|     317.0|
|       Dayton, OH|0.8483412322274881|     211.0|
|   Enterprise, NV|0.8202614379084967|     306.0|
|    Las Vegas, NV|0.8172169811320755|     848.0|
|    Manhattan, NY| 0.813033359193173|    1289.0|
|       Boston, MA|0.8088888888888889|     225.0|
|       Denver, CO|0.8074712643678161|     348.0|
|       Queens, NY|0.8034782608695652|     575.0|
|     Paradise, NV|0.8013513513513514|     740.0|
|   Long Beach, CA|0.8012048192771084|     498.0|
|    Henderson, NV|0.7908496732026143|     306.0|
|San Francisco, CA| 0.790356394129979|     954.0|
|     Columbus, OH|0.7901234567901234|     405.0|
|    San Diego, CA|0.7898866608544028|    1147.0|
| Prairie View, TX|0.7897727272727273|     352.0|
|       Tucson, AZ|0.7890995260663507|     422.0|


In [73]:
# Ranking by averaging compound scores (locations with more than 200 tweets only)
(
    df_scored
    .groupBy('location')
    .agg({'score': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(score)', ascending=False)
    .show(n=1000)
)

+-----------------+--------------------+----------+
|         location|          avg(score)|sum(count)|
+-----------------+--------------------+----------+
|       Dayton, OH|  0.1744227488151659|     211.0|
|         Mesa, AZ| 0.16151333333333337|     225.0|
|      Anaheim, CA| 0.15517384196185285|     367.0|
|   Long Beach, CA| 0.14699759036144583|     498.0|
|       Denton, TX| 0.14574785478547855|     303.0|
|     Columbus, OH| 0.14424370370370374|     405.0|
|    Manhattan, NY| 0.13960256012412714|    1289.0|
|     Portland, OR| 0.13648405511811024|     508.0|
|       Tucson, AZ|  0.1359649289099526|     422.0|
|       Boston, MA|  0.1310991111111111|     225.0|
|      Seattle, WA| 0.12837383627608348|     623.0|
|    Henderson, NV| 0.12800718954248366|     306.0|
|Sunrise Manor, NV| 0.12630948616600793|     253.0|
|     Paradise, NV| 0.12502067567567568|     740.0|
|    San Diego, CA| 0.12257053182214471|    1147.0|
|     Honolulu, HI| 0.12249582542694501|     527.0|
|      El Pa

# Ranking Result of Logistic Regression

In [45]:
def extract_probability(input):
    """Return the element at index 1 of a vector as a plain Python value.

    ``input`` must provide ``toArray()``; the array is converted to a list
    before indexing, so the result is a built-in number, not an array scalar.
    """
    as_array = input.toArray()
    as_list = as_array.tolist()
    return as_list[1]

# extract_probability returns a float; declare DoubleType so "score" is a
# numeric column rather than udf()'s default StringType.
udf_extract_probability = udf(extract_probability, DoubleType())

In [15]:
# Load the fitted pipeline saved under "ngram_cv_idf_lr" (path is relative to the working directory).
trigramwocs_pipelineFit_loaded = PipelineModel.load("ngram_cv_idf_lr")

In [16]:
%%time
df_ngram_cv_idf_lr = trigramwocs_pipelineFit_loaded.transform(df_preprocessed)

df_ngram_cv_idf_lr = df_ngram_cv_idf_lr.withColumn("count", udf_count(col("text")))
df_ngram_cv_idf_lr = df_ngram_cv_idf_lr.withColumn("score", udf_extract_probability(df_ngram_cv_idf_lr.probability))

df_ngram_cv_idf_lr.cache()
df_ngram_cv_idf_lr.count()

CPU times: user 40 ms, sys: 20 ms, total: 60 ms
Wall time: 8.85 s


In [42]:
# Ranking by averaging 1/0 values (locations with more than 200 tweets only)
(
    df_ngram_cv_idf_lr
    .groupBy('location')
    .agg({'prediction': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(prediction)', ascending=False)
    .show(n=1000)
)

+-----------------+------------------+----------+
|         location|   avg(prediction)|sum(count)|
+-----------------+------------------+----------+
|      Garrett, IN| 0.949748743718593|     398.0|
|        Indio, CA|0.7917981072555205|     317.0|
|    Manhattan, NY|0.7812257564003103|    1289.0|
|    Las Vegas, NV|0.7783018867924528|     848.0|
|     Paradise, NV|0.7756756756756756|     740.0|
|   Enterprise, NV|0.7745098039215687|     306.0|
|     Columbus, OH|0.7407407407407407|     405.0|
|      Seattle, WA|0.7319422150882825|     623.0|
|    San Diego, CA|0.7271142109851787|    1147.0|
|       Boston, MA|0.7244444444444444|     225.0|
|     Portland, OR|0.7244094488188977|     508.0|
|        Bronx, NY|0.7225274725274725|     364.0|
|     Brooklyn, NY|0.7182890855457227|     678.0|
|       Dayton, OH|0.7156398104265402|     211.0|
|       Denver, CO|0.7155172413793104|     348.0|
|San Francisco, CA|0.7148846960167715|     954.0|
|      Chicago, IL|0.7115009746588694|    1539.0|


In [43]:
# Ranking by averaging probabilities (locations with more than 200 tweets only)
(
    df_ngram_cv_idf_lr
    .groupBy('location')
    .agg({'score': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(score)', ascending=False)
    .show(n=1000)
)

+-----------------+------------------+----------+
|         location|        avg(score)|sum(count)|
+-----------------+------------------+----------+
|        Indio, CA|0.7245208035809556|     317.0|
|    Manhattan, NY|0.6623431055795451|    1289.0|
|     Paradise, NV|0.6475812177109463|     740.0|
|   Enterprise, NV| 0.639097519482643|     306.0|
|      Seattle, WA|0.6381876633733661|     623.0|
|    Las Vegas, NV|0.6337571101436307|     848.0|
|       Boston, MA|0.6307830193089408|     225.0|
|     Portland, OR|0.6267392730646126|     508.0|
|San Francisco, CA|0.6263860393874178|     954.0|
|         Mesa, AZ|0.6127656225186913|     225.0|
|    San Diego, CA|0.6127165498721924|    1147.0|
|     Brooklyn, NY|0.6116889320228371|     678.0|
|     Columbus, OH|0.6088577375317159|     405.0|
|      Chicago, IL|0.6085637549272475|    1539.0|
|       Dayton, OH| 0.606674197831893|     211.0|
|  Los Angeles, CA|0.6064917376102913|    5326.0|
|       Tucson, AZ|0.6052426105194849|     422.0|


# Ranking Result of Naive Bayes

In [48]:
# Load the fitted pipeline saved under "ngram_cv_idf_nb" (path is relative to the working directory).
nb_pipelineFit_loaded = PipelineModel.load("ngram_cv_idf_nb")

In [49]:
%%time
df_ngram_cv_idf_nb = nb_pipelineFit_loaded.transform(df_preprocessed)

df_ngram_cv_idf_nb = df_ngram_cv_idf_nb.withColumn("count", udf_count(col("text")))
df_ngram_cv_idf_nb = df_ngram_cv_idf_nb.withColumn("score", udf_extract_probability(df_ngram_cv_idf_nb.probability))

df_ngram_cv_idf_nb.cache()
df_ngram_cv_idf_nb.count()

CPU times: user 40 ms, sys: 40 ms, total: 80 ms
Wall time: 8.67 s


In [50]:
# Ranking by averaging 1/0 values (locations with more than 200 tweets only)
(
    df_ngram_cv_idf_nb
    .groupBy('location')
    .agg({'prediction': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(prediction)', ascending=False)
    .show(n=1000)
)

+-----------------+-------------------+----------+
|         location|    avg(prediction)|sum(count)|
+-----------------+-------------------+----------+
|        Indio, CA|  0.804416403785489|     317.0|
|     Paradise, NV|  0.722972972972973|     740.0|
|   Enterprise, NV| 0.7124183006535948|     306.0|
|    Las Vegas, NV| 0.7099056603773585|     848.0|
|     Portland, OR| 0.7047244094488189|     508.0|
|    Manhattan, NY| 0.7020946470131885|    1289.0|
|       Dayton, OH| 0.7014218009478673|     211.0|
|      Seattle, WA| 0.6966292134831461|     623.0|
|       Boston, MA| 0.6888888888888889|     225.0|
|San Francisco, CA| 0.6834381551362684|     954.0|
|    San Diego, CA| 0.6809067131647777|    1147.0|
|       Tucson, AZ| 0.6729857819905213|     422.0|
|     Honolulu, HI| 0.6679316888045541|     527.0|
|     Columbus, OH| 0.6666666666666666|     405.0|
|  Los Angeles, CA| 0.6629740893728877|    5326.0|
|      Anaheim, CA| 0.6566757493188011|     367.0|
|    Henderson, NV| 0.653594771

In [51]:
# Ranking by averaging probabilities (locations with more than 200 tweets only)
(
    df_ngram_cv_idf_nb
    .groupBy('location')
    .agg({'score': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(score)', ascending=False)
    .show(n=1000)
)

+-----------------+-------------------+----------+
|         location|         avg(score)|sum(count)|
+-----------------+-------------------+----------+
|        Indio, CA| 0.8052503685839236|     317.0|
|   Enterprise, NV| 0.7207666821255344|     306.0|
|    Las Vegas, NV| 0.7179040426814047|     848.0|
|     Paradise, NV|  0.715987303945537|     740.0|
|    Manhattan, NY| 0.7007458034770002|    1289.0|
|     Portland, OR| 0.7002250231266418|     508.0|
|      Seattle, WA| 0.6956269658918839|     623.0|
|       Boston, MA| 0.6817111825127657|     225.0|
|San Francisco, CA| 0.6810358238950329|     954.0|
|    San Diego, CA| 0.6776217011588281|    1147.0|
|       Dayton, OH| 0.6774919008114745|     211.0|
|       Tucson, AZ| 0.6705379933337214|     422.0|
|     Honolulu, HI| 0.6620596550954828|     527.0|
|  Los Angeles, CA| 0.6614588392771339|    5326.0|
|    Henderson, NV| 0.6585136407114579|     306.0|
|     Columbus, OH| 0.6584407864261872|     405.0|
|      Chicago, IL| 0.654097116

# Ranking Result of Gradient-Boosted Trees

In [52]:
# Load the fitted pipeline saved under "ngram_cv_idf_gbt" (path is relative to the working directory).
gbt_pipelineFit_loaded = PipelineModel.load("ngram_cv_idf_gbt")

In [53]:
%%time
df_ngram_cv_idf_gbt = gbt_pipelineFit_loaded.transform(df_preprocessed)

df_ngram_cv_idf_gbt = df_ngram_cv_idf_gbt.withColumn("count", udf_count(col("text")))
df_ngram_cv_idf_gbt = df_ngram_cv_idf_gbt.withColumn("score", udf_extract_probability(df_ngram_cv_idf_gbt.probability))

df_ngram_cv_idf_gbt.cache()
df_ngram_cv_idf_gbt.count()

CPU times: user 60 ms, sys: 40 ms, total: 100 ms
Wall time: 8.56 s


In [54]:
# Ranking by averaging 1/0 values (locations with more than 200 tweets only)
(
    df_ngram_cv_idf_gbt
    .groupBy('location')
    .agg({'prediction': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(prediction)', ascending=False)
    .show(n=1000)
)

+-----------------+--------------------+----------+
|         location|     avg(prediction)|sum(count)|
+-----------------+--------------------+----------+
|        Indio, CA|  0.6182965299684543|     317.0|
|    Manhattan, NY|  0.5360744763382467|    1289.0|
|       Denton, TX| 0.49504950495049505|     303.0|
|      Atlanta, GA| 0.48468708388814913|     751.0|
|     Paradise, NV|  0.4810810810810811|     740.0|
|     Richmond, VA| 0.47950819672131145|     244.0|
|San Francisco, CA| 0.46645702306079667|     954.0|
|        Tampa, FL| 0.46568627450980393|     204.0|
|   Huntsville, AL| 0.44841269841269843|     252.0|
|     Columbus, OH|  0.4469135802469136|     405.0|
|       Durham, NC|  0.4468864468864469|     273.0|
|      Anaheim, CA| 0.44141689373297005|     367.0|
|  Tallahassee, FL| 0.43731778425655976|     343.0|
|   San Marcos, TX| 0.43567251461988304|     342.0|
|     Honolulu, HI|  0.4269449715370019|     527.0|
|      Memphis, TN| 0.42679127725856697|     321.0|
|       Bost

In [55]:
# Ranking by averaging probabilities (locations with more than 200 tweets only)
(
    df_ngram_cv_idf_gbt
    .groupBy('location')
    .agg({'score': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(score)', ascending=False)
    .show(n=1000)
)

+-----------------+-------------------+----------+
|         location|         avg(score)|sum(count)|
+-----------------+-------------------+----------+
|        Indio, CA|  0.501394052800608|     317.0|
|      Atlanta, GA|0.49581520683361263|     751.0|
|    Manhattan, NY|0.49556882373327554|    1289.0|
|     Paradise, NV|0.49479909446520365|     740.0|
|         Mesa, AZ| 0.4945197708520246|     225.0|
|   San Marcos, TX|  0.493439970817821|     342.0|
|       Denton, TX|0.49339823052578713|     303.0|
|   Enterprise, NV|0.49027325356024093|     306.0|
|     Richmond, VA|0.48984174715573375|     244.0|
|     Honolulu, HI| 0.4888684906695527|     527.0|
|      Memphis, TN| 0.4882033107703744|     321.0|
|San Francisco, CA|  0.487459037717402|     954.0|
|     Columbus, OH| 0.4869454189847211|     405.0|
|      Norfolk, VA|0.48669111139569005|     222.0|
|   Fort Worth, TX| 0.4857237192245204|     460.0|
|  Tallahassee, FL|0.48486552070430705|     343.0|
|       Dayton, OH| 0.484507005

# Ranking Result of Linear SVM

In [56]:
# Load the fitted pipeline saved under "ngram_cv_idf_lsvc" (path is relative to the working directory).
lsvc_pipelineFit_loaded = PipelineModel.load("ngram_cv_idf_lsvc")

In [65]:
%%time
df_ngram_cv_idf_lsvc = lsvc_pipelineFit_loaded.transform(df_preprocessed)

df_ngram_cv_idf_lsvc = df_ngram_cv_idf_lsvc.withColumn("count", udf_count(col("text")))
df_ngram_cv_idf_lsvc = df_ngram_cv_idf_lsvc.withColumn("score", udf_extract_probability(df_ngram_cv_idf_lsvc.rawPrediction))

df_ngram_cv_idf_lsvc.cache()
df_ngram_cv_idf_lsvc.count()

CPU times: user 60 ms, sys: 30 ms, total: 90 ms
Wall time: 8.28 s


In [67]:
# Ranking by averaging 1/0 values (locations with more than 200 tweets only)
(
    df_ngram_cv_idf_lsvc
    .groupBy('location')
    .agg({'prediction': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(prediction)', ascending=False)
    .show(n=1000)
)

+-----------------+------------------+----------+
|         location|   avg(prediction)|sum(count)|
+-----------------+------------------+----------+
|      Garrett, IN|0.9472361809045227|     398.0|
|        Indio, CA|0.7917981072555205|     317.0|
|    Manhattan, NY|0.7820015515903801|    1289.0|
|   Enterprise, NV|0.7777777777777778|     306.0|
|    Las Vegas, NV|0.7759433962264151|     848.0|
|     Paradise, NV|0.7743243243243243|     740.0|
|    San Diego, CA|0.7306015693112468|    1147.0|
|       Boston, MA|0.7244444444444444|     225.0|
|       Queens, NY|0.7234782608695652|     575.0|
|        Bronx, NY|0.7225274725274725|     364.0|
|     Brooklyn, NY|0.7212389380530974|     678.0|
|      Seattle, WA|0.7191011235955056|     623.0|
|       Dayton, OH|0.7156398104265402|     211.0|
|       Denver, CO|0.7155172413793104|     348.0|
|San Francisco, CA|0.7138364779874213|     954.0|
|     Columbus, OH|0.7135802469135802|     405.0|
|     Portland, OR|0.7125984251968503|     508.0|


In [68]:
# Ranking by averaging raw predictions (locations with more than 200 tweets only)
(
    df_ngram_cv_idf_lsvc
    .groupBy('location')
    .agg({'score': 'avg', 'count': 'sum'})
    .filter(col('sum(count)') > 200)
    .sort('avg(score)', ascending=False)
    .show(n=1000)
)

+-----------------+--------------------+----------+
|         location|          avg(score)|sum(count)|
+-----------------+--------------------+----------+
|        Indio, CA|  0.9175605454247628|     317.0|
|    Manhattan, NY|  0.6269278426365356|    1289.0|
|     Paradise, NV| 0.48815083843738233|     740.0|
|       Boston, MA|  0.4676841245435888|     225.0|
|      Seattle, WA|  0.4629923189413974|     623.0|
|San Francisco, CA|   0.440644045010809|     954.0|
|     Portland, OR|   0.434840949197935|     508.0|
|    Las Vegas, NV|  0.4217519150538475|     848.0|
|   Enterprise, NV|  0.4206029384573034|     306.0|
|       Queens, NY|  0.4051090978584367|     575.0|
|     Brooklyn, NY| 0.37099099523257784|     678.0|
|    San Diego, CA|  0.3652296060339492|    1147.0|
|  Los Angeles, CA|  0.3314198896847423|    5326.0|
|      Chicago, IL| 0.32883183817988465|    1539.0|
|         Mesa, AZ|  0.3196190367668085|     225.0|
|      Memphis, TN|  0.3165832552322442|     321.0|
|       Tucs