In [58]:
import pyspark as ps
import json
import ast
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType, FloatType
from src import DataCleaning
from src import FeatureEngineer
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
spark = (ps.sql.SparkSession.builder
        .appName("sandbox")
        .getOrCreate()
        )

sc = spark.sparkContext

In [7]:
rdd = sc.textFile('data/raw_data.csv')

In [8]:
rdd_formatted = DataCleaning.clean_data(rdd).filter(lambda x: x[-1] != '')

In [13]:
def mkdict(L):
    d = {}
    for n in L:
        d[n] = 1
    return d

In [18]:
def combineDict(d1, d2):
    d = d1.copy()
    for key in d2.keys():
        if key in d1:
            d[key] += 1
        else:
            d[key] = 1
    return d

In [47]:
developers = rdd_formatted.map(lambda x: x[3])
publishers = rdd_formatted.map(lambda x: x[4])
devCounts = developers.map(mkdict).reduce(combineDict)
pubCounts = publishers.map(mkdict).reduce(combineDict)

In [56]:
rdd_formatted.map(lambda x: len(x[3])).count()

28101

In [80]:
publishers.take(1)

[["Ben 'Yahtzee' Croshaw"]]

In [28]:
print(len(pubCounts.keys()))

15508


In [31]:
developers.map(lambda x: len(x)).max()

16

In [35]:
publishers.map(lambda x: len(x)).max()

5

In [84]:
def getGenres(L):
    result = []
    for d in L:
        if 'description' in d:
            result.append(d['description'])
    return result

In [85]:
genres = rdd_formatted.map(lambda x: getGenres(x[6])).map(mkdict).reduce(combineDict)

In [87]:
genres

{'Adventure': 5587,
 'Action': 6436,
 'Indie': 10778,
 'Simulation': 2609,
 'Racing': 540,
 'Sports': 714,
 'RPG': 2282,
 'Casual': 5552,
 'Strategy': 2806,
 'Early Access': 1318,
 'Design & Illustration': 102,
 'Animation & Modeling': 80,
 'Game Development': 16,
 'Education': 83,
 'Massively Multiplayer': 150,
 'Photo Editing': 27,
 'Violent': 275,
 'Gore': 161,
 'Audio Production': 40,
 'Nudity': 74,
 'Utilities': 149,
 'Software Training': 53,
 'Sexual Content': 68,
 'Web Publishing': 14,
 'Free to Play': 22,
 'Video Production': 50,
 'Accounting': 7}

In [50]:
numSingles = 0
for dev in devCounts.keys():
    if devCounts[dev] == 1:
        numSingles += 1
print(numSingles)

15497


In [54]:
numSingles = 0
for pub in pubCounts.keys():
    if pubCounts[pub] == 1:
        numSingles += 1
print(numSingles)

12703


In [67]:
non1DevCounts = list(filter((lambda x: x != 1), list(devCounts.values())))

In [72]:
result = [0]*100
for val in devCounts.values():
    result[val] += 1
    
print(result)

[0, 15497, 2080, 659, 278, 136, 82, 46, 34, 20, 9, 12, 12, 6, 6, 2, 2, 4, 2, 0, 4, 0, 1, 2, 3, 2, 0, 0, 0, 2, 0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [74]:
result = [0]*200
for val in pubCounts.values():
    result[val] += 1
    
print(result)

[0, 12703, 1649, 501, 233, 89, 72, 43, 27, 24, 19, 20, 16, 8, 9, 11, 9, 4, 9, 4, 1, 5, 3, 6, 2, 0, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 0, 0, 2, 1, 1, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [78]:
'AIVIK LLC' in pubCounts

False

In [79]:
publishers.map(lambda x: 1 if 'AIVIK LLC' in x else 0).sum()

0