In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BDAS').getOrCreate()

In [2]:
Total_Clean_data = spark.read.csv('Total_Clean_data.csv', inferSchema=True, header=True)
Total_Clean_data.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- NA_Sales: double (nullable = true)
 |-- EU_Sales: double (nullable = true)
 |-- JP_Sales: double (nullable = true)
 |-- Other_Sales: double (nullable = true)
 |-- Global_Sales: double (nullable = true)
 |-- Bin_Global_Sales: double (nullable = true)



In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder.config(conf = SparkConf()).getOrCreate()

from pyspark.ml.linalg import Vector,Vectors
from pyspark.sql.types import DoubleType, StructType, StructField
from pyspark.sql import Row,functions
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel,\
BinaryLogisticRegressionSummary,LogisticRegression


In [4]:
Total_Clean_data = Total_Clean_data.toPandas()

In [5]:
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Action",1)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Adventure",2)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Fighting",3)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Misc",4)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Platform",5)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Puzzle",6)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Racing",7)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Role-Playing",8)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Shooter",9)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Simulation",10)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Sports",11)
Total_Clean_data["Genre"] = Total_Clean_data["Genre"].replace("Strategy",12)

In [6]:
Total_Clean_data = spark.createDataFrame(Total_Clean_data)

In [18]:
Total_Clean_data.show()

+----+--------+----+-----+--------------------+--------------------+--------+--------+--------+-----------+------------+----------------+
|Rank|Platform|Year|Genre|           Publisher|                Name|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|Bin_Global_Sales|
+----+--------+----+-----+--------------------+--------------------+--------+--------+--------+-----------+------------+----------------+
|   1|     Wii|2006|   11|            Nintendo|          Wii Sports|   41.49|   29.02|    3.77|       8.46|       82.74|            16.0|
|   2|     NES|1985|    5|            Nintendo|   Super Mario Bros.|   29.08|    3.58|    6.81|       0.77|       40.24|             8.0|
|   3|     Wii|2008|    7|            Nintendo|      Mario Kart Wii|   15.85|   12.88|    3.79|       3.31|       35.82|             7.0|
|   4|     Wii|2009|   11|            Nintendo|   Wii Sports Resort|   15.75|   11.01|    3.28|       2.96|        33.0|             6.0|
|   5|      GB|1996|    8|        

In [7]:
df_assembler = VectorAssembler(inputCols=['NA_Sales','EU_Sales',"JP_Sales","Other_Sales"], outputCol='features')
data = df_assembler.transform(Total_Clean_data).select('features','Genre')
data.show(5,truncate=False)

+-----------------------+-----+
|features               |Genre|
+-----------------------+-----+
|[41.49,29.02,3.77,8.46]|11   |
|[29.08,3.58,6.81,0.77] |5    |
|[15.85,12.88,3.79,3.31]|7    |
|[15.75,11.01,3.28,2.96]|11   |
|[11.27,8.89,10.22,1.0] |8    |
+-----------------------+-----+
only showing top 5 rows



In [8]:
labelIndexer = StringIndexer().setInputCol("Genre"). \
    setOutputCol("indexedLabel").fit(data)
data = labelIndexer.transform(data)
data.show(5)

+--------------------+-----+------------+
|            features|Genre|indexedLabel|
+--------------------+-----+------------+
|[41.49,29.02,3.77...|   11|         1.0|
|[29.08,3.58,6.81,...|    5|         7.0|
|[15.85,12.88,3.79...|    7|         6.0|
|[15.75,11.01,3.28...|   11|         1.0|
|[11.27,8.89,10.22...|    8|         3.0|
+--------------------+-----+------------+
only showing top 5 rows



In [9]:
featureIndexer = VectorIndexer(maxCategories=5).setInputCol("features"). \
    setOutputCol("indexedFeatures").fit(data)
data = featureIndexer.transform(data)
data.show(5)

+--------------------+-----+------------+--------------------+
|            features|Genre|indexedLabel|     indexedFeatures|
+--------------------+-----+------------+--------------------+
|[41.49,29.02,3.77...|   11|         1.0|[41.49,29.02,3.77...|
|[29.08,3.58,6.81,...|    5|         7.0|[29.08,3.58,6.81,...|
|[15.85,12.88,3.79...|    7|         6.0|[15.85,12.88,3.79...|
|[15.75,11.01,3.28...|   11|         1.0|[15.75,11.01,3.28...|
|[11.27,8.89,10.22...|    8|         3.0|[11.27,8.89,10.22...|
+--------------------+-----+------------+--------------------+
only showing top 5 rows



In [10]:
trainData, testData = data.randomSplit([0.7, 0.3])

In [11]:
lr = LogisticRegression(labelCol='indexedLabel',featuresCol='indexedFeatures',\
                        maxIter=100, regParam=0.3, elasticNetParam=0.8).fit(trainData)
testData = lr.transform(testData)
testData .show(5)

+--------------+-----+------------+---------------+--------------------+--------------------+----------+
|      features|Genre|indexedLabel|indexedFeatures|       rawPrediction|         probability|prediction|
+--------------+-----+------------+---------------+--------------------+--------------------+----------+
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00984754724407...|[0.20196073226354...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00984754724407...|[0.20196073226354...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00984754724407...|[0.20196073226354...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00984754724407...|[0.20196073226354...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00984754724407...|[0.20196073226354...|       0.0|
+--------------+-----+------------+---------------+--------------------+--------------------+----------+
only showing top 5 rows



In [12]:
testData.select("probability","Genre").show(20,False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|probability                                                                                                                                                                                                                                     |Genre|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|[0.20196073226354186,0.14001950207237016,0.10236854910375326,0.09152433843804084,0.07920533801837759,0.07720999474007917,0.0728722769161091,0.05430677131135929,0.05231143695111195,0.05283195754024099,0.03921243668024399,0.03617666596477165]|1    |
|[0.

In [27]:
##############################``````````````````````````###############################

In [8]:
Total_Clean_data1 = Total_Clean_data.toPandas()

In [9]:
GenreData = Total_Clean_data1.groupby('Genre')\
                                    .sum()\
                                    .loc[:,["Global_Sales","JP_Sales","EU_Sales","Other_Sales","NA_Sales"]]\
                                    .sort_values(by='Global_Sales',ascending=False)

In [29]:
GenreData.head(20)

Unnamed: 0_level_0,Global_Sales,JP_Sales,EU_Sales,Other_Sales,NA_Sales
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1722.83,158.64,516.48,184.92,861.77
11,1309.24,134.76,371.34,132.65,670.09
9,1026.2,38.18,310.45,101.9,575.16
8,923.79,350.25,187.57,59.38,326.5
5,829.13,130.65,200.65,51.51,445.99
4,789.87,106.67,211.77,73.92,396.92
7,726.76,56.61,236.31,76.68,356.93
3,444.05,87.15,100.0,36.19,220.74
10,389.69,63.54,113.02,31.34,181.51
6,242.21,56.68,50.52,12.47,122.01


In [10]:
Genre_assembler = VectorAssembler(inputCols=['NA_Sales','EU_Sales',"JP_Sales","Other_Sales"], outputCol='features')
G_data = Genre_assembler.transform(Total_Clean_data).select('features','Genre')
G_data.show(5,truncate=False)

+-----------------------+-----+
|features               |Genre|
+-----------------------+-----+
|[41.49,29.02,3.77,8.46]|11   |
|[29.08,3.58,6.81,0.77] |5    |
|[15.85,12.88,3.79,3.31]|7    |
|[15.75,11.01,3.28,2.96]|11   |
|[11.27,8.89,10.22,1.0] |8    |
+-----------------------+-----+
only showing top 5 rows



In [11]:
labelIndexer = StringIndexer().setInputCol("Genre"). \
    setOutputCol("indexedLabel").fit(G_data)
G_data = labelIndexer.transform(G_data)
G_data.show(5)

+--------------------+-----+------------+
|            features|Genre|indexedLabel|
+--------------------+-----+------------+
|[41.49,29.02,3.77...|   11|         1.0|
|[29.08,3.58,6.81,...|    5|         7.0|
|[15.85,12.88,3.79...|    7|         6.0|
|[15.75,11.01,3.28...|   11|         1.0|
|[11.27,8.89,10.22...|    8|         3.0|
+--------------------+-----+------------+
only showing top 5 rows



In [12]:
featureIndexer = VectorIndexer(maxCategories=5).setInputCol("features"). \
    setOutputCol("indexedFeatures").fit(G_data)
G_data = featureIndexer.transform(G_data)
G_data.show(5)

+--------------------+-----+------------+--------------------+
|            features|Genre|indexedLabel|     indexedFeatures|
+--------------------+-----+------------+--------------------+
|[41.49,29.02,3.77...|   11|         1.0|[41.49,29.02,3.77...|
|[29.08,3.58,6.81,...|    5|         7.0|[29.08,3.58,6.81,...|
|[15.85,12.88,3.79...|    7|         6.0|[15.85,12.88,3.79...|
|[15.75,11.01,3.28...|   11|         1.0|[15.75,11.01,3.28...|
|[11.27,8.89,10.22...|    8|         3.0|[11.27,8.89,10.22...|
+--------------------+-----+------------+--------------------+
only showing top 5 rows



In [17]:
trainG_Data, testG_Data = G_data.randomSplit([0.7, 0.3])

In [18]:
lr_G = LogisticRegression(labelCol='indexedLabel',featuresCol='indexedFeatures',\
                        maxIter=100, regParam=0.3, elasticNetParam=0.8).fit(trainG_Data)
testG_Data = lr_G.transform(testG_Data)
testG_Data .show(50)

+--------------+-----+------------+---------------+--------------------+--------------------+----------+
|      features|Genre|indexedLabel|indexedFeatures|       rawPrediction|         probability|prediction|
+--------------+-----+------------+---------------+--------------------+--------------------+----------+
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00105232043237...|[0.20038871464435...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00105232043237...|[0.20038871464435...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00105232043237...|[0.20038871464435...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00105232043237...|[0.20038871464435...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00105232043237...|[0.20038871464435...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.00105232043237...|[0.20038871464435...|       0.0|
|(4,[0],[0.01])|    1|         0.0| (4,[0],[0.01])|[1.0

In [15]:
testG_Data.select("Genre","probability").show(20,False)

+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Genre|probability                                                                                                                                                                                                                                       |
+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1    |[0.20056229512231827,0.13883044856844232,0.1051106595422623,0.09035840402228444,0.0782402331236304,0.07797679488705549,0.07534240972667869,0.05356472054100409,0.052071911390107736,0.050491429976603226,0.042149479485525644,0.0353012136140872