In [1]:
!pip install pyspark



In [42]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd

In [52]:
spark = SparkSession.builder.appName(name="MSA Phase 1 Assignment Analytics").master("local[*]").getOrCreate().newSession()
spark

In [5]:
#at this point the raw data is uploaded
#the data used is the anime recommendations dataset found here https://www.kaggle.com/CooperUnion/anime-recommendations-database
!ls

anime.csv  bigdata_analytics.ipynb  README.md


In [102]:
# Read in the anime csv and store it into a dataframe.
anime = pd.read_csv("anime.csv")

In [103]:
anime.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1,72534
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11,81109


In [104]:
anime.count()

anime_id    12294
name        12294
genre       12232
type        12269
episodes    12294
rating      12064
members     12294
dtype: int64

In [105]:
anime = anime.dropna()
anime.count()

anime_id    12017
name        12017
genre       12017
type        12017
episodes    12017
rating      12017
members     12017
dtype: int64

In [106]:
anime = anime.drop(['episodes'], axis = 1)

In [107]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.16,151266


In [108]:
import numpy as np
from itertools import chain

In [109]:
# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split(',')))

# calculate lengths of splits
lens = anime['genre'].str.split(',').map(len)

In [110]:
# create new dataframe, repeating or chaining as appropriate
anime_df = pd.DataFrame({'anime_id': np.repeat(anime['anime_id'], lens),
                    'name': np.repeat(anime['name'], lens),
                    'genre': chainer(anime['genre']),
                    'type': np.repeat(anime['type'], lens),
                    'rating': np.repeat(anime['rating'], lens),
                    'members': np.repeat(anime['members'], lens)})

In [111]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,rating,members
0,32281,Kimi no Na wa.,Drama,Movie,9.37,200630
0,32281,Kimi no Na wa.,Romance,Movie,9.37,200630
0,32281,Kimi no Na wa.,School,Movie,9.37,200630
0,32281,Kimi no Na wa.,Supernatural,Movie,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,Action,TV,9.26,793665


In [112]:
anime_genres = anime_df.drop(['name','type','rating','members'], axis = 1)
anime_type = anime_df.drop(['name','genre','rating','members'], axis = 1)

In [113]:
anime_genres.head()

Unnamed: 0,anime_id,genre
0,32281,Drama
0,32281,Romance
0,32281,School
0,32281,Supernatural
1,5114,Action


In [114]:
anime_type.head()

Unnamed: 0,anime_id,type
0,32281,Movie
0,32281,Movie
0,32281,Movie
0,32281,Movie
1,5114,TV


In [116]:
anime.to_csv('clean_anime.csv')
anime_genres.to_csv('anime_genres.csv')
anime_type.to_csv('anime_type.csv')