In [1]:
from pyspark.sql import SparkSession, Row
from pyspark import SparkConf

# Spark configuration: application name "TMDB", master "local[4]".
conf = SparkConf()
conf.setAppName("TMDB")
conf.setMaster("local[4]")

# Build the session from that configuration (an existing session is reused).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [2]:
from pyspark.sql import Row
from pyspark.sql.types import StringType, StructField, StructType
import csv
import json

In [3]:
# Column names in file order; every column is declared as a nullable string.
schemaString = "budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count"
column_names = schemaString.split(',')
fields = [StructField(name, StringType(), True) for name in column_names]
schema = StructType(fields)

sc = spark.sparkContext

# Each text line is parsed on its own: csv.reader is given a one-line input and
# next() pulls the single parsed record (a list of column values) out of it.
# NOTE(review): this assumes no record spans several lines and that the file
# has no header row - confirm against the data file.
raw_lines = sc.textFile("tmdb_5000_movies.csv")
movies_RDD = raw_lines.map(lambda line: Row(*next(csv.reader([line]))))
movies_DF = spark.createDataFrame(movies_RDD, schema=schema)

In [4]:
def save(path, data):
    """Write the string ``data`` to ``path`` as UTF-8 text, replacing any existing file.

    Missing parent directories (for example ``TMDB/``) are created first, so the
    call does not fail when the output folder has not been created yet.

    Args:
        path: Destination file path.
        data: Text to write (the notebook passes ``json.dumps`` output).
    """
    import os  # local import keeps this cell self-contained

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(data)

## 体裁分析

In [5]:
# Print the DataFrame schema to check the column names and types.
movies_DF.printSchema()

root
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)



In [6]:
# Peek at the raw 'genres' values of the first three rows.
movies_DF.select('genres').rdd.take(3)

[Row(genres='[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'),
 Row(genres='[{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}]'),
 Row(genres='[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 80, "name": "Crime"}]')]

In [7]:
def countByJson(field):
    """Count occurrences of each ``name`` found in a JSON-array column of ``movies_DF``.

    Rows whose ``field`` value is the empty string are skipped. Every remaining
    value is parsed with ``json.loads`` and each element's ``'name'`` entry
    contributes one occurrence.

    Args:
        field: Name of the column holding the JSON text.

    Returns:
        An RDD of ``(name, count)`` pairs.
    """
    non_empty = movies_DF.select(field).filter(movies_DF[field] != '')

    def name_pairs(row):
        # One (name, 1) pair per object in the row's JSON array.
        return [(entry['name'], 1) for entry in json.loads(row[field])]

    # Everything is moved into a single partition before the counts are summed.
    pairs = non_empty.rdd.flatMap(name_pairs).repartition(1)
    return pairs.reduceByKey(lambda left, right: left + right)
def countByGenres():
    """Return genre frequencies as a list of ``{'genre': ..., 'count': ...}`` dicts."""
    return [
        {'genre': name, 'count': total}
        for name, total in countByJson('genres').collect()
    ]

In [8]:
# Scratch check: the un-aggregated (genre name, 1) pairs. Rows whose 'genres'
# value is empty or equal to the literal text 'genres' are dropped first; no
# reduceByKey step is applied here.
a = (
    movies_DF
    .select("genres")
    .filter(movies_DF["genres"] != '')
    .filter(movies_DF["genres"] != 'genres')
    .rdd
    .flatMap(lambda row: [(genre['name'], 1) for genre in json.loads(row['genres'])])
)

In [9]:
# Spot-check a single (genre, count) pair.
countByJson('genres').take(1)

[('Action', 1154)]

In [84]:
# Serialize the genre counts to JSON and write them to disk.
save('TMDB/genres.json', json.dumps(countByGenres()))

## 前 100 个常见关键词

In [11]:
# Show the first two raw 'keywords' values without truncating them.
movies_DF.select('keywords').show(2, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|keywords                                                                                                                                                                                                                                                             

In [12]:
def countByKeywords():
    """Return the 100 most frequent keywords, most frequent first.

    Returns:
        A list of ``{'keyword': ..., 'count': ...}`` dicts.
    """
    top_keywords = (
        countByJson('keywords')
        .sortBy(lambda pair: pair[-1], ascending=False)
        .take(100)
    )
    return [{'keyword': word, 'count': total} for word, total in top_keywords]


keywords_list = countByKeywords()

In [14]:
# Display the five most frequent keywords.
keywords_list[:5]

[{'keyword': 'woman director', 'count': 324},
 {'keyword': 'independent film', 'count': 318},
 {'keyword': 'duringcreditsstinger', 'count': 307},
 {'keyword': 'based on novel', 'count': 197},
 {'keyword': 'murder', 'count': 189}]

In [15]:
# Serialize the top keywords to JSON and write them to disk.
save('TMDB/keywords.json', json.dumps(countByKeywords()))

## TMDB 中最常见的 10 种预算数

In [16]:
# Show the first five 'budget' values.
movies_DF.select('budget').show(5)

+---------+
|   budget|
+---------+
|237000000|
|300000000|
|245000000|
|250000000|
|260000000|
+---------+
only showing top 5 rows



In [23]:
# Each element of the result is a JSON string.
movies_DF.filter(movies_DF["budget"] != 0).groupBy('budget').count().orderBy('count', ascending=False).toJSON().take(10)

['{"budget":"20000000","count":144}',
 '{"budget":"30000000","count":128}',
 '{"budget":"25000000","count":126}',
 '{"budget":"40000000","count":123}',
 '{"budget":"15000000","count":120}',
 '{"budget":"35000000","count":102}',
 '{"budget":"10000000","count":101}',
 '{"budget":"50000000","count":101}',
 '{"budget":"60000000","count":86}',
 '{"budget":"5000000","count":84}']

In [24]:
# Same query as the previous cell, but returning Row objects instead of JSON strings.
movies_DF.filter(movies_DF["budget"] != 0).groupBy('budget').count().orderBy('count', ascending=False).rdd.take(10)

[Row(budget='20000000', count=144),
 Row(budget='30000000', count=128),
 Row(budget='25000000', count=126),
 Row(budget='40000000', count=123),
 Row(budget='15000000', count=120),
 Row(budget='35000000', count=102),
 Row(budget='10000000', count=101),
 Row(budget='50000000', count=101),
 Row(budget='60000000', count=86),
 Row(budget='5000000', count=84)]

In [25]:
def countByBudget():
    """Return the 10 most common non-zero budgets.

    Returns:
        A list of ``{'budget': ..., 'count': ...}`` dicts ordered by
        descending count.
    """
    has_budget = movies_DF.filter(movies_DF["budget"] != 0)
    by_frequency = (
        has_budget
        .groupBy('budget')
        .count()
        .orderBy('count', ascending=False)
    )
    # Round-trip through JSON strings to turn every Row into a plain dict.
    return by_frequency.toJSON().map(json.loads).take(10)


budget_res = countByBudget()
budget_res

[{'budget': '20000000', 'count': 144},
 {'budget': '30000000', 'count': 128},
 {'budget': '25000000', 'count': 126},
 {'budget': '40000000', 'count': 123},
 {'budget': '15000000', 'count': 120},
 {'budget': '35000000', 'count': 102},
 {'budget': '10000000', 'count': 101},
 {'budget': '50000000', 'count': 101},
 {'budget': '60000000', 'count': 86},
 {'budget': '5000000', 'count': 84}]

In [27]:
# Serialize the most common budgets to JSON and write them to disk.
save('TMDB/budget.json', json.dumps(countByBudget()))

## TMDb 中最常见电影时长(只展示电影数不少于 100 的时长)

In [30]:
def distributionOfRuntime():
    """Return every non-zero runtime shared by at least 100 rows.

    Returns:
        A list of ``{'runtime': ..., 'count': ...}`` dicts.
    """
    runtime_counts = (
        movies_DF
        .filter(movies_DF['runtime'] != 0)
        .groupBy('runtime')
        .count()
    )
    frequent = runtime_counts.filter("count>=100")
    # Round-trip through JSON strings to turn every Row into a plain dict.
    return frequent.toJSON().map(json.loads).collect()


runtime = distributionOfRuntime()
runtime[:2]

[{'runtime': '101', 'count': 111}, {'runtime': '98', 'count': 133}]

In [31]:
# Serialize the runtime distribution to JSON and write it to disk.
save('TMDB/Runtime.json', json.dumps(distributionOfRuntime()))

## 生产电影最多的 10 大公司

In [33]:
# Show the first five raw 'production_companies' values without truncating them.
movies_DF.select('production_companies').show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|production_companies                                                                                                                                                                                       |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{"name": "Ingenious Film Partners", "id": 289}, {"name": "Twentieth Century Fox Film Corporation", "id": 306}, {"name": "Dune Entertainment", "id": 444}, {"name": "Lightstorm Entertainment", "id": 574}]|
|[{"name": "Walt Disney Pictures", "id": 2}, {"name": "Jerry Bruckheimer Films", "id": 130}, {"name": "Second Mate Productions", "id": 19936}]                                  

In [35]:
# Five companies with the highest counts, sorted by the count (last tuple element).
countByJson('production_companies').sortBy(lambda x: x[-1], ascending=False).take(5)

[('Warner Bros.', 319),
 ('Universal Pictures', 311),
 ('Paramount Pictures', 285),
 ('Twentieth Century Fox Film Corporation', 222),
 ('Columbia Pictures', 201)]

In [36]:
def countByCompanies():
    """Return the 10 production companies with the highest counts.

    Returns:
        A list of ``{'company': ..., 'film_count': ...}`` dicts ordered by
        descending count.
    """
    top_companies = (
        countByJson('production_companies')
        .sortBy(lambda pair: pair[-1], ascending=False)
        .take(10)
    )
    return [
        {'company': name, 'film_count': total}
        for name, total in top_companies
    ]


save('TMDB/company.json', json.dumps(countByCompanies()))

## TMDb 中的 10 大电影语言

In [37]:
# List the available column names.
movies_DF.columns

['budget',
 'genres',
 'homepage',
 'id',
 'keywords',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'vote_average',
 'vote_count']

In [39]:
# Show the first four raw 'spoken_languages' values without truncating them.
movies_DF.select('spoken_languages').show(4, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|spoken_languages                                                                                                                                                                                                    |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{"iso_639_1": "en", "name": "English"}, {"iso_639_1": "es", "name": "Espa\u00f1ol"}]                                                                                                                               |
|[{"iso_639_1": "en", "name": "English"}]                                                                                                   

In [41]:
def countByLanguage():
    """Return the 10 most frequent spoken languages, ignoring empty names.

    Returns:
        A list of ``{'language': ..., 'count': ...}`` dicts ordered by
        descending count.
    """
    named = countByJson('spoken_languages').filter(lambda pair: pair[0] != "")
    top_languages = named.sortBy(lambda pair: pair[-1], ascending=False).take(10)
    return [{'language': name, 'count': total} for name, total in top_languages]


countByLanguage()

[{'language': 'English', 'count': 4485},
 {'language': 'Français', 'count': 437},
 {'language': 'Español', 'count': 351},
 {'language': 'Deutsch', 'count': 262},
 {'language': 'Italiano', 'count': 188},
 {'language': 'Pусский', 'count': 185},
 {'language': '普通话', 'count': 107},
 {'language': '日本語', 'count': 97},
 {'language': 'Português', 'count': 68},
 {'language': 'العربية', 'count': 67}]

In [42]:
# Serialize the top languages to JSON and write them to disk.
save('TMDB/Language.json', json.dumps(countByLanguage()))

## 预算与评价的关系

考虑预算与评价之间的关系

标题 预算  评价

In [45]:
def budgetVote():
    """Collect (title, budget, vote_average) rows for the budget-vs-rating view.

    Only rows with a non-zero budget and ``vote_count >= 100`` are kept.

    Returns:
        A list of Row objects with fields ``title``, ``budget``, ``vote_average``.
    """
    selected = movies_DF.select('title', 'budget', 'vote_average')
    with_budget = selected.filter(movies_DF['budget'] != 0)
    return with_budget.filter(movies_DF['vote_count'] >= 100).collect()


budgetVote()[:3]

[Row(title='Avatar', budget='237000000', vote_average='7.2'),
 Row(title="Pirates of the Caribbean: At World's End", budget='300000000', vote_average='6.9'),
 Row(title='Spectre', budget='245000000', vote_average='6.3')]

In [46]:
# Serialize the budget/rating rows to JSON and write them to disk.
save('TMDB/bugetvote.json', json.dumps(budgetVote()))

## 发行时间与评价的关系

In [48]:
def dateVote():
    """Collect (release_date, vote_average, title) rows for the date-vs-rating view.

    Only rows with a non-empty release date and ``vote_count > 100`` are kept.

    Returns:
        A list of Row objects with fields ``release_date``, ``vote_average``, ``title``.
    """
    selected = movies_DF.select(movies_DF["release_date"], "vote_average", "title")
    dated = selected.filter(movies_DF["release_date"] != "")
    return dated.filter(movies_DF["vote_count"] > 100).collect()


save('TMDB/date_vote.json', json.dumps(dateVote()))

## 流行度和评价的关系

In [50]:
def popVote():
    """Collect (title, popularity, vote_average) rows for the popularity-vs-rating view.

    Only rows with a non-zero popularity and ``vote_count > 100`` are kept.

    Returns:
        A list of Row objects with fields ``title``, ``popularity``, ``vote_average``.
    """
    selected = movies_DF.select("title", "popularity", "vote_average")
    popular = selected.filter(movies_DF["popularity"] != 0)
    return popular.filter(movies_DF["vote_count"] > 100).collect()


save('TMDB/pop_vote.json', json.dumps(popVote()))

## 公司生产的电影平均分和数量的关系

公司  (评分, 1)


In [56]:
from pyspark.sql.functions import explode

In [71]:
# Keep rows with a non-empty production_companies value and vote_count >= 100.
source = (
    movies_DF
    .filter(movies_DF['production_companies'] != "")
    .filter(movies_DF['vote_count'] >= 100)
)

# Emit one (company name, (rating, 1)) pair per company listed on each row,
# then gather everything into a single partition.
trans = source.rdd.flatMap(
    lambda row: [
        (company['name'], (float(row['vote_average']), 1))
        for company in json.loads(row['production_companies'])
    ]
).repartition(1)
trans.take(5)

[('Ingenious Film Partners', (7.2, 1)),
 ('Twentieth Century Fox Film Corporation', (7.2, 1)),
 ('Dune Entertainment', (7.2, 1)),
 ('Lightstorm Entertainment', (7.2, 1)),
 ('Walt Disney Pictures', (6.9, 1))]

In [72]:
# Sum the (rating, 1) pairs per company, then emit (company, mean rating, movie count).
trans.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])).map(lambda x: (x[0], x[1][0] / x[1][1], x[1][1])).take(3)

[('Ingenious Film Partners', 6.1240000000000006, 25),
 ('Twentieth Century Fox Film Corporation', 6.286813186813187, 182),
 ('Dune Entertainment', 5.9105263157894745, 57)]

In [73]:
def movieVote():
    """Compute the mean rating and movie count for every production company.

    Only rows with a non-empty ``production_companies`` value and
    ``vote_count >= 100`` contribute.

    Returns:
        A list of ``(company, mean vote_average, movie count)`` tuples.
    """
    eligible = (
        movies_DF
        .filter(movies_DF['production_companies'] != "")
        .filter(movies_DF['vote_count'] >= 100)
    )

    def company_scores(row):
        # One (company, (rating, 1)) pair per company listed on the row.
        return [
            (company['name'], (float(row['vote_average']), 1))
            for company in json.loads(row['production_companies'])
        ]

    def add_pairs(left, right):
        # Element-wise sum of two (rating total, movie count) pairs.
        return (left[0] + right[0], left[1] + right[1])

    totals = eligible.rdd.flatMap(company_scores).repartition(1).reduceByKey(add_pairs)
    averaged = totals.map(lambda item: (item[0], item[1][0] / item[1][1], item[1][1]))
    return averaged.collect()

In [74]:
# Serialize the per-company rating statistics to JSON and write them to disk.
# The function defined in this notebook is `movieVote`; calling `moviesVote`
# raises NameError on a fresh kernel.
save('TMDB/movies_vote.json', json.dumps(movieVote()))

## 电影预算和营收的关系

[电影标题，预算，收入]

In [76]:
def budgetRevenue():
    """Collect (title, budget, revenue) rows where both budget and revenue are non-zero.

    Returns:
        A list of Row objects with fields ``title``, ``budget``, ``revenue``.
    """
    selected = movies_DF.select('title', 'budget', 'revenue')
    with_budget = selected.filter(movies_DF['budget'] != 0)
    return with_budget.filter(movies_DF['revenue'] != 0).collect()


save('TMDB/budget_revenue.json', json.dumps(budgetRevenue()))

In [1]:
def func(a: int, l: list):
    """Show how rebinding and in-place mutation of arguments differ.

    The incremented number is printed but only the local name is rebound,
    while the appended element is visible through the caller's list.

    Args:
        a: Number to increment locally.
        l: List that receives an extra ``1`` in place.
    """
    # Rebinds the local name only.
    a = a + 1
    print(a)
    # Mutates the list object that was passed in.
    l.append(1)
    print(l)
# Inputs for the demonstration: an int and a list.
a = 1
l = [0, 2]

# func prints the incremented number and the list after appending to it.
func(a, l)

2
[0, 2, 1]


In [3]:
# Inspect both names after the call: the int is unchanged, the list gained an element.
a, l

(1, [0, 2, 1])