## 1. 读取5000IMDB电影数据

In [1]:
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
pd_movies = pd.read_csv('movie_data/movie_metadata.csv')

In [3]:
pd_movies.head(2)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0


## 2. 筛选、计算个数

In [4]:
selected_data = pd_movies[['movie_title','language','country','imdb_score']]

In [5]:
selected_data.head(2)

Unnamed: 0,movie_title,language,country,imdb_score
0,Avatar,English,USA,7.9
1,Pirates of the Caribbean: At World's End,English,USA,7.1


In [6]:
average_score = selected_data['imdb_score'].mean()

In [7]:
pd_above_avgs = selected_data[selected_data['imdb_score'] > average_score]
pd_above_avgs = pd_above_avgs[pd_above_avgs['language'] == 'English']

In [8]:
pd_above_avgs.head()

Unnamed: 0,movie_title,language,country,imdb_score
0,Avatar,English,USA,7.9
1,Pirates of the Caribbean: At World's End,English,USA,7.1
2,Spectre,English,UK,6.8
3,The Dark Knight Rises,English,USA,8.5
5,John Carter,English,USA,6.6


In [9]:
pd_above_avgs.dropna().head()

Unnamed: 0,movie_title,language,country,imdb_score
0,Avatar,English,USA,7.9
1,Pirates of the Caribbean: At World's End,English,USA,7.1
2,Spectre,English,UK,6.8
3,The Dark Knight Rises,English,USA,8.5
5,John Carter,English,USA,6.6


In [10]:
pd_groupby_country = pd_above_avgs.groupby('country').count()

In [11]:
pd_groupby_country.head(2)

Unnamed: 0_level_0,movie_title,language,imdb_score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,28,28,28
Belgium,1,1,1


In [12]:
pd_groupby_country.drop(columns=['imdb_score'],inplace=True)

In [13]:
pd_groupby_country.drop(columns=['language'],inplace=True)

In [14]:
pd_groupby_country.rename(columns={'movie_title':'num'},inplace=True)
pd_groupby_country.head()

Unnamed: 0_level_0,num
country,Unnamed: 1_level_1
Australia,28
Belgium,1
Cameroon,1
Canada,52
Chile,1


In [15]:
pd_groupby_country.to_csv('movie_data/num_country_above_avgscore.csv')

In [16]:
pd_groupby_country.to_json('movie_data/num_country_above_average.json', orient='table')

## 3. 评分和预算

In [17]:
pd_budget_score = pd_movies[['genres','budget','imdb_score']]
genre_list = ['Crime', 'Family', 'Horror', 'Comedy', 'Romance', 'Action', 'Animation', 'History', 
              'Drama', 'Thriller', 'Western', 'Biography', 'Sci-Fi', 'War', 'Adventure', 'Music', 
              'Fantasy', 'Mystery', 'Sport'
             ]
pd_budget_score.head()

Unnamed: 0,genres,budget,imdb_score
0,Action|Adventure|Fantasy|Sci-Fi,237000000.0,7.9
1,Action|Adventure|Fantasy,300000000.0,7.1
2,Action|Adventure|Thriller,245000000.0,6.8
3,Action|Thriller,250000000.0,8.5
4,Documentary,,7.1


## 4. 导演和评分

In [20]:
pd_cleandata = pd.read_csv('movie_data/movie_cleandata.csv')
pd_cleandata.head(2)

Unnamed: 0.1,Unnamed: 0,movie_title,avg_rating,total_votes,genre1,genre2,genre3,#_10s,#_9s,#_8s,...,actor_2_name,gross,actor_1_name,#_voted_users,actor_3_name,language,country,budget,title_year,imdb_score
0,0,12 Years a Slave,8.1,496092,Biography,Drama,History,75556,126223,161460,...,Scoot McNairy,56667870.0,Quvenzhané Wallis,439176,Taran Killam,English,USA,20000000.0,2013.0,8.1
1,1,127 Hours,7.6,297075,Adventure,Biography,Drama,28939,44110,98845,...,Treat Williams,18329466.0,James Franco,279179,Kate Burton,English,USA,18000000.0,2010.0,7.6


In [21]:
pd_cleandata['director_name'].unique().shape

(73,)

In [22]:
pd_director = pd_cleandata[['director_name','imdb_score']].copy()
pd_director.head(2)

Unnamed: 0,director_name,imdb_score
0,Steve McQueen,8.1
1,Danny Boyle,7.6


In [23]:
pd_director.groupby('director_name').mean().head()

Unnamed: 0_level_0,imdb_score
director_name,Unnamed: 1_level_1
Adam McKay,7.8
Alejandro G. Iñárritu,7.95
Alex Garland,7.7
Alexander Payne,7.8
Alfonso Cuarón,7.8


## 6. 各年份评分变化（2010-2016电影质量）

In [24]:
pd_movies.head(5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


### 6.1 计算各年份电影均分

In [32]:
df_year_score = pd_movies.groupby('title_year')['imdb_score'].mean()
df_year_score

title_year
1916.0    8.000000
1920.0    4.800000
1925.0    8.300000
1927.0    8.300000
1929.0    7.150000
1930.0    7.800000
1932.0    6.600000
1933.0    7.100000
1934.0    8.200000
1935.0    7.800000
1936.0    7.850000
1937.0    7.750000
1938.0    7.500000
1939.0    8.166667
1940.0    7.420000
1941.0    7.800000
1942.0    8.000000
1943.0    7.000000
1944.0    6.500000
1945.0    7.550000
1946.0    7.866667
1947.0    7.100000
1948.0    7.333333
1949.0    7.300000
1950.0    7.000000
1951.0    7.400000
1952.0    7.575000
1953.0    7.100000
1954.0    7.580000
1955.0    7.650000
            ...   
1987.0    6.412500
1988.0    6.603226
1989.0    6.787879
1990.0    6.790000
1991.0    6.493548
1992.0    6.923529
1993.0    6.727083
1994.0    6.596296
1995.0    6.565714
1996.0    6.418182
1997.0    6.485593
1998.0    6.485821
1999.0    6.407143
2000.0    6.243860
2001.0    6.305319
2002.0    6.311962
2003.0    6.285799
2004.0    6.464019
2005.0    6.356109
2006.0    6.423013
2007.0    6.510294
2

**存储为json**

In [37]:
df_year_score.to_json('movie_data/year_avgs.json',orient='records')

In [38]:
df_year_score.index

Float64Index([1916.0, 1920.0, 1925.0, 1927.0, 1929.0, 1930.0, 1932.0, 1933.0,
              1934.0, 1935.0, 1936.0, 1937.0, 1938.0, 1939.0, 1940.0, 1941.0,
              1942.0, 1943.0, 1944.0, 1945.0, 1946.0, 1947.0, 1948.0, 1949.0,
              1950.0, 1951.0, 1952.0, 1953.0, 1954.0, 1955.0, 1956.0, 1957.0,
              1958.0, 1959.0, 1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0,
              1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0,
              1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0,
              1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0,
              1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0,
              1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0,
              2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0,
              2014.0, 2015.0, 2016.0],
             dtype='float64', name='title_year')

### 6.2 各年份电影数量

In [42]:
df_year_num = pd_movies.groupby('title_year').count()
df_year_num

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
title_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1916.0,1,1,1,1,1,1,1,1,0,1,...,1,1,0,1,1,1,1,1,1,1
1920.0,1,1,1,1,1,1,1,1,1,1,...,1,1,0,1,0,1,1,1,1,1
1925.0,1,1,1,1,1,1,1,1,0,1,...,1,1,0,1,1,1,1,1,1,1
1927.0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1929.0,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,1,2,2,2,2
1930.0,1,1,1,1,1,1,1,1,0,1,...,1,1,1,1,1,1,1,1,1,1
1932.0,1,1,1,1,1,1,1,1,0,1,...,1,1,1,1,1,1,1,1,1,1
1933.0,2,2,2,2,2,2,2,2,1,2,...,2,2,2,2,2,2,2,2,2,2
1934.0,1,1,1,1,1,1,1,1,0,1,...,1,1,1,1,1,1,1,1,1,1
1935.0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
