In [None]:
#RUN THIS FIRST

#initialize notebook
import pandas as pd
import matplotlib as plt

#download dataset
!wget "https://raw.githubusercontent.com/Pala-Dylan/STS-Python-for-Data-Analytics/main/TMDB.csv"

# 1. Introduction to Data Analysis with Pandas

### Open and view a CSV in a dataframe

In [None]:
#open csv and load it into a dataframe
tmdb = pd.read_csv('TMDB.csv')
# #print whole dataframe (long output) -- uncomment to run
# print(tmdb)
#show "head" -- first 5 rows
#ending the cell with the expression (instead of print) lets the notebook render it as a table
tmdb.head()

### Select columns and compute summary statistics

In [None]:
#pull a single column out of the dataframe by its label
tmdb['name']


In [None]:
#select another column the same way, this time 'vote_average'
tmdb['vote_average']

In [None]:
#get summary statistics for a column
#for a numeric column, describe() reports count, mean, std, min, quartiles and max in one call
tmdb['vote_average'].describe()

In [None]:
#we can also get individual summary statistics
vote_avg = tmdb['vote_average'] #can save df columns as variables
#a cell only displays its last expression, so uncomment one of the lines below at a time
vote_avg.mean()
#vote_avg.count() #uncomment to run
#vote_avg.std() #uncomment to run
#vote_avg.min() #uncomment to run
#vote_avg.max() #uncomment to run

In [None]:
#A condition on a column is tested against every row at once. We can later use the result to keep only the rows that pass.

#For example, to find all shows produced in Korea:
tmdb['origin_country'] == 'KR'

#Notice the output is just one "True" or "False" per row.
#This is a Boolean Series -- it marks which rows satisfy the condition, but does not return the rows themselves.

In [None]:
#Passing the Boolean Series back into tmdb[...] keeps only the matching rows; save them to a variable for further analysis
Korean_shows = tmdb[tmdb['origin_country'] == 'KR']
#count of shows produced in Korea
print(f"Number of shows produced in Korea: {len(Korean_shows)}")
#summary statistics for the Korean shows only
Korean_shows.describe()

# 2. Introduction to Data Visualization with Matplotlib

Let's make a bar chart analyzing the number of shows rated 8.0 or above by original language.

In [None]:
#x-axis: original language
#y-axis: number of shows rated 8.0 or above

#first step: keep only the rows whose vote_average is at least 8.0 -- these are the shows we will count
ratings = tmdb.loc[tmdb['vote_average'] >= 8.0]
ratings.head()


In [None]:
#count how many of the highly rated shows there are for each language
ratings_by_language = (
    ratings
    .groupby('original_language')   #one group per language
    .size()                         #number of rows in each group
    .reset_index(name='count')      #back to a dataframe, with the sizes in a 'count' column
)
ratings_by_language

In [None]:
#That is too many bars for one chart. Let's keep only the top 5 languages

#Order the rows by count, largest first, then slice off the first 5
top_5_ratings = ratings_by_language.sort_values('count', ascending=False).iloc[:5]
top_5_ratings

In [None]:
#First attempt: plot it with all the default settings
top_5_ratings.plot(kind='bar')


In [None]:
#Uh oh. The x-axis looks weird. What do we do?

#tell pandas which column goes on each axis, then set all the labels!
#legend=False hides the legend, which would only repeat the word 'count'
ax = top_5_ratings.plot.bar(x='original_language', y='count', legend=False)
ax.set_xlabel('Language')  #Set X-axis label
ax.set_ylabel('Count')           #Set Y-axis label
#the data was filtered with vote_average >= 8.0, so the title says ">=" rather than ">"
ax.set_title('Top 5 Languages with Highly Rated (>= 8.0) Shows')  #Set the plot title

plt.show() #show plot