In [None]:
# import modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Build a small two-column table of fruit counts and display it
data = dict(
    apples=[3, 2, 0, 1],
    oranges=[0, 3, 7, 2],
)
purchases = pd.DataFrame(data=data)
purchases

In [None]:
# Rebuild the table with a name as the label of each row instead of the default integer index
purchases = pd.DataFrame(
    data=data,
    index=['June', 'Robert', 'Lily', 'David'],
)
purchases

In [None]:
# read dataset from file
df = pd.read_csv('data/purchases.csv')
df

In [None]:
# read from and update file
df = pd.read_csv('data/purchases.csv', index_col=0)
df

In [None]:
# export dataset
df.to_csv('data/purchases.csv')

In [None]:
# Load the IMDB movie data, indexing the rows by the 'Title' column
movies_df = pd.read_csv(
    'data/IMDB-Movie-Data.csv',
    index_col='Title',
)
movies_df.head()

In [None]:
# show head
movies_df.head()

In [None]:
# show tail
movies_df.tail()

In [None]:
# show dataset info
movies_df.info()

In [None]:
# show dataset dimensions
movies_df.shape

In [None]:
# Combine datasets (example): stack movies_df on top of itself so every row
# appears twice.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# like append, it returns a new frame and leaves movies_df untouched.
temp_df = pd.concat([movies_df, movies_df])
temp_df.head()

In [None]:
# Drop repeated rows, keeping the first occurrence of each
temp_df = temp_df.drop_duplicates(keep='first')
temp_df.head()

In [None]:
# list columns
movies_df.columns

In [None]:
# rename columns
movies_df.rename(columns={
	'Runtime (Minutes)': 'Runtime',
	'Revenue (Millions)': 'Revenue_millions'
}, inplace=True)
movies_df.head()

In [None]:
# rename all columns to lowercase
movies_df.columns = [col.lower() for col in movies_df]
movies_df.head()

In [None]:
# show sum of incomplete data
movies_df.isnull().sum()

In [None]:
# remove NaN data
movies_df.dropna(axis=1).shape

In [None]:
# display single column
revenue=movies_df['revenue_millions']
revenue.head()

In [None]:
# find mean of column
revenue_mean=revenue.mean()
revenue_mean

In [None]:
# Replace missing revenue values with the column mean.
# The filled result is assigned back to the column explicitly: calling
# fillna(inplace=True) on the extracted Series only updates movies_df while that
# Series happens to be a view of the frame, which is not the case under pandas
# Copy-on-Write, so the frame would silently keep its NaN values.
movies_df['revenue_millions'] = revenue.fillna(revenue_mean)
revenue = movies_df['revenue_millions']  # keep `revenue` referring to the filled column
revenue.head()

In [None]:
# use describe
movies_df.describe()

In [None]:
# use describe on a column
movies_df['genre'].describe()

In [None]:
# show frequency of values in a column
movies_df['genre'].value_counts().head()

In [None]:
# Display the pairwise correlation of the numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where corr() no longer drops
# non-numeric columns on its own and raises on text columns such as 'director'.
movies_df.corr(numeric_only=True)

In [None]:
# see type Series from string selection
type(movies_df[
	'genre'
])

In [None]:
# see type DataFrame from array selection
type(movies_df[
	['genre']
])

In [None]:
# see type DataFrame from array (multiple)
type(movies_df[
	['genre', 'rating']
])

In [None]:
# select columns from data
movies_df[['genre', 'rating']].head()

In [None]:
# locate row by index data
movies_df.loc['Prometheus']

In [None]:
# locate row by index location
movies_df.iloc[0]

In [None]:
# return result of condition
(movies_df['director'] == 'Ridley Scott').head()

In [None]:
# return data based on string condition
movies_df[movies_df['director'] == 'Ridley Scott'].head()

In [None]:
# return data based on numerical condition
movies_df[movies_df['rating'] >= 8.6].head()

In [None]:
# return data based on multiple conditions
movies_df[(movies_df['director'] == 'Christopher Nolan') | (movies_df['director'] == 'Ridley Scott')].head()

In [None]:
# return data based on multiple conditions (concise)
movies_df[movies_df['director'].isin(
	['Christopher Nolan', 'Ridley Scott']
)].head()

In [None]:
# Rows from 2005 through 2010 rated above 8.0 whose revenue is below the
# 25th percentile of the revenue column
movies_df[
    (movies_df['year'] >= 2005)
    & (movies_df['year'] <= 2010)
    & (movies_df['rating'] > 8.0)
    & (movies_df['revenue_millions'] < movies_df['revenue_millions'].quantile(0.25))
]

In [None]:
# Helper that turns a numeric rating into a category label
def rating_function(x):
    """Return 'good' when the rating x is at least 8.0, otherwise 'bad'."""
    return 'good' if x >= 8.0 else 'bad'

In [None]:
# Derive a 'rating_category' column by passing every rating through rating_function
movies_df['rating_category'] = movies_df['rating'].apply(func=rating_function)
movies_df.head()

In [None]:
# Set the matplotlib defaults used by the plots: font size and figure size
plt.rcParams['font.size'] = 20
plt.rcParams['figure.figsize'] = (10, 8)

In [None]:
# Scatter plot of revenue against rating, drawn through the DataFrame's plot method
movies_df.plot(x='rating', y='revenue_millions', kind='scatter', title='Revenue (millions) v Rating');

In [None]:
# Seaborn regression plot of rating (x) against revenue (y) on its own figure
fig = plt.figure(figsize=(10, 7))
sns.regplot(x=movies_df['rating'], y=movies_df['revenue_millions']);
plt.title('Rating v Revenue');