## Libraries

In [1]:
import pandas as pd # library for data manipulation
import matplotlib.pyplot as plt # library for visualization
import seaborn as sns # library for visualization
sns.set() # this command sets the seaborn chart style as the default

from matplotlib.ticker import PercentFormatter #converts values into percentage format

## Bar Chart

In [2]:
# Load the used-car listings that feed the bar chart.
# The path is relative, so the CSV has to sit in the same directory as this notebook;
# otherwise replace it with the full location of the file.
df_used_cars = pd.read_csv(filepath_or_buffer = "bar_chart_data.csv")

FileNotFoundError: [Errno 2] File bar_chart_data.csv does not exist: 'bar_chart_data.csv'

In [None]:
# Display the used cars data frame as the cell output to check what was loaded.
df_used_cars

In [None]:
# Bar chart of the number of listings per brand, drawn with matplotlib's 'bar' function.
brands = df_used_cars["Brand"] # categories along the x-axis
listing_counts = df_used_cars["Cars Listings"] # bar heights on the y-axis

plt.figure(figsize = (9, 6)) # size of the chart
plt.bar(x = brands, height = listing_counts, color = "midnightblue")
plt.title("Cars Listings by Brand", fontsize = 16, fontweight = "bold") # chart title and its formatting
plt.ylabel("Number of Listings", fontsize = 13 ) # title for the y-axis
plt.xticks(rotation = 45, fontsize = 13) # tilt and resize the brand names on the x-axis
plt.yticks(fontsize = 13) # resize the y-axis tick labels
plt.savefig("Used Cars Bar.png") # export the chart as a picture
plt.show() # depending on your environment, the chart may not display properly without this line

## Pie Chart

In [None]:
# Load the pie chart data: the number of cars per engine fuel type.
# There are four engine fuel types in total: Diesel, Gas, Petrol and Other.
df_fuel_engine_type = pd.read_csv(filepath_or_buffer = "pie_chart_data.csv")

In [None]:
# Display the fuel type counts as the cell output.
df_fuel_engine_type

In [None]:
# Use seaborn's 'colorblind' palette as the matplotlib color cycle for the charts that follow.
sns.set_palette(palette = 'colorblind')

In [None]:
# Create a pie chart with the pie function from matplotlib's pyplot.
# autopct labels every wedge with its share of the cars, as a percentage with two decimals.
# Additional label formatting is passed through textprops. Note it is a dictionary.
# 'rotation' has to be a number (degrees): recent matplotlib versions only accept a number or
# 'horizontal'/'vertical' here and raise a ValueError for a numeric string such as '30'.

plt.figure(figsize = (10, 8))
plt.pie(df_fuel_engine_type['Number of Cars'], 
        labels = df_fuel_engine_type['Engine Fuel Type'].values, 
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold', 
                     'rotation' : 30,
                     'color' : 'w'})
plt.legend() # the legend entries come from the wedge labels passed to pie
plt.title('Cars by Engine Fuel Type', fontsize = 18, fontweight = 'bold')
plt.show()

## Stacked Area Chart

In [None]:
# Load the stacked area chart data: engine fuel types of used cars, collected from 1982 until 2016.
df_fuel_engine_types = pd.read_csv(filepath_or_buffer = "stacked_area_chart_data.csv")

In [None]:
# Display the yearly fuel type data as the cell output.
df_fuel_engine_types

In [None]:
# Create a stacked area chart with pyplot's 'stackplot'. On the x-axis we have the time line - year 1982 to 2016.
# On the y-axis we have the three categories, 'Diesel', 'Petrol' and 'Gas' stacked on top of each other. 
# Note that this ordering is chosen specifically so that categories are from largest to smallest.
# This helps us compare the size of the categories, as people have difficulties determining the size of non-rectangular shapes.

# Provide a color list, so that each category can have a specific color.
# Colors should appear in the same order as the stacked area plot categories appear.
# Every hex code needs its leading '#': without it matplotlib does not recognise the string as a color and raises an error.
colors = ["#011638", "#7e2987", "#ef2026"]
# Label list for the legend. Names should appear in the same order as the stacked area plot categories appear. 
labels = ["Diesel", "Petrol", "Gas"]
sns.set_style("white") # Use seaborn's 'white' theme to introduce a white background, instead of the default grey.
plt.figure(figsize = (12, 6))
plt.stackplot(df_fuel_engine_types["Year"],
              df_fuel_engine_types["Diesel"],
              df_fuel_engine_types["Petrol"],
              df_fuel_engine_types["Gas"],
              colors = colors,
              edgecolor = 'none')
plt.xticks(df_fuel_engine_types["Year"], rotation = 45) # Include x-axis labels for each year and rotate labels by 45 degrees.
plt.legend(labels = labels, loc = "upper left") # Add a legend and specify its location on the chart.
plt.ylabel("Number of Cars", fontsize = 13)
plt.title("Popularity of Engine Fuel Types (1982 - 2016)", fontsize = 14, weight = "bold")
sns.despine() # Remove top and right border of the chart.
plt.show()

## Line Chart

In [None]:
# Load the data for the line charts.
df_spx_ftse_00_10 = pd.read_csv(filepath_or_buffer = "line_chart_data.csv")

In [None]:
# Display the loaded line chart data as the cell output.
df_spx_ftse_00_10

In [None]:
# Parse the 'Date' column into datetime values and keep the result in a new 'new_date' column.
# Left as plain strings, the dates would not behave as expected in many date operations
# or transformations, so this is a crucial step in any time series analysis.
parsed_dates = pd.to_datetime(df_spx_ftse_00_10["Date"])
df_spx_ftse_00_10["new_date"] = parsed_dates

In [None]:
# Display the converted column as the cell output to check the result of the conversion.
df_spx_ftse_00_10["new_date"]

In [None]:
# Line chart comparing the two indices, S&P 500 and FTSE 100, from 2000 until the end of 2010.
# No colors are passed, so the lines use the first two colors of the active color cycle.
labels = ["S&P 500", "FTSE 100"] # legend entries, in the order the lines are drawn
dates_00_10 = df_spx_ftse_00_10["new_date"]

plt.figure(figsize = (20, 8))
for index_column in ("GSPC500", "FTSE100"):
    plt.plot(dates_00_10, df_spx_ftse_00_10[index_column])
# chart formatting elements: axis labels, title and legend
plt.xlabel("Date")
plt.ylabel("Returns")
plt.title("S&P vs FTSE Returns (2000 - 2010)", fontsize = 14, fontweight = "bold")
plt.legend(labels = labels, fontsize = "large")
plt.show()

In [None]:

# Build a separate data frame restricted to the second half of 2008:
# rows dated from 07/01/2008 up to and including 12/31/2008.
h2_start, h2_end = '2008-07-01', '2008-12-31'
trade_dates = df_spx_ftse_00_10["new_date"]
in_h2_2008 = (trade_dates >= h2_start) & (trade_dates <= h2_end)
df_spx_ftse_H2_08 = df_spx_ftse_00_10[in_h2_2008]

In [None]:
# Display the filtered data frame as the cell output.
df_spx_ftse_H2_08

In [None]:
# Line chart for H2 (second half) of 2008: S&P 500 in dark blue vs FTSE 100 in crimson red.
# The legend reuses the 'labels' list defined in the cell of the 2000 - 2010 line chart.
dates_h2_08 = df_spx_ftse_H2_08["new_date"]
index_colors = (("GSPC500", "midnightblue"), ("FTSE100", "crimson"))

plt.figure(figsize = (20, 8))
for index_column, line_color in index_colors:
    plt.plot(dates_h2_08, df_spx_ftse_H2_08[index_column], color = line_color)
plt.xlabel("Date")
plt.ylabel("Returns")
plt.title("S&P vs FTSE Returns (H2 2008)", fontsize = 14, fontweight = "bold")
plt.legend(labels = labels, fontsize = "large")
plt.show()

## Histogram

In [None]:
# Load the real estate data set used for the histogram.
df_real_estate = pd.read_csv(filepath_or_buffer = "histogram_data.csv")

In [None]:
# Display the real estate data loaded for the histogram as the cell output.
df_real_estate

In [None]:
# Histogram of the real estate prices, split into 8 bins.
sns.set_style("white") # white background, to avoid the grey background and grid
prices = df_real_estate["Price"] # the variable the histogram is built on

plt.figure(figsize = (8, 6)) # size of the figure
plt.hist(prices, bins = 8, color = "#108A99")
plt.xlabel("Price in (000's $)")
plt.ylabel("Number of Properties")
plt.title("Distribution of Real Estate Prices", fontsize = 14, weight = "bold")
sns.despine() # drop the top and right border of the graph
plt.show()

## Scatter Plot

In [None]:
# Load the real estate data for the scatter plots.
# Note: this re-binds df_real_estate, replacing the histogram data loaded earlier.
df_real_estate = pd.read_csv(filepath_or_buffer = "scatter_data.csv")

In [None]:
# Display the real estate data loaded for the scatter plots as the cell output.
df_real_estate

In [None]:
# Scatter plot of area against price with matplotlib's pyplot.
# Depending on the features you need, pyplot may offer fewer high level options than seaborn,
# but technically it should perform just as well.
area = df_real_estate['Area (ft.)']
price = df_real_estate['Price']
building_type = df_real_estate['Building Type'] # third feature, shown through the point colors

plt.figure(figsize = (12, 8)) # size of the figure
scatter = plt.scatter(area,
                      price,
                      c = building_type,
                      cmap = 'viridis', # color map for the third variable
                      alpha = 0.6) # transparency of the points, used to avoid overplotting
# legend, title and axis labels
legend_handles, legend_labels = scatter.legend_elements()
plt.legend(legend_handles,
           legend_labels,
           loc = "upper left",
           title = "Building Type")
plt.title("Relationship between Area and Price of California Real Estate", 
          fontsize = 14, 
          weight = "bold")
plt.xlabel("Area (sq. ft.)", weight = "bold")
plt.ylabel("Price (000's of $)")
plt.show()

In [None]:
# The seaborn scatter plot is used very often, as it is usually very intuitive to handle.
# Ease of use is a main advantage of the library, though in some cases
# it doesn't provide as many options as matplotlib.
# x and y are passed by keyword: recent seaborn releases accept only 'data' positionally,
# so passing the two columns as positional arguments raises a TypeError there.
plt.figure(figsize = (12, 8))
sns.scatterplot(x = df_real_estate['Area (ft.)'],
                y = df_real_estate['Price'],
                hue = df_real_estate['Building Type'], # third variable is added with the hue parameter
                palette = ['black', 'darkblue', 'purple', 'pink', 'white'], # colors for the third variable
                s = 100) # size of points on the scatter
plt.title("Relationship between Area and Price of California Real Estate", 
          fontsize = 14, 
          weight = "bold")
plt.xlabel("Area (sq. ft.)", weight = "bold")
plt.ylabel("Price (000's of $)")
plt.show()

## Regression Plot

In [None]:
# Load the marketing data set for the regression plots.
df_ad_expenditure = pd.read_csv(filepath_or_buffer = "scatter_plot_ii.csv")

In [None]:
# Display the marketing data as the cell output.
df_ad_expenditure

In [None]:
# Regression plot of sales against ad budget with seaborn's regplot.
# A regplot accepts x and y in a variety of formats; here they are column names looked up in 'data'.
point_style = {'color': 'k'} # color for the points
fit_line_style = {'color': 'red'} # color for the regression line

sns.set(rc = {'figure.figsize': (9,6)}) # the figure size is controlled through the rc dictionary
sns.regplot(data = df_ad_expenditure,
            x = "Budget",
            y = "Sales",
            scatter_kws = point_style,
            line_kws = fit_line_style)
plt.title("Effect of Ad Expenditure on Sales", fontsize = 14, weight = "bold")
plt.xlabel("Ad Expenditure in (000's $)")
plt.ylabel("Sales in (000's Units)")
plt.show()

In [None]:
# The same regression drawn with seaborn's lmplot; x and y are given as column names of 'data'.
lm_point_style = {'color': 'k'} # color for the points
lm_line_style = {'color': 'red'} # color for the regression line

sns.lmplot(data = df_ad_expenditure,
           x = "Budget",
           y = "Sales",
           height = 10, # size of the plot
           scatter_kws = lm_point_style,
           line_kws = lm_line_style)
plt.title("Effect of Ad Expenditure on Sales", fontsize = 14, weight = "bold")
plt.xlabel("Ad Expenditure in (000's $)")
plt.ylabel("Sales in (000's Units)")
plt.show()

## Bar and Line Chart

In [None]:
# Read in the KDnuggets survey data file for the combined bar and line chart.
df_kdnuggets = pd.read_csv(filepath_or_buffer = "bar_line_chart_data.csv")

In [None]:
# The data frame which we'll use to create the combo plot. The two charts will share the x-axis, which is Year.
# The bar chart will have the number of Participants on the primary y-axis (on the left-hand side of the chart).
# The line chart will have the Python Users in percentages on a secondary y-axis (on the right-hand side of the chart).
df_kdnuggets

In [None]:
# Creating the combination chart. Here we use a figure with axes.
# The first axes ('ax') holds the bar chart; the second ('ax1') is a twin of it that shares
# the x-axis and holds the line chart on its own y-axis.
fig, ax = plt.subplots(figsize = (10, 7))

# creating and styling the bar chart
ax.bar(df_kdnuggets["Year"],
       df_kdnuggets["Participants"],
       color = "k")
ax.set_ylabel("Number of Participants", 
              weight = "bold")
ax.tick_params(axis = "y", 
               width = 2, 
               labelsize = "large")
ax1 = ax.twinx()
# changing the secondary y-axis to display percentages on a scale from 0% to 100%
ax1.set_ylim(0, 1)
ax1.yaxis.set_major_formatter(PercentFormatter(xmax = 1.0)) # xmax = 1.0 means a data value of 1.0 is shown as 100%
# creating and styling the line chart
ax1.plot(df_kdnuggets["Year"],
         df_kdnuggets["Python Users"], 
         color = "#b60000", 
         marker = "D")
ax1.set_ylabel("Python Users", 
               color = "#b60000", 
               weight = "bold")
ax1.tick_params(axis = "y", 
                colors = "#b60000", 
                width = 2, 
                labelsize = "large")
# the font size is given as a number, like in the other charts, rather than as the string "14"
ax.set_title("KD Nuggets Survey Python Users (2012 - 2019)", fontsize = 14, weight = "bold")
plt.show()