# Data Visualisation with Python 


<li> <b>  Data visualization refers to the techniques used to communicate data or information by encoding it as visual objects (e.g., points, lines or bars). <br>
<li> <b>  The main goal of data visualization is to communicate information clearly and effectively through graphical means

**Check out this workshop for an introduction to Pandas**

*  [Pandas Workshop](https://github.com/IBMDeveloperUK/python-pandas-workshop)
* [Code Pattern](https://developer.ibm.com/technologies/data-science/tutorials/data-analysis-in-python-using-pandas)

## DataViz using Pandas, Matplotlib and Seaborn

In [4]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Three (x, y) points; plt.plot joins them with straight line segments.
x, y = [1, 4, 7], [1, 6, 9]

plt.plot(x, y)
# Optional: uncomment to add a title. It has to be set before plt.show(),
# because show() finishes the current figure.
# plt.title("Test Plot")
plt.show()

In [None]:
# Ten rows: xval counts 1..10, yval is drawn from a standard normal distribution.
df = pd.DataFrame(
    {
        'xval': range(1, 11),
        'yval': np.random.randn(10),
    }
)

# Columns are referenced by name; `data=` tells matplotlib where to look them up.
plt.plot('xval', 'yval', data=df)
plt.show()


### [Lineplot](https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.plot.html)

In [None]:
# Exam scores for ten students, indexed by student ID.
StudentID = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201]
Scores = [9.2, 8.6, 8, 7.2, 6.9, 7, 6.5, 8.2, 7.5, 6.3]

# Build the frame from the lists above rather than retyping the same literals,
# so the two representations cannot drift apart.
Data = {'StudentID': StudentID, 'Scores': Scores}
df = pd.DataFrame(Data, columns=['StudentID', 'Scores'])

# Draw the line once. A second, unstyled plt.plot() of the same data would only
# add a hidden duplicate line underneath this one on the same axes.
plt.plot(StudentID, Scores, color='red', marker='o')
plt.title('Exam Scores', fontsize=14)
plt.xlabel('Student ID', fontsize=14)
plt.ylabel('Scores', fontsize=14)
plt.grid(True)
plt.show()



In [None]:
# Both score series share the same student IDs, so the x-values are defined once.
StudID = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201]
Theory_Scores = [9.2, 8.6, 8, 7.2, 6.9, 7, 6.5, 8.2, 7.5, 6.3]
Prac_Scores = [9, 8.2, 7.5, 7, 7.5, 6.8, 7, 8.6, 7.9, 6.5]

# Two lines on the same axes; `label` supplies the text shown by plt.legend().
plt.plot(StudID, Theory_Scores, label="Theory Scores", color='red', marker='o')
plt.plot(StudID, Prac_Scores, label="Practical Scores", color='blue', marker='x')

plt.title('Exam Scores', fontsize=14)
plt.xlabel('Student ID', fontsize=14)
plt.ylabel('Scores', fontsize=14)
plt.legend()
plt.grid(True)
plt.show()

## Linspace 

**`numpy.linspace` is a NumPy function for creating numeric sequences: it returns evenly spaced values over a specified interval.**

**Read More here : [Linspace](https://numpy.org/doc/stable/reference/generated/numpy.linspace.html)**

In [None]:
# 100 evenly spaced sample points on [0, 10] and their sine values.
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.figure(figsize=(10, 5))
plt.grid(True)

# '-r' is a format string: solid line ('-') drawn in red ('r').
plt.plot(x, y, '-r')
plt.xlabel("X - Axis")
plt.ylabel("Y - Axis")
plt.show()

<div class="alert alert-success"> You can use the following commands to change your plots 
    
<br> 
    
<li> 'b-' - Solid Blue Line
<li> 'r^' - Red Triangles (pointing up)
<li> 'go' - Green Dots 
<li> 'ro' - Red Dots
<li> 'rv' - Red Inverted Triangles (pointing down)
    
<br>
    
<b> Try your own plots using these different combinations </b>

</div>

In [None]:
# Sample sine and cosine on a shared 100-point grid over [0, 10].
x = np.linspace(0, 10, 100)
y1, y2 = np.sin(x), np.cos(x)

# Figure 1: both curves overlaid on a single set of axes.
plt.figure(figsize=(14, 6))
plt.grid(True)
for curve, curve_label in ((y1, 'SinX'), (y2, 'CosX')):
    plt.plot(x, curve, linewidth=4, label=curve_label)
plt.legend(loc='upper right')
plt.xlabel("X - Axis")
plt.ylabel("Y - Axis")
plt.show()

# Figure 2: the same curves side by side in a 1-row, 2-column subplot grid.
plt.figure(figsize=(12, 6))
for slot, (curve, line_format) in enumerate(((y1, '-r'), (y2, '-b')), start=1):
    plt.subplot(1, 2, slot)
    plt.plot(x, curve, line_format)
plt.show()

### [SubPlots](https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.subplot.html?highlight=subplot#matplotlib.pyplot.subplot)

In [None]:
# Switch to the dark style sheet; it stays active for later figures too.
plt.style.use(['dark_background'])

# Re-draw the side-by-side curves (x, y1, y2 come from the previous cell),
# this time with a grid on each panel.
plt.figure(figsize=(12, 6))
for slot, (curve, line_format) in enumerate(((y1, '-r'), (y2, '-b')), start=1):
    plt.subplot(1, 2, slot)
    plt.plot(x, curve, line_format)
    plt.grid(True)

## Customise your charts further using [Style Sheets](https://matplotlib.org/tutorials/introductory/customizing.html)

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so prefer the new name when it exists and
# fall back to the legacy one on older installations.
_darkgrid_style = 'seaborn-v0_8-darkgrid'
if _darkgrid_style not in plt.style.available:
    _darkgrid_style = 'seaborn-darkgrid'
plt.style.use(_darkgrid_style)

# Medal counts per country; one array per medal type, aligned with x1.
x1 = ['Australia', 'Hungary', 'Japan']
y1 = np.array([3, 3, 2])  # gold
y2 = np.array([4, 2, 2])  # silver
y3 = np.array([3, 2, 3])  # bronze

plt.figure(figsize=(6, 8))
# Stack the bars: `bottom` is the running total of the layers already drawn.
plt.bar(x1, y1, label="Gold Medals", width=0.5, color='#0040ff')
plt.bar(x1, y2, label="Silver Medals", width=0.5, bottom=y1, color='#0080ff')
plt.bar(x1, y3, label="Bronze Medals", width=0.5, bottom=y1 + y2, color='#00bfff')
plt.xlabel('$ Countries $')
plt.ylabel('$ Medals $')
plt.title("Medal Summary")
plt.legend()

plt.grid(linestyle='-', linewidth=0.5)

plt.show()

In [None]:
# One horizontal bar per person; bar length is the person's age.
Name = ["Joe", "Priya", 'Donna', "Shankar", "Mo"]
Age = [28, 33, 43, 45, 55]

plt.barh(Name, Age, color="#FF6F00")
plt.show()


In [None]:
# Books read per person, split into fiction (y1) and non-fiction (y2).
x1 = ["Joe", "Priya", 'Donna', "Shankar", "Mo"]
y1 = [9, 7, 8, 5, 6]
y2 = [4, 3, 1, 2, 9]

plt.figure(figsize=(8, 4))

# `left=y1` starts each non-fiction bar where the fiction bar ends,
# producing a horizontally stacked chart.
plt.barh(x1, y1, label="Fiction", color='#ff00bf')
plt.barh(x1, y2, label="Non Fiction", left=y1, color='#bf00ff')

plt.title('Books Read')
plt.legend()
plt.show()

### Let us now upload a Dataset to continue creating charts using Matplotlib and Pandas Visualisation

In [5]:
# Load the IGN scores dataset and preview its first rows.
# Data source: https://www.kaggle.com/alexisbcook/data-for-datavis?select=ign_scores.csv
ign = pd.read_csv(
    '../Datasets/data_ign_scores.csv',
    encoding='unicode_escape',
)
ign.head()

Unnamed: 0,Platform,Action,"Action, Adventure",Adventure,Fighting,Platformer,Puzzle,RPG,Racing,Shooter,Simulation,Sports,Strategy
0,DrCast,6.882857,7.511111,6.281818,8.2,8.34,8.088889,7.7,7.0425,7.616667,7.628571,7.272222,6.433333
1,GBoyAd,6.373077,7.507692,6.057143,6.226316,6.970588,6.532143,7.542857,6.657143,6.444444,6.928571,6.694444,7.175
2,GBoyC,6.272727,8.166667,5.307692,4.5,6.352941,6.583333,7.285714,5.897436,4.5,5.9,5.790698,7.4
3,Gcube,6.532584,7.608333,6.753846,7.422222,6.665714,6.133333,7.890909,6.852632,6.981818,8.028571,7.481319,7.116667
4,Nin3DS,6.670833,7.481818,7.414286,6.614286,7.503448,8.0,7.719231,6.9,7.033333,7.7,6.388889,7.9


In [None]:
# Scatter of the Action score for every platform in the IGN dataset.
plt.figure(figsize=(20, 4))

plt.scatter(ign['Platform'], ign['Action'], color="#ff4d4d")

plt.title('Ign Scores')
plt.xlabel('$ Platform $ ')
# Mathtext ignores plain spaces, so the gap between the two words has to be
# written as an escaped space; otherwise the label renders as "ActionRatings".
plt.ylabel(r'$ Action\ Ratings $')
plt.show()

In [None]:
# (genre column, marker colour, marker size) for each series in the chart.
# Increasing sizes keep overlapping markers distinguishable.
genre_markers = [
    ('Action', 'r', 50),
    ('Fighting', 'b', 100),
    ('Racing', 'g', 150),
    ('Puzzle', 'y', 200),
]

# Scope the red text colour to this figure only. Assigning to plt.rcParams
# directly would change the text colour of every figure drawn afterwards.
with plt.rc_context({'text.color': 'red'}):
    plt.figure(figsize=(20, 6))

    for genre, marker_colour, marker_size in genre_markers:
        # "alpha" softens the colours so overlapping markers remain visible.
        plt.scatter(ign['Platform'], ign[genre], c=marker_colour,
                    s=marker_size, alpha=0.8, label=genre)

    plt.legend(bbox_to_anchor=(1.0, 1.0), shadow=True, fontsize='x-large')

    plt.show()

## Refer to this link for more information on customising your legends: [Legend Guide](https://matplotlib.org/3.1.1/tutorials/intermediate/legend_guide.html)

#### Histograms can be created with the *hist* command; they show how the values of a numerical variable are distributed

In [None]:
# Object-oriented API: create the figure and axes explicitly, then draw on the axes.
fig, ax = plt.subplots()

# Distribution of the Fighting scores using matplotlib's default binning.
fighting_scores = ign['Fighting']
ax.hist(fighting_scores)

ax.set_xlabel('Points')
ax.set_ylabel('Frequency')

## Pandas Visualization

### Pandas Visualization makes it really easy to create plots out of a pandas dataframe and series. It also has a higher level API than Matplotlib and therefore we need less code

In [None]:
# One histogram per column, arranged on a 4x4 grid of subplots with 20 bins each.
ign.plot.hist(
    subplots=True,
    layout=(4, 4),
    figsize=(20, 20),
    bins=20,
)

plt.show()

In [None]:
# Mean Racing score per platform, highest first; chart the top 15 as bars.
racing_by_platform = ign.groupby("Platform")["Racing"].mean()
top_racing = racing_by_platform.sort_values(ascending=False).head(15)
top_racing.plot.bar(color='#4D1A3B')

In [None]:
# Histogram of the Strategy column, drawn through the pandas plotting API.
ign['Strategy'].plot(kind='hist')

In [None]:
# A single 'count' column with five rows (index 0..4).
df = pd.DataFrame({'count': {0: 500, 1: 600, 2: 726, 3: 326, 4: 410}})

# Transposing turns each row into its own column, so every value becomes a
# separate bar series and can take its own colour from the default cycle.
bar_colours = ['C0', 'C1', 'C2', 'C3', 'C4']
ax = df.T.plot(kind='bar', color=bar_colours)

plt.show()

## Area Plots

Area charts are commonly used to showcase data that depicts a time-series relationship


### You can create area plots with Series or Data Frames. Area plots are stacked by default.   <br> 

<li> For area plots, each column must contain either all positive or all negative values.

<li> When input data contains NaN, it will be automatically filled by 0. If you want to drop or fill by different values, use dataframe.dropna() or dataframe.fillna() before calling plot.

In [None]:
# Ten rows of uniform random values in [0, 1) for four series a-d.
random_values = np.random.rand(10, 4)
df = pd.DataFrame(random_values, columns=['a', 'b', 'c', 'd'])

# Stacked area chart (stacking is the default); the trailing semicolon
# suppresses the notebook's echo of the returned Axes object.
df.plot(kind='area');

In [None]:
# Same frame, drawn as overlapping (unstacked) areas.
df.plot(kind='area', stacked=False)

## Hexagonal Bin Plots <br>

### <li> These plots can be a useful alternative to scatter plots if your data are too dense to plot each point individually.

In [None]:
# 10,000 points with independent standard-normal x and y coordinates.
n = 10000
df = pd.DataFrame(
    {
        'x': np.random.randn(n),
        'y': np.random.randn(n),
    }
)

# Bin the points into a hexagonal grid with 25 hexagons across the x direction.
ax = df.plot(kind='hexbin', x='x', y='y', gridsize=25)

<li> By default, a histogram of the counts around each (x, y) point is computed. <br> 

<li> You can specify alternative aggregations by passing values to the C and reduce_C_function arguments. C specifies the value at each (x, y) point and reduce_C_function is a function of one argument that reduces all the values in a bin to a single number (e.g. mean, max, sum, std). In this example the positions are given by columns a and b, while the value is given by column z. The bins are aggregated with NumPy’s max function.


The example below uses `numpy.random.uniform(low=0.0, high=1.0, size=None)`, which draws samples from a uniform distribution.

In [None]:
# 1,000 points: 'a' is standard-normal noise, 'b' is noise plus a 0..999 ramp.
df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b'])
df['b'] = df['b'] + np.arange(1000)

# 'z' is the value attached to each point, drawn uniformly from [0, 3).
df['z'] = np.random.uniform(0, 3, 1000)

# Each hexagon shows the maximum 'z' of the points that fall inside it.
df.plot(
    kind='hexbin',
    x='a',
    y='b',
    C='z',
    reduce_C_function=np.max,
    gridsize=20,
    color='red',
)


## [Useful Guide for Specifying Colors](https://matplotlib.org/3.1.1/tutorials/colors/colors.html)

# Seaborn


### <li> Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.



In [None]:
# Seaborn is imported under its conventional alias for the rest of the notebook.
import seaborn as sns

# Reset every rcParam to Matplotlib's built-in defaults, undoing the
# style-sheet changes made by earlier cells.
mpl.rcParams.update(mpl.rcParamsDefault)
# IPython magic (not plain Python): render figures inline in the notebook output.
%matplotlib inline

In [None]:
# Car-crash dataset used by the seaborn line and scatter examples.
# Data source: https://www.kaggle.com/fivethirtyeight/fivethirtyeight-bad-drivers-dataset
cars = pd.read_csv(
    '../Datasets/car_crashes.csv',
    encoding='unicode_escape',
)

In [None]:
# Coronavirus case counts used by the PairGrid example.
# Data source: https://coronavirus.data.gov.uk/cases
# NOTE(review): the file name contains a literal "%20" (a URL-encoded space);
# confirm it matches the name of the file on disk.
cases = pd.read_csv(
    '../Datasets/data_2020-Sep-07%20(1).csv',
    encoding='unicode_escape',
)

In [None]:
# Insurance dataset used by the violin, boxen, FacetGrid and KDE examples.
# TODO: record the download link for this dataset, as done for the others.
insurance = pd.read_csv(
    '../Datasets/insurance.csv',
    encoding='unicode_escape',
)

In [None]:
# Summary statistics for the columns of the cars frame.
cars.describe()

In [None]:
# Overlay three columns of the cars frame as separate lines on one wide figure.
plt.figure(figsize=(20, 6))

line_series = (
    ('speeding', 'Speeding'),
    ('alcohol', 'Alcohol'),
    ('not_distracted', 'Not Distracted'),
)
for column, legend_label in line_series:
    sns.lineplot(data=cars[column], linewidth=1.5, label=legend_label)

plt.show()

In [None]:
# Square figure comparing the 'speeding' and 'alcohol' columns point by point.
plt.figure(figsize=(10, 10))
sns.scatterplot(
    data=cars,
    x="speeding",
    y="alcohol",
    color="Blue",
)
# The trailing newline in the title leaves a gap between title and axes.
plt.title("collisions due to speeding across states\n", fontsize=17)
plt.show()

In [None]:
# Load the "Coffee and Code" dataset and preview its first rows.
# Data source: https://www.kaggle.com/devready/coffee-and-code
coffee = pd.read_csv(
    '../Datasets/Coffee_and_code.csv',
    encoding='unicode_escape',
)
coffee.head()

In [None]:
# List the column names available in the coffee frame.
coffee.columns

## Pairplot <br>

<li> Plot pairwise relationships in a dataset.

<li> By default, this function will create a grid of Axes such that each numeric variable in data will be shared in the y-axis across a single row and in the x-axis across a single column. The diagonal Axes are treated differently, drawing a plot to show the univariate distribution of the data for the variable in that column.  <br>
    

   [Read More here](https://seaborn.pydata.org/generated/seaborn.pairplot.html)

In [None]:
# Use seaborn's dark grid theme for the plots that follow.
sns.set_style("darkgrid")

# Pairwise relationships between the columns of the coffee frame;
# each facet is 4 inches tall.
sns.pairplot(data=coffee, height=4)


## Catplots <br>

<li> This function provides access to several axes-level functions that show the relationship between a numerical and one or more categorical variables using one of several visual representations. <br> 
    
 <b> Refer to this link for [More Info](https://seaborn.pydata.org/generated/seaborn.catplot.html) </b>

The kind parameter selects the underlying axes-level function to use:

<b> Categorical scatterplots: </b>

<li> stripplot() (with kind="strip"; the default)
<li> swarmplot() 

<b> Categorical distribution plots: </b>

<li> boxplot() 
<li> violinplot() 
<li> boxenplot()

<b> Categorical estimate plots: </b>

<li> pointplot()
<li> barplot() 
<li> countplot()

In [None]:
# Point plot of coffee cups per day by age range, one line per gender.
# aspect=2 makes each facet twice as wide as it is tall.
sns.catplot(
    data=coffee,
    x="AgeRange",
    y="CoffeeCupsPerDay",
    hue="Gender",
    kind="point",
    aspect=2,
)

In [None]:
plt.figure(figsize=(15, 5))

# Count the rows for each CoffeeTime category. The column is named through
# `x=`/`data=` because current seaborn releases treat the first positional
# argument as `data`, so passing the Series positionally no longer means "x".
sns.countplot(x="CoffeeTime", data=coffee, palette=sns.color_palette("rainbow", 7))

In [None]:
# Box plot: spread of cups per day within each CoffeeTime category.
plt.figure(figsize=(15, 5))
sns.boxplot(
    data=coffee,
    x="CoffeeTime",
    y="CoffeeCupsPerDay",
)
plt.show()

### [Understanding Box Plots](https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51)

In [None]:
# Set the default figure size and theme through seaborn's global settings.
figure_defaults = {"figure.figsize": (10, 5)}
sns.set(rc=figure_defaults, style="darkgrid")

# Strip plot: one marker per row, grouped by age range.
sns.stripplot(data=coffee, x="AgeRange", y="CoffeeCupsPerDay")

In [None]:
# Violin plot of expenses per region, with separate violins for each smoker value.
plt.figure(figsize=(16, 7))
sns.violinplot(
    x=insurance['region'],
    y=insurance['expenses'],
    hue=insurance['smoker'],
    palette="Set2",
)
plt.show()

In [None]:
# Violin plot of cups per day for each coffee type, split by gender.
plt.figure(figsize=(20, 10))
sns.violinplot(
    x=coffee['CoffeeType'],
    y=coffee['CoffeeCupsPerDay'],
    hue=coffee['Gender'],
    palette="bright",
)
plt.show()

In [None]:
# Boxen (letter-value) plot of expenses for each smoker value.
sns.boxenplot(
    x=insurance['smoker'],
    y=insurance['expenses'],
    palette="Set1",
)
plt.show()

 ## [Choosing Color Palettes in Seaborn](https://seaborn.pydata.org/tutorial/color_palettes.html)

## FacetGrid

[More Info:](https://seaborn.pydata.org/generated/seaborn.FacetGrid.html?highlight=facetgrid#seaborn.FacetGrid)

In [None]:
# One facet per region, each showing a histogram of the 'age' column.
region_grid = sns.FacetGrid(insurance, col="region", height=4)
i = region_grid.map(plt.hist, "age", linewidth=2)

In [None]:
# Grid of facets: columns by region, rows by sex, marker colour by smoker.
facet_layout = dict(col="region", row="sex", hue="smoker", height=4, aspect=1)
i = sns.FacetGrid(insurance, **facet_layout)

# Scatter bmi against expenses in every facet; white edges separate
# overlapping markers.
i = i.map(plt.scatter, "bmi", "expenses", edgecolor="w", s=80)
i.add_legend()

### PairGrid 

In [None]:
# Pair grid over the two case-count columns, coloured by area name.
case_columns = ["newCasesBySpecimenDate", "cumCasesBySpecimenDate"]
ca = sns.PairGrid(cases, hue='areaName', vars=case_columns, height=6, aspect=2)

# Off-diagonal cells: scatter of one variable against the other.
ca = ca.map_offdiag(plt.scatter, edgecolor="w", s=130)
# Diagonal cells: histogram of each variable on its own.
ca = ca.map_diag(plt.hist, edgecolor='w', linewidth=2)
ca = ca.add_legend()


## KDE Plot

A KDE (Kernel Density Estimate) plot visualizes the probability density of a continuous variable: it shows the estimated density at each value the variable can take.

In [None]:

# Bivariate KDE of bmi against expenses, drawn as filled contours.
# - x/y are passed by keyword: current seaborn releases accept only `data`
#   positionally, so two positional Series raise a TypeError.
# - `fill=True` replaces the deprecated `shade=True` (requires seaborn >= 0.11).
# - The deprecated `shade_lowest=False` is dropped; seaborn's default `thresh`
#   already leaves the lowest-density region unfilled.
sns.kdeplot(x=insurance.bmi, y=insurance.expenses, fill=True, cmap="Blues")
plt.show()

### [Cmap Reference](https://matplotlib.org/examples/color/colormaps_reference.html)

## HeatMap

In [None]:
# 4x6 matrix of uniform random values in [0, 1), shown as a colour-coded grid.
data = np.random.rand(4, 6)
heat_map = sns.heatmap(data=data)

In [None]:
# Flights dataset in long format, used by the heatmap examples.
# Data source: https://github.com/mwaskom/seaborn-data/blob/master/flights.csv
flights = pd.read_csv(
    '../Datasets/flights.csv',
    encoding='unicode_escape',
)


In [None]:
# Reshape from long to wide format: one row per month, one column per year,
# with passenger counts as the cell values. The arguments are passed by
# keyword because pandas 2.0 made them keyword-only; the keyword form works
# on older pandas versions as well.
# NOTE: this overwrites `flights`, so re-running the cell without reloading
# the CSV fails (the pivoted frame no longer has these columns).
flights = flights.pivot(index="month", columns="year", values="passengers")
ax = sns.heatmap(flights)

In [None]:
# Annotated heatmap: each cell prints its value as an integer (fmt='d'),
# and thin lines separate the cells.
ax = sns.heatmap(
    flights,
    annot=True,
    fmt='d',
    linewidths=.5,
)

In [None]:
# Same heatmap using the 'cubehelix' colormap.
ax = sns.heatmap(
    flights,
    cmap='cubehelix',
    linewidths=0.5,
)

In [None]:
# 50x20 matrix of standard-normal values.
data = np.random.randn(50, 20)

# Label every second column on the x-axis and hide the y-axis labels.
ax = sns.heatmap(
    data,
    xticklabels=2,
    yticklabels=False,
)

In [6]:
# Spotify "data by genres" table.
# Data source: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks?select=data_by_genres.csv
music_csv_path = '../Datasets/data_by_genres.csv'
music = pd.read_csv(music_csv_path)