### In-class exercise - Altair library

In [1]:
# Importing the necessary packages.

import pandas as pd
import altair as alt

In [2]:
# Read the "topfiftyspotify" CSV into a DataFrame.
# The original note describes it as 50 rows with 14 different features.

spotify = pd.read_csv(filepath_or_buffer="topfiftyspotify.csv")
spotify.head(n=5)   # preview the first 5 observations

Unnamed: 0,Track ID,Track Name,Artist Name,Genre,Beats per min,Energy,Danceability,Loudness(dB),Liveness,Valence,Length,Acousticness,Speechiness,Popularity
0,1,Señorita,Shawn Mendes,canadian pop,117,55,76,-6,8,75,191,4,3,79
1,2,China,Anuel AA,reggaeton flow,105,81,79,-4,8,61,302,8,9,92
2,3,boyfriend (with Social House),Ariana Grande,dance pop,190,80,40,-4,16,70,186,12,46,85
3,4,Beautiful People (feat. Khalid),Ed Sheeran,pop,93,65,64,-8,8,55,198,12,19,86
4,5,Goodbyes (Feat. Young Thug),Post Malone,dfw rap,150,65,58,-4,11,18,175,45,7,94


### TASK 1: Let's show the acousticness of each song by Track Name using a line graph. Vary the thickness of the line to show the difference in acousticness between tracks.

In [3]:
# Task 1: trail chart of Acousticness per track.
# Acousticness drives both the y position and the stroke width of the trail.

Chart = (
    alt.Chart(spotify, title='Acousticness of the song based on Track Name')
    .mark_trail()   # a trail is a line whose thickness can vary along its length
    .encode(
        alt.X('Track Name'),
        alt.Y('Acousticness'),
        alt.Size('Acousticness'),   # thicker stroke for higher Acousticness
    )
)

# Show the chart with a styled title (last expression of the cell is rendered).
title_style = dict(color='black', anchor='middle', font='Courier', fontSize=20)
Chart.configure_title(**title_style)

### TASK 2: Show the relationship between each song and its length, along with the loudness of each song, using a bubble plot.

In [4]:
# Task 2: bubble plot of track Length per Track Name, with bubble area driven by Loudness(dB).

Chart = (
    alt.Chart(spotify, title='Track Name v/s length and loudness')
    .mark_circle()   # filled circles, i.e. a scatter plot
    .encode(
        alt.X('Track Name'),
        alt.Y('Length'),
        alt.Size('Loudness(dB)'),   # loudness is read from the bubble size
    )
)

# Show the chart with a styled title (last expression of the cell is rendered).
title_style = dict(color='black', anchor='middle', font='Courier', fontSize=20)
Chart.configure_title(**title_style)

### TASK 3: We now want to show the popularity of each track on a bar chart with highlighted segments, so that values beyond a threshold stand out. Also list the track names whose popularity is less than 80.

In [5]:
# Task 3: bar chart of Popularity per track. The part of each bar above the
# threshold is redrawn in red, and a horizontal rule marks the threshold itself.

POPULARITY_THRESHOLD = 80   # single source of truth for the rule, the filter and the baseline
CHART_TITLE = f'Track Name v/s popularity for a threshold={POPULARITY_THRESHOLD}'

threshold = pd.DataFrame([{"threshold": POPULARITY_THRESHOLD}])   # one-row frame feeding the rule layer

bars = alt.Chart(spotify).mark_bar().encode(   # mark_bar will display a bar plot
    x="Track Name",
    y="Popularity",
)

# Red overlay: only tracks above the threshold, drawn from the threshold (baseline) up to Popularity.
highlight = alt.Chart(spotify).mark_bar(color="red").encode(
    x='Track Name',
    y='baseline:Q',
    y2='Popularity'   # upper end of the red segment
).transform_filter(
    alt.datum.Popularity > POPULARITY_THRESHOLD
).transform_calculate("baseline", str(POPULARITY_THRESHOLD))

rule = alt.Chart(threshold).mark_rule().encode(    # horizontal line at the threshold value
    y='threshold:Q'
)

# .properties() and .configure_*() return new charts, so width, title and title
# styling are all applied to the one chart that ends the cell and gets rendered.
layered = (bars + highlight + rule).properties(
    width=750,
    title=CHART_TITLE,
)

layered.configure_title(
    fontSize=20,
    font='Courier',
    anchor='middle',
    color='black'
)

In [6]:
# Keep the rows whose Popularity is below 80, then show only their track names.

pop = spotify.loc[spotify["Popularity"].lt(80)]
pop.loc[:, "Track Name"]

0                                              Señorita
25                                  If I Can't Have You
39    fuck, i'm lonely (with Anne-Marie) - from “13 ...
Name: Track Name, dtype: object

### TASK 4: Plot a chart that visualizes the Estimated salary distributed over Age and Gender. Use a slider widget to visualize these parameters over Credit score.

In [7]:
# Read the "Churn_Modelling" CSV into a DataFrame.
# The original note describes it as 10000 rows with 14 different features -- TODO confirm against the file.


churn = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")
churn.head(n=5)    # preview the first 5 observations

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
1,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
2,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
3,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
4,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1


In [8]:
# This visualization must not exceed 5000 rows (Altair's default row limit), so draw a
# reproducible random sample capped at that size. Sampling by an absolute row count,
# rather than a fraction, keeps the cap valid whatever the size of the CSV.

MAX_ALTAIR_ROWS = 5000
smaller = churn.sample(n=min(len(churn), MAX_ALTAIR_ROWS), random_state=1)  # fixed seed => same rows on every run
smaller.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4251,8615,15809515,Lewis,797,Germany,Male,32,1,151922.94,1,1,0,8877.06,0
200,399,15762218,Mills,701,France,Female,39,9,0.0,2,0,1,145894.9,0
3379,6829,15806134,Storey,707,Germany,Male,34,9,162691.16,2,1,0,94912.78,0
3016,6105,15750731,Trevisani,736,Germany,Male,50,9,116309.01,1,1,0,185360.4,1
4170,8441,15781127,Giordano,663,Spain,Female,33,8,96769.04,1,1,1,36864.05,0


In [9]:
# Task 4: bars of EstimatedSalary by Gender, one facet column per Age,
# restricted to the CreditScore currently chosen on a slider.

# Fixed colour for each gender.
pink_blue = alt.Scale(range=["steelblue", "salmon"],
                      domain=('Male', 'Female'))

# Slider from 350 to 850 in steps of 25, bound to a single-value selection on
# the CreditScore field; the selection starts at 650.
slider = alt.binding_range(step=25, min=350, max=850)
select_credit = alt.selection_single(
    name="CreditScore",
    fields=['CreditScore'],
    init={'CreditScore': 650},
    bind=slider,
)

# NOTE(review): the rows displayed above already hold "Male"/"Female", so this looks like
# a pass-through; any other value would be relabelled "Female" -- confirm that is intended.
gender_label = alt.expr.if_(alt.datum.Gender == "Male", "Male", "Female")

Chart = (
    alt.Chart(
        smaller,
        width=30,   # width of each facet column
        title='Estimated salary v/s age and gender for selected Credit Score',
    )
    .mark_bar()
    .encode(
        column='Age:O',
        color=alt.Color('Gender:N', scale=pink_blue),
        x=alt.X('Gender:N', title=None),
        y=alt.Y('EstimatedSalary:Q', scale=alt.Scale(domain=(0, 210000))),
    )
    .add_selection(select_credit)
    .transform_calculate("Gender", gender_label)   # overwrite Gender with the computed label
    .transform_filter(select_credit)               # keep only rows matching the slider value
    .configure_facet(spacing=3)
)

# Show the chart with a styled title (last expression of the cell is rendered).
title_style = dict(color='black', anchor='middle', font='Courier', fontSize=15)
Chart.configure_title(**title_style)

Interpretation: As we move through different credit scores on the slider, we can observe that females with a lower credit score have minimal or no estimated salary at a younger age, but as we move up the age groups, females start to have higher estimated salaries than males. However, the credit score of 725 shows an unexpected rise in the salaries of females relative to males, and a drop in the salaries of males relative to females, for 3 out of the 4 age groups.

### TASK 5: Let's plot a scatter plot of Age v/s CreditScore with a drop-down selection menu, to visualize the correlation between Age and CreditScore for different countries.

In [10]:
# For the next visualization the data is limited to 1000 rows. Sampling by an absolute
# row count, rather than a fraction, keeps that limit valid whatever the size of the CSV.

SCATTER_ROWS = 1000
again_smaller = churn.sample(n=min(len(churn), SCATTER_ROWS), random_state=1)  # fixed seed => same rows on every run
again_smaller.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4251,8615,15809515,Lewis,797,Germany,Male,32,1,151922.94,1,1,0,8877.06,0
200,399,15762218,Mills,701,France,Female,39,9,0.0,2,0,1,145894.9,0
3379,6829,15806134,Storey,707,Germany,Male,34,9,162691.16,2,1,0,94912.78,0
3016,6105,15750731,Trevisani,736,Germany,Male,50,9,116309.01,1,1,0,185360.4,1
4170,8441,15781127,Giordano,663,Spain,Female,33,8,96769.04,1,1,1,36864.05,0


In [11]:
# Task 5: scatter plot of Age against CreditScore with a drop-down that picks one country.

input_dropdown = alt.binding_select(options=['Germany','Spain','France'])  # Creates a drop down box
                                                                           # for selecting a single item from a list
selection = alt.selection_single(fields=['Geography'], bind=input_dropdown, name='Country')

# Selected points are coloured by country; anything outside the selection would be light gray.
color = alt.condition(selection,
                    alt.Color('Geography:N', legend=None),
                    alt.value('lightgray'))

Chart=alt.Chart(again_smaller).mark_point().encode(
    x='Age:Q',
    y='CreditScore:Q',
    color=color,
    tooltip='Surname:N'   # the churn data has no "Name" column; Surname is the customer-name field
).add_selection(
    selection
).transform_filter(
    selection                # only the chosen country's points remain visible
).properties(
    title='Age v/s CreditScore for specific Country'   # chart title
)


Chart.configure_title(   # chart title configuration 
    fontSize=15,
    font='Courier',
    anchor='middle',
    color='black'
)

What can you say about the correlation between Age and CreditScore for all 3 countries looking at the graph?

Interpretation: The points in the above scatter plot are randomly distributed with no defined pattern. Hence, there is a lack of predictability in determining CreditScore from a given value of Age, and the amorphous, unstructured appearance of the scatter plot leads to the conclusion that there is little or no correlation between the Age and CreditScore of an individual for all three countries.