# 6.2 Exploring relationships

1. Importing Visualization Libraries and Data

In [None]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [None]:
# Render matplotlib charts inline, directly below the cell that creates them
%matplotlib inline

In [None]:
# Import data
# NOTE(review): `path` was never defined anywhere in this notebook, so this cell
# raised a NameError on a fresh kernel. It is now set to the current working
# directory. TODO: point it at the project root (the folder that contains
# '02 Data') if the notebook is started from somewhere else.
path = os.getcwd()

df_cwur = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'cwurData.csv'), index_col = False)

In [None]:
# Preview the first five rows to confirm the file loaded as expected
df_cwur.head(n=5)

# Correlations:

In [None]:
# Columns kept for the correlation analysis; the "institution" and "country"
# columns are left out
analysis_columns = [
    'world_rank',
    'national_rank',
    'quality_of_education',
    'alumni_employment',
    'quality_of_faculty',
    'publications',
    'influence',
    'citations',
    'broad_impact',
    'patents',
    'score',
    'year',
]

sub = df_cwur[analysis_columns]

In [None]:
# Pairwise correlation matrix of the subset (Pearson, which is the pandas default)

sub.corr(method='pearson')

In [None]:
# Draw the correlation matrix as a plain matplotlib heatmap

corr_matrix = sub.corr()
plt.matshow(corr_matrix)
plt.show()

In [None]:
# Re-draw the plain heatmap and write it to disk
corr_matrix = sub.corr()
plt.matshow(corr_matrix)
plt.savefig("correlation 1 matshow.png")

In [None]:
# Larger heatmap with axis labels, a colour legend and a title

corr_matrix = sub.corr()
tick_positions = range(len(sub.columns))  # one tick per column of the subset

f = plt.figure(figsize=(8, 8))  # 8 x 8 inch canvas
plt.matshow(corr_matrix, fignum=f.number)  # draw into the figure created above
plt.xticks(tick_positions, sub.columns, fontsize=14, rotation=45)  # column names along x
plt.yticks(tick_positions, sub.columns, fontsize=14)  # column names along y
cb = plt.colorbar()  # colour legend (colorbar)
cb.ax.tick_params(labelsize=14)  # font size of the colorbar tick labels
plt.title('Correlation Matrix', fontsize=14)  # figure title

# Create a correlation heatmap using seaborn:

In [None]:
# Create a subplot with matplotlib
f,ax = plt.subplots(figsize=(10,10))

# Create the correlation heatmap in seaborn
corr = sns.heatmap(sub.corr(), annot = True, ax = ax)

# Save through the figure handle *before* showing it. With %matplotlib inline
# the figure is closed once it has been displayed, so calling plt.savefig() in
# a later cell (as this notebook used to) writes an empty image.
f.savefig("correlation 2 seaborn.png")
plt.show()

From the heatmap we can see that most of the variables are positively correlated with each other. The exception is Score: it is negatively correlated with all the other variables, most strongly with quality of faculty (-0.69), and also quite strongly with quality of education and world rank. The highest positive correlations are found among Publications, Influence, Citations, Broad Impact, and World Rank, all of which have correlation coefficients of 0.83 or higher.

# Scatterplots:

Create a scatterplot (or plots) for the variables with the strongest correlations

In [None]:
# Every pairing of the four most strongly correlated indicators
# (publications, citations, influence, broad_impact).
strongest_pairs = [
    ('publications', 'citations'),
    ('publications', 'influence'),
    ('publications', 'broad_impact'),
    ('citations', 'influence'),
    ('citations', 'broad_impact'),
    ('influence', 'broad_impact'),
]

# One regression scatterplot per pair, drawn in a loop instead of six
# copy-pasted cells; each figure gets a title so it can be read on its own.
for x_col, y_col in strongest_pairs:
    sns.lmplot(x = x_col, y = y_col, data = sub)
    plt.title(f'{y_col} vs. {x_col}')
    plt.show()

In [None]:
# The original cell called plt.savefig() on its own, after the previous figure
# had already been displayed and closed, so it wrote a blank image under this
# file name. Draw the world_rank vs. score scatterplot here and save it before
# showing it, so the file matches its name.
sns.lmplot(x = 'world_rank', y = 'score', data = sub)
plt.savefig("scatter_world rank_score.png")
plt.show()

In [None]:
# Create a scatterplot for the "world_rank" and "publications" columns in seaborn

sns.lmplot(x = 'world_rank', y = 'publications', data = sub)
# Save in the same cell and before plt.show(): once the inline backend has
# displayed the figure it is closed, and a later plt.savefig() writes a blank image.
plt.savefig("scatter_world rank_publications.png")
plt.show()

In [None]:
# Create a scatterplot for the "world_rank" and "influence" columns in seaborn

sns.lmplot(x = 'world_rank', y = 'influence', data = sub)
# Save before plt.show(): showing closes the figure under the inline backend,
# so saving afterwards would write a blank image.
plt.savefig("scatter_world rank_influence.png")
plt.show()

In [None]:
# Create a scatterplot for the "world_rank" and "citations" columns in seaborn

sns.lmplot(x = 'world_rank', y = 'citations', data = sub)
# Save before plt.show(): showing closes the figure under the inline backend,
# so saving afterwards would write a blank image.
plt.savefig("scatter_world rank_citations.png")
plt.show()

In [None]:
# Create a scatterplot for the "world_rank" and "broad_impact" columns in seaborn

sns.lmplot(x = 'world_rank', y = 'broad_impact', data = sub)
# Save before plt.show(): showing closes the figure under the inline backend,
# so saving afterwards would write a blank image.
plt.savefig("scatter_world rank_broad_impact.png")
plt.show()

In [None]:
# Create a scatterplot for the "world_rank" and "score" columns in seaborn

sns.lmplot(data = sub, x = 'world_rank', y = 'score')
plt.show()

According to the scatter plots, there is a strong linear correlation, especially between world rank and publications/influence/broad_impact. There are some outliers that can be investigated further.

# Pair Plots:

In [None]:
# Pair plot over the full data set

g = sns.pairplot(data=df_cwur)
plt.show()


The pair plot confirms the strongest correlations (world rank / publications / influence / broad_impact).


# Categorical Plots:

In [None]:
# Distribution of the publications indicator (20 bins plus a KDE curve).
# The figure is shown in the cell that builds it rather than by a bare
# plt.show() in a separate cell.
sns.histplot(df_cwur['publications'], bins = 20, kde = True)
plt.show()

In [None]:
# Label rows whose publications value is below 50
few_publications = df_cwur['publications'].lt(50)
df_cwur.loc[few_publications, 'Publications level'] = 'Few publications'

In [None]:
# Label rows whose publications value is in the half-open range [50, 150)
medium_publications = df_cwur['publications'].ge(50) & df_cwur['publications'].lt(150)
df_cwur.loc[medium_publications, 'Publications level'] = 'Medium publications'

In [None]:
# Label rows whose publications value is 150 or more
high_publications = df_cwur['publications'].ge(150)
df_cwur.loc[high_publications, 'Publications level'] = 'High publications'

In [None]:
# Size of each publications level; dropna=False also counts rows left unlabelled
publication_levels = df_cwur['Publications level']
publication_levels.value_counts(dropna=False)

In [None]:
# Create a categorical plot in seaborn: publications by world rank, coloured
# by the "Publications level" column created above.
# The figure is shown in the cell that builds it rather than by a bare
# plt.show() in a separate cell.

sns.set(style="ticks")
g = sns.catplot(x="world_rank", y="publications", hue="Publications level", data=df_cwur)
plt.show()

In [None]:
# Create a categorical plot in seaborn: publications by score, coloured by the
# "Publications level" column created above.
# The figure is shown in the cell that builds it rather than by a bare
# plt.show() in a separate cell.

sns.set(style="ticks")
g = sns.catplot(x="score", y="publications", hue="Publications level", data=df_cwur)
plt.show()

In [None]:
# Distribution of the score column (50 bins plus a KDE curve)
sns.histplot(data=df_cwur, x='score', bins=50, kde=True)
plt.show()

In [None]:
# Label rows whose score is below 50. The label used to read 'law', a typo that
# showed up in the value counts and in the plot legend; it is now 'Low', in
# line with the 'Medium' and 'High' labels.
df_cwur.loc[df_cwur['score'] <50, 'Score level'] = 'Low'

In [None]:
# Label rows whose score is in the half-open range [50, 60)
medium_score = df_cwur['score'].ge(50) & df_cwur['score'].lt(60)
df_cwur.loc[medium_score, 'Score level'] = 'Medium'

In [None]:
# Label rows whose score is 60 or more
high_score = df_cwur['score'].ge(60)
df_cwur.loc[high_score, 'Score level'] = 'High'

In [None]:
# Size of each score level; dropna=False also counts rows left unlabelled
score_levels = df_cwur['Score level']
score_levels.value_counts(dropna=False)

In [None]:


g = sns.catplot(x="world_rank", y="score", hue="Score level", data=df_cwur, height=8, aspect=2)

# Show only every 50th label.
# The categorical x axis has one position per *distinct* world_rank value, not
# one per row, so the tick positions are derived from the unique ranks; using
# the row count placed ticks past the last category. Labels are passed
# explicitly so every tick shows the rank it sits on.
# NOTE(review): assumes seaborn orders numeric categories ascending; confirm
# against the installed seaborn version.
rank_values = np.sort(df_cwur['world_rank'].unique())
tick_positions = np.arange(0, len(rank_values), step=50)
plt.xticks(ticks=tick_positions, labels=rank_values[tick_positions], rotation=90)
plt.show()


Exploring relationships can help to answer the following question: what factors (research output, faculty quality, citations, etc.) most influence university rankings?

According to the analysis above, the highest positive correlations are found among Publications, Influence, Citations, Broad Impact, and World Rank, all of which have correlation coefficients of 0.83 or higher. This raises a follow-up question: why is the correlation between score and the individual factors lower than their correlation with world_rank? After some research, I found the following explanation. The score is made up of several factors (like research output, faculty quality, etc.), and each of these factors is weighted differently. Because it’s an average of many factors, small changes in one factor might not cause a big change in the overall score. This is why the correlation between the score and each factor is lower.

The world_rank shows a university's relative position compared to others (like 1st, 2nd, 3rd). A small change in a factor, like publications, can push a university's rank up or down, making the correlation with world_rank stronger. This is because ranks are ordinal (they represent a position) and even small differences in performance can lead to a larger shift in rank.
