# Exploratory data analysis

In [81]:
import pandas as pd
# Load the rankings CSV (path is relative to the notebook's working directory).
df_univ = pd.read_csv("World University Rankings 2023.csv")
# Print a summary of the columns: non-null counts and dtypes.
df_univ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2341 entries, 0 to 2340
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   University Rank              2341 non-null   object 
 1   Name of University           2233 non-null   object 
 2   Location                     2047 non-null   object 
 3   No of student                2209 non-null   object 
 4   No of student per staff      2208 non-null   float64
 5   International Student        2209 non-null   object 
 6   Female:Male Ratio            2128 non-null   object 
 7   OverAll Score                1799 non-null   object 
 8   Teaching Score               1799 non-null   float64
 9   Research Score               1799 non-null   float64
 10  Citations Score              1799 non-null   float64
 11  Industry Income Score        1799 non-null   float64
 12  International Outlook Score  1799 non-null   float64
dtypes: float64(6), object(7)

In [82]:
# (rows, columns) of the frame as loaded.
df_univ.shape

(2341, 13)

In [83]:
# Number of missing values in each column.
df_univ.isna().sum()

University Rank                  0
Name of University             108
Location                       294
No of student                  132
No of student per staff        133
International Student          132
Female:Male Ratio              213
OverAll Score                  542
Teaching Score                 542
Research Score                 542
Citations Score                542
Industry Income Score          542
International Outlook Score    542
dtype: int64

Let's drop rows with NaN values.

In [84]:
# Keep only rows where every column is populated.
# The explicit .copy() makes df_univ an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
df_univ = df_univ.dropna().copy()
df_univ.shape

(1488, 13)

Some numerical variables are stored with the `object` dtype (i.e. as strings). Let's investigate further.

In [85]:
# Preview four of the columns that info() reported as object dtype.
df_univ[["No of student", "International Student", "Female:Male Ratio", "OverAll Score"]].head()

Unnamed: 0,No of student,International Student,Female:Male Ratio,OverAll Score
0,20965,42%,48 : 52,96.4
1,21887,25%,50 : 50,95.2
2,20185,39%,47 : 53,94.8
3,16164,24%,46 : 54,94.8
4,11415,33%,40 : 60,94.2


* The "No of student" column needs to be converted to int.
* The "International Student" column, which contains percentages, can be converted to float.
* The "Female:Male Ratio" column can be expressed as a single number: the female share divided by the male share.
* The "OverAll Score" column will be converted to float.

In [86]:
import numpy as np

# Clean "No of student": cast to text, then remove thousands separators and
# surrounding whitespace so that only the digits remain.
df_univ["No of student"] = (
    df_univ["No of student"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.strip()
    # Blank strings cannot be parsed as numbers; mark them as missing in
    # this column (the one being converted) before parsing.
    .replace("", np.nan)
)

# pd.to_numeric returns int64 when every value parses as an integer and falls
# back to float64 (with NaN) when values are missing, whereas astype(int)
# would raise on a missing value.
df_univ["No of student"] = pd.to_numeric(df_univ["No of student"])

df_univ["No of student"].head()

0    20965
1    21887
2    20185
3    16164
4    11415
Name: No of student, dtype: int64

In [87]:
# Turn percentage strings such as "42%" into floats (42.0) in a single chain.
df_univ["International Student"] = (
    df_univ["International Student"]
    .astype(str)
    .str.replace('%', '')
    .str.strip()
    # Blank strings cannot be parsed by astype(float); mark them as missing first.
    .replace('', np.nan)
    .astype(float)
)

df_univ["International Student"].head()

0    42.0
1    25.0
2    39.0
3    24.0
4    33.0
Name: International Student, dtype: float64

In [88]:
# Show the raw ratio strings before parsing them.
print(df_univ["Female:Male Ratio"])

def length_values(str_ratio):
    """Return the length of ``str_ratio`` as reported by ``len``.

    Applied to each ratio string to spot entries whose format differs
    from the rest of the column.
    """
    n_chars = len(str_ratio)
    return n_chars

# Count how many ratio strings have each length.
print(df_univ["Female:Male Ratio"].apply(length_values).value_counts())

0       48 : 52
1       50 : 50
2       47 : 53
3       46 : 54
4       40 : 60
         ...   
1692    35 : 65
1693    38 : 62
1694    39 : 61
1695    55 : 45
1696    47 : 53
Name: Female:Male Ratio, Length: 1488, dtype: object
Female:Male Ratio
7    1485
6       3
Name: count, dtype: int64
