# Loading Packages and Functions

In [1]:
import pandas as pd
import numpy as num
import altair as alt
import vegafusion
import matplotlib
from sklearn.model_selection import train_test_split

# Reading Data

In [2]:
# Reading the dataset
dataset = pd.read_csv("https://osf.io/download/g72pq/", sep=',', on_bad_lines='skip', low_memory=False)

# Creating a list of columns related to questions to drop
columns_to_drop = [col for col in dataset.columns if col.startswith('q')]

# Dropping the question columns from the DataFrame
dataset = dataset.drop(columns_to_drop, axis=1)

# Dropping the unrelavante columns from the DataFrame
dataset = dataset.drop("elogit", axis=1)
df = dataset.drop("tests", axis=1)
dataset = dataset.iloc[:,4:]

dataset

Unnamed: 0,gender,age,natlangs,primelangs,dyslexia,psychiatric,education,tests,Eng_start,Eng_country_yrs,...,Ebonics,Ir_region,UK_constituency,nat_Eng,prime_Eng,speaker_cat,type,Lived_Eng_per,Eng_little,correct
0,male,31,English,English,0,0,Graduate Degree,",WhichEnglish",0,,...,0,,,1,1,native,United States,,monoeng,0.978947
1,male,30,English,English,0,1,Undergraduate Degree (3-5 years higher ed),",WhichEnglish",0,,...,,,HP,1,1,native,,,monoeng,1.000000
2,male,30,English,English,0,1,Undergraduate Degree (3-5 years higher ed),",WhichEnglish",0,,...,,,HP,1,1,native,,,monoeng,1.000000
3,male,19,English,English,0,0,Some Undergrad (higher ed),",WhichEnglish",0,,...,1,,,1,1,native,Ebonics,,monoeng,0.905263
4,male,20,"Chinese/Mandarin, English","Chinese/Mandarin, English",0,0,High School Degree (12-13 years),",WhichEnglish",0,,...,,,,1,1,native,,,bileng,0.947368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669493,female,22,Russian,Russian,0,0,Graduate Degree,",WhichEnglish",7,0.0,...,,,,0,0,late,Russian,0.000000,little,0.831579
669494,female,22,Polish,"English, Polish",0,0,Graduate Degree,",WhichEnglish",6,0.0,...,,,none,0,0,late,Polish,0.000000,little,0.968421
669495,female,16,,,0,0,High School Degree (12-13 years),",WhichEnglish",5,0.0,...,,,,0,0,foreign,,0.000000,little,0.926316
669496,male,27,"Chinese/Mandarin, English, Japanese","Chinese/Mandarin, English",0,0,Graduate Degree,",WhichEnglish",8,14.0,...,0,,,1,1,late,,0.736842,,0.915789


In [3]:
df = dataset

# Splitting the DataFrame
train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)

df

Unnamed: 0,gender,age,natlangs,primelangs,dyslexia,psychiatric,education,tests,Eng_start,Eng_country_yrs,...,Ebonics,Ir_region,UK_constituency,nat_Eng,prime_Eng,speaker_cat,type,Lived_Eng_per,Eng_little,correct
0,male,31,English,English,0,0,Graduate Degree,",WhichEnglish",0,,...,0,,,1,1,native,United States,,monoeng,0.978947
1,male,30,English,English,0,1,Undergraduate Degree (3-5 years higher ed),",WhichEnglish",0,,...,,,HP,1,1,native,,,monoeng,1.000000
2,male,30,English,English,0,1,Undergraduate Degree (3-5 years higher ed),",WhichEnglish",0,,...,,,HP,1,1,native,,,monoeng,1.000000
3,male,19,English,English,0,0,Some Undergrad (higher ed),",WhichEnglish",0,,...,1,,,1,1,native,Ebonics,,monoeng,0.905263
4,male,20,"Chinese/Mandarin, English","Chinese/Mandarin, English",0,0,High School Degree (12-13 years),",WhichEnglish",0,,...,,,,1,1,native,,,bileng,0.947368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669493,female,22,Russian,Russian,0,0,Graduate Degree,",WhichEnglish",7,0.0,...,,,,0,0,late,Russian,0.000000,little,0.831579
669494,female,22,Polish,"English, Polish",0,0,Graduate Degree,",WhichEnglish",6,0.0,...,,,none,0,0,late,Polish,0.000000,little,0.968421
669495,female,16,,,0,0,High School Degree (12-13 years),",WhichEnglish",5,0.0,...,,,,0,0,foreign,,0.000000,little,0.926316
669496,male,27,"Chinese/Mandarin, English, Japanese","Chinese/Mandarin, English",0,0,Graduate Degree,",WhichEnglish",8,14.0,...,0,,,1,1,late,,0.736842,,0.915789


## Handling the NaN values

In [4]:
# number of NaN values in each column
nan_counts = df.isna().sum()
print(nan_counts)

gender                       0
age                          0
natlangs                  8018
primelangs                7822
dyslexia                     0
psychiatric                  0
education                    0
tests                        0
Eng_start                    0
Eng_country_yrs         289420
house_Eng               289420
dictionary                   0
already_participated         0
countries                29591
currcountry              33889
US_region               500277
UK_region               572240
Can_region              620199
Ebonics                 500383
Ir_region               657641
UK_constituency         576572
nat_Eng                      0
prime_Eng                    0
speaker_cat                  0
type                    271943
Lived_Eng_per           300756
Eng_little              111233
correct                      0
dtype: int64


In [5]:
# Replacing NaN values for the numeric columns with zero
df_numeric_filled = df.select_dtypes(include='number').fillna(0)
df.update(df_numeric_filled)

# Dropping the other columns with plenty on NaN values (except natlangs and primelangs)
exclude_columns = ['natlangs', 'primelangs']
columns_to_drop = df.columns[df.isna().any() & ~df.columns.isin(exclude_columns)]
df = df.drop(columns=columns_to_drop)

# dropping rows with NaN values in natlangs and primelangs columns
df = df.dropna()

df

Unnamed: 0,gender,age,natlangs,primelangs,dyslexia,psychiatric,education,tests,Eng_start,Eng_country_yrs,house_Eng,dictionary,already_participated,nat_Eng,prime_Eng,speaker_cat,Lived_Eng_per,correct
0,male,31,English,English,0,0,Graduate Degree,",WhichEnglish",0,0.0,0.0,0,0,1,1,native,0.000000,0.978947
1,male,30,English,English,0,1,Undergraduate Degree (3-5 years higher ed),",WhichEnglish",0,0.0,0.0,0,0,1,1,native,0.000000,1.000000
2,male,30,English,English,0,1,Undergraduate Degree (3-5 years higher ed),",WhichEnglish",0,0.0,0.0,0,0,1,1,native,0.000000,1.000000
3,male,19,English,English,0,0,Some Undergrad (higher ed),",WhichEnglish",0,0.0,0.0,0,0,1,1,native,0.000000,0.905263
4,male,20,"Chinese/Mandarin, English","Chinese/Mandarin, English",0,0,High School Degree (12-13 years),",WhichEnglish",0,0.0,0.0,0,0,1,1,native,0.000000,0.947368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669492,female,20,English,English,0,0,Some Undergrad (higher ed),",WhichEnglish",0,0.0,0.0,0,0,1,1,native,0.000000,0.989474
669493,female,22,Russian,Russian,0,0,Graduate Degree,",WhichEnglish",7,0.0,0.0,0,0,0,0,late,0.000000,0.831579
669494,female,22,Polish,"English, Polish",0,0,Graduate Degree,",WhichEnglish",6,0.0,0.0,0,0,0,0,late,0.000000,0.968421
669496,male,27,"Chinese/Mandarin, English, Japanese","Chinese/Mandarin, English",0,0,Graduate Degree,",WhichEnglish",8,14.0,1.0,0,0,1,1,late,0.736842,0.915789


In [6]:
# number of NaN values in each column
nan_counts = df.isna().sum()
print(nan_counts)

gender                  0
age                     0
natlangs                0
primelangs              0
dyslexia                0
psychiatric             0
education               0
tests                   0
Eng_start               0
Eng_country_yrs         0
house_Eng               0
dictionary              0
already_participated    0
nat_Eng                 0
prime_Eng               0
speaker_cat             0
Lived_Eng_per           0
correct                 0
dtype: int64


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 661480 entries, 0 to 669497
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   gender                661480 non-null  object 
 1   age                   661480 non-null  int64  
 2   natlangs              661480 non-null  object 
 3   primelangs            661480 non-null  object 
 4   dyslexia              661480 non-null  int64  
 5   psychiatric           661480 non-null  int64  
 6   education             661480 non-null  object 
 7   tests                 661480 non-null  object 
 8   Eng_start             661480 non-null  int64  
 9   Eng_country_yrs       661480 non-null  float64
 10  house_Eng             661480 non-null  float64
 11  dictionary            661480 non-null  int64  
 12  already_participated  661480 non-null  int64  
 13  nat_Eng               661480 non-null  int64  
 14  prime_Eng             661480 non-null  int64  
 15  speak

In [8]:
df.describe()

Unnamed: 0,age,dyslexia,psychiatric,Eng_start,Eng_country_yrs,house_Eng,dictionary,already_participated,nat_Eng,prime_Eng,Lived_Eng_per,correct
count,661480.0,661480.0,661480.0,661480.0,661480.0,661480.0,661480.0,661480.0,661480.0,661480.0,661480.0,661480.0
mean,29.977153,0.0,0.030795,5.144555,2.507642,0.109772,0.0,0.0,0.440875,0.439257,0.080307,0.937781
std,11.263294,0.0,0.172761,5.696674,7.976607,0.312606,0.0,0.0,0.496492,0.496297,0.234211,0.060947
min,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063158
25%,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.915789
50%,27.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.957895
75%,35.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.978947
max,89.0,0.0,1.0,74.0,88.0,1.0,0.0,0.0,1.0,1.0,1.1,1.0


# EDA


## Column data types and first few rows

In [9]:
data_types = train_df.dtypes 
first_few_rows = train_df.head()  

print(data_types)
first_few_rows

gender                   object
age                       int64
natlangs                 object
primelangs               object
dyslexia                  int64
psychiatric               int64
education                object
tests                    object
Eng_start                 int64
Eng_country_yrs         float64
house_Eng               float64
dictionary                int64
already_participated      int64
countries                object
currcountry              object
US_region                object
UK_region                object
Can_region               object
Ebonics                  object
Ir_region                object
UK_constituency          object
nat_Eng                   int64
prime_Eng                 int64
speaker_cat              object
type                     object
Lived_Eng_per           float64
Eng_little               object
correct                 float64
dtype: object


Unnamed: 0,gender,age,natlangs,primelangs,dyslexia,psychiatric,education,tests,Eng_start,Eng_country_yrs,...,Ebonics,Ir_region,UK_constituency,nat_Eng,prime_Eng,speaker_cat,type,Lived_Eng_per,Eng_little,correct
413443,female,35,Hungarian,"French , Hungarian",0,0,Some Undergrad (higher ed),",WhichEnglish",8,0.0,...,,,,0,0,late,Hungarian,0.0,little,0.926316
385270,male,24,Polish,Polish,0,0,Graduate Degree,",WhichEnglish",16,0.0,...,,,,0,0,late,Polish,0.0,little,0.726316
444091,female,29,English,"English, Japanese",0,1,Undergraduate Degree (3-5 years higher ed),",WhichEnglish",0,,...,,,,1,1,native,,,monoeng,0.947368
21235,male,49,Finnish,Finnish,0,0,Undergraduate Degree (3-5 years higher ed),",WhichEnglish",9,0.0,...,,,,0,0,late,Finnish,0.0,little,0.968421
39273,male,23,"Chinese/Hokkien, Chinese/Mandarin, English","Chinese/Hokkien, English",0,0,Graduate Degree,",WhichEnglish",0,,...,,,,1,1,native,,,bileng,0.768421


## Distributions of Numeric Columns

In [19]:
alt.data_transformers.enable('vegafusion')

numeric_cols = train_df.select_dtypes(include='number').columns.tolist()

numeric_cols_dist = alt.Chart(train_df).mark_bar().encode(
    x=alt.X(alt.repeat("repeat"), type="quantitative"),
    y="count()"
).properties(
    width=250,
    height=150
).repeat(
    repeat=numeric_cols,
    columns=3
)

numeric_cols_dist

In [15]:
splom = alt.Chart(train_df).mark_point(opacity = 0.4, size = 10).encode(
    alt.X(alt.repeat('row'), type = 'quantitative'),
    alt.Y(alt.repeat('column'), type = 'quantitative')
).properties(
    width = 200,
    height = 140
).repeat(
    column = numeric_cols,
    row = numeric_cols
)
splom

## Correlation matrix

In [12]:
train_df.select_dtypes("number").corr('spearman').style.background_gradient()

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,age,dyslexia,psychiatric,Eng_start,Eng_country_yrs,house_Eng,dictionary,already_participated,nat_Eng,prime_Eng,Lived_Eng_per,correct
age,1.0,,-0.033521,0.050099,0.152968,0.07233,,,0.014748,0.013632,0.139886,0.09715
dyslexia,,,,,,,,,,,,
psychiatric,-0.033521,,1.0,0.003514,-0.016167,0.000474,,,0.005295,0.005265,-0.015314,-0.08606
Eng_start,0.050099,,0.003514,1.0,-0.301905,-0.303483,,,-0.81563,-0.812789,-0.250905,-0.463189
Eng_country_yrs,0.152968,,-0.016167,-0.301905,1.0,0.542939,,,0.106523,0.106523,0.995536,0.294189
house_Eng,0.07233,,0.000474,-0.303483,0.542939,1.0,,,0.114003,0.114003,0.52131,0.214195
dictionary,,,,,,,,,,,,
already_participated,,,,,,,,,,,,
nat_Eng,0.014748,,0.005295,-0.81563,0.106523,0.114003,,,1.0,0.996815,0.085984,0.405891
prime_Eng,0.013632,,0.005265,-0.812789,0.106523,0.114003,,,0.996815,1.0,0.085984,0.404297
