In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the survey CSV from the Kaggle input directory into a DataFrame.
survey_csv_path = '/kaggle/input/covid19-education-impact-survey/open_one_time_covid_education_impact.csv'
df = pd.read_csv(survey_csv_path)

In [None]:
# Preview the first five rows.
df.head(n=5)

# 1. Visualization

In [None]:
# Draw one horizontal count plot per listed column, each in its own figure.
count_columns = [
    "gender",
    "age",
    "geography",
    "financial_situation",
    "education",
    "employment_status",
    "submission_state",
]

for column in count_columns:
    # A fresh figure per column keeps the plots from stacking on one axes.
    plt.figure()
    ax = sns.countplot(y=column, data=df)
    ax.set_title(column)
    plt.show()

In [None]:
# Names of the survey-question columns that get one histogram panel each.
num_list = [
    "are_there_children_0_to_2_yrs_out_of_educational_system",
    "were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school",
    "are_there_children_who_stopped_enrolling_in_primary_education",
    "are_there_children_who_stopped_enrolling_in_secondary_education",
    "are_children_attending_face_to_face_classes",
    "can_children_observe_deterioration_of_basic_services_of_school",
    "do_children_3_and_17_yrs_receive_regular_school_meals",
    "are_there_teachers_at_scheduled_class_hours",
    "are_children_3_to_17_yrs_dealing_with_irregular_school_activity",
    "are_children_being_teached_by_unqualified_people",
    "did_teachers_leave_the_educational_system",
    "do_school_and_the_teachers_have_internet_connection",
    "do_children_have_internet_connection",
    "do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity",
    "does_home_shows_severe_deficit_of_electricity",
    "does_home_shows_severe_deficit_of_internet",
    "do_children_3_to_17_yrs_miss_class_or_in_lower_grade",
    "are_children_promoted_with_a_modality_different_from_formal_evaluation",
]

In [None]:
# Histogram grid: one panel per column in num_list, two panels per row.
n_cols = 2
# Ceiling division keeps the row count an integer (plt.subplot does not accept
# the float produced by `len(num_list)/2`) and leaves room for a last panel
# when the number of columns is odd.
n_rows = (len(num_list) + n_cols - 1) // n_cols

fig = plt.figure(figsize=(20,20))

for i, column in enumerate(num_list):
    plt.subplot(n_rows, n_cols, i+1)
    plt.title(column)
    # Missing values are dropped first so they cannot break the binning.
    plt.hist(df[column].dropna())

plt.tight_layout()

# 2. Clustering and PCA

# 1) Try to increase the number of features with `get_dummies`

In [None]:
# Remove the submission_date column before encoding.
df = df.drop(columns=['submission_date'])

In [None]:
# One-hot encode the frame into indicator columns.
df1 = pd.get_dummies(data=df)

In [None]:
# Display the frame produced by get_dummies.
df1

In [None]:
# List the distinct column labels of the encoded frame, in order of appearance.
df1.columns.drop_duplicates()

In [None]:
# Pairwise correlations of the encoded columns, drawn on a 20x20 inch canvas.
encoded_corr = df1.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(encoded_corr)

In [None]:
from sklearn.cluster import KMeans

# Features: every column after the first. A 'cluster' column left over from an
# earlier run of the notebook is excluded so that re-running this cell does not
# feed old labels back into the clustering.
kmeans_features_df1 = df1.iloc[:,1:].drop(columns=['cluster'], errors='ignore')

# Fixed seed: KMeans starts from randomly chosen centroids, so without it the
# cluster numbering (and every plot grouped by it) can change between runs.
clust_df1 = KMeans(n_clusters=4, random_state=42).fit_predict(kmeans_features_df1)

# Reuse df1's index so the labels line up row-for-row when assigned back.
clust_df1 = pd.DataFrame(clust_df1, columns=['cluster'], index=df1.index)

In [None]:
# Attach the KMeans labels, then chart the number of submission ids per cluster.
df1['cluster'] = clust_df1['cluster']
cluster_sizes_df1 = df1.groupby('cluster')['submission_id'].count()
cluster_sizes_df1.plot.bar()

In [None]:
# Per-cluster mean of each column (the first column of the result is skipped),
# shown as grouped bars.
cluster_means_df1 = df1.groupby('cluster').mean()
cluster_means_df1.iloc[:,1:].plot.bar(figsize=(15,10))
# Anchor the legend to the right of the axes, with a small font.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0, fontsize=5)

In [None]:
from sklearn.decomposition import TruncatedSVD,PCA

# PCA input: every column after the first, minus 'cluster'. The cluster column
# holds KMeans output rather than an input feature, so leaving it in would let
# the labels shape the components they are later plotted against.
pca_features_df1 = df1.iloc[:,1:].drop(columns=['cluster'], errors='ignore')

# Project the encoded features onto 4 principal components.
pca = PCA(4)
pca_df1 = pca.fit_transform(pca_features_df1)
# Reuse df1's index so the label assignment below aligns row-for-row.
pca_df1 = pd.DataFrame(pca_df1, columns=['PC1','PC2','PC3','PC4'], index=df1.index)
pca_df1['cluster'] = df1['cluster']

In [None]:
# Display the principal-component scores together with the cluster column.
pca_df1

In [None]:
# Bars: explained-variance ratio of each component. Red line: running total.
pc_labels = ['PC1','PC2','PC3','PC4']
variance_ratio = pca.explained_variance_ratio_
cumulative_ratio = np.cumsum(variance_ratio)

sns.barplot(x=pc_labels, y=variance_ratio)
sns.pointplot(x=pc_labels, y=cumulative_ratio, lw=5, legend=True, label='Cumulative', color='tab:red')
plt.ylabel('Explained Variance')
plt.show()

# It seems difficult to explain the variance with 4 components when the features are simply one-hot encoded with `get_dummies`

# 2) Try mapping each categorical feature to an ordinal number. For example, the highest education level is 5 and no education is 0.

In [None]:
# Remove the submission_state column before the ordinal encoding.
df = df.drop(columns='submission_state')

In [None]:
# Ordinal code tables: one {answer text -> integer} dict per column.
# Series.map turns any answer that is missing from its table into NaN.
ordinal_codes = {
    'gender': {
        'Female':1,
        'Male':2,
        'Prefer not to answer':0,
        'Non-Binary':0,
        'Not Available':0,
    },
    'age': {
        'Over 45 years old':5,
        '26 to 35 years old':3,
        '36 to 45 years old':4,
        '16 to 25 years old':2,
        'Under 16':1,
        'Not Available':0,
    },
    'geography': {
        'Suburban/Peri-urban':2,
        'City center or metropolitan area':3,
        'Rural':1,
        'Not Available':0,
    },
    'financial_situation': {
        'I can afford food and regular expenses, but nothing else':3,
        'I cannot afford enough food for my family':1,
        'I can comfortably afford food, clothes, and furniture, and I have savings':6,
        'I can afford food, but nothing else':2,
        'I can comfortably afford food, clothes, and furniture, but I don’t have savings':5,
        'I can afford food, regular expenses, and clothes, but nothing else':4,
        'Prefer not to answer':0,
        'Not Available':0,
    },
    'education': {
        'University or college degree completed':4,
        'Technical school diploma or degree completed':5,
        'Some technical education (e.g polytechnic school)':3,
        'Some university or college':4,
        'Secondary school/ high school completed':2,
        'Primary school completed':1,
        'Some primary education':1,
        'Some secondary school / high school':2,
        'Post-graduate education':5,
        'Prefer not to answer':0,
        'No formal education':0,
        'Not available':0,
        'College or university':4,
        'Post graduate':5,
        'Technical school':3,
        'Secondary/high school':2,
    },
    'employment_status': {
        'I am unemployed':0,
        'I work full-time, either as an employee or self-employed':6,
        'I do housework, fulfilling domestic tasks, looking after children':4,
        'I work part-time, either as an employee or self-employed':2,
        'I am retired':3,
        'I am a student and I work part-time':2,
        'I am a student':1,
        'None of the above':0,
        'I am doing community or military service':5,
        'I am unable to work due to long-term illness or disability':0,
        'Not Available':0,
        'Employed full-time':6,
        'Retired':3,
        'Student':1,
        'Unemployed':0,
    },
    'do_children_3_and_17_yrs_receive_regular_school_meals': {
        'Every day':5,
        'No':0,
        '2 days':2,
        '4 days':4,
        '1 day':1,
        '3 days':3,
    },
    'are_there_teachers_at_scheduled_class_hours': {
        'Irregularly':0,
        'There are not enough':1,
        'There are enough':2,
    },
}

# Replace each listed column with its integer codes, one column at a time.
for column_name, code_table in ordinal_codes.items():
    df[column_name] = df[column_name].map(code_table)

In [None]:
# Display the frame after the ordinal mapping.
df

In [None]:
# Annotated correlation heatmap of every column from the third one onward.
ordinal_corr = df.iloc[:,2:].corr()
plt.figure(figsize=(20, 20))
sns.heatmap(ordinal_corr, annot=True)

In [None]:
from sklearn.cluster import KMeans

# Features: every column from the third one onward. A 'cluster' column left
# over from an earlier run of the notebook is excluded so that re-running this
# cell does not feed old labels back into the clustering.
kmeans_features = df.iloc[:,2:].drop(columns=['cluster'], errors='ignore')

# Fixed seed: KMeans starts from randomly chosen centroids, so without it the
# cluster numbering (and every plot grouped by it) can change between runs.
clust_df = KMeans(n_clusters=4, random_state=42).fit_predict(kmeans_features)

# Reuse df's index so the labels line up row-for-row when assigned back.
clust_df = pd.DataFrame(clust_df, columns=['cluster'], index=df.index)

In [None]:
# Attach the KMeans labels to the ordinal-encoded frame (aligned on the index).
df = df.assign(cluster=clust_df['cluster'])

In [None]:
# Number of submission ids per cluster, as a bar chart.
cluster_sizes = df.groupby('cluster')['submission_id'].count()
cluster_sizes.plot.bar()

In [None]:
# Per-cluster mean of each column (the first column of the result is skipped),
# shown as grouped bars.
cluster_means = df.groupby('cluster').mean()
cluster_means.iloc[:,1:].plot.bar(figsize=(15,10))
# Anchor the legend to the right of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0, fontsize=10)

In [None]:
from sklearn.decomposition import TruncatedSVD,PCA

# PCA input: every column from the third one onward, minus 'cluster'. The
# cluster column holds KMeans output rather than an input feature, so leaving
# it in would let the labels shape the components they are later plotted against.
pca_features = df.iloc[:,2:].drop(columns=['cluster'], errors='ignore')

# Project the ordinal-encoded features onto 2 principal components.
pca = PCA(2)
pca_df = pca.fit_transform(pca_features)
# Reuse df's index so the column copies below align row-for-row.
pca_df = pd.DataFrame(pca_df, columns=['PC1','PC2'], index=df.index)

# Carry these columns along with the scores for use in the scatter plots.
carried_columns = ['gender', 'age', 'geography', 'financial_situation',
                   'education', 'employment_status', 'cluster']
for column in carried_columns:
    pca_df[column] = df[column]

In [None]:
# Display the two principal-component scores with the copied columns.
pca_df

In [None]:
# Bars: explained-variance ratio of each component. Red line: running total.
component_labels = ['PC_1','PC_2']
variance_ratio = pca.explained_variance_ratio_
cumulative_ratio = np.cumsum(variance_ratio)

sns.barplot(x=component_labels, y=variance_ratio)
sns.pointplot(x=component_labels, y=cumulative_ratio, lw=5, legend=True, label='Cumulative', color='tab:red')
plt.ylabel('Explained Variance')
plt.show()

In [None]:
# PC1 vs PC2 scatter plots coloured by cluster. Each pass through the loop
# draws one figure in which the marker size encodes a different column.
size_columns = ['geography', 'financial_situation', 'education', 'employment_status']

for size_column in size_columns:
    plt.figure(figsize=(10, 5))
    sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], size=pca_df[size_column], hue=pca_df['cluster'], sizes=(10, 200))
    plt.title(size_column)
    # Anchor the legend to the right of the axes so it does not cover points.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0, fontsize=10)
    plt.show()