# 1. Import Dependencies and Dataset

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [11]:
# Input files, relative to the notebook's working directory.
survey_csv = './data/survey_results_public.csv'
schema_csv = './data/survey_results_schema.csv'

# Load the survey results into `df` and the accompanying schema file into `df_schema`.
df = pd.read_csv(survey_csv)
df_schema = pd.read_csv(schema_csv)

# 2. Understand the Acquired Data

In [14]:
# Size of the loaded survey table as (number of rows, number of columns).
df.shape

(89184, 84)

In [15]:
# Preview the first five rows, transposed so that each column of `df` becomes a row —
# easier to scan than a very wide table.
df.head().T

Unnamed: 0,0,1,2,3,4
ResponseId,1,2,3,4,5
Q120,I agree,I agree,I agree,I agree,I agree
MainBranch,None of these,I am a developer by profession,I am a developer by profession,I am a developer by profession,I am a developer by profession
Age,18-24 years old,25-34 years old,45-54 years old,25-34 years old,25-34 years old
Employment,,"Employed, full-time","Employed, full-time","Employed, full-time","Employed, full-time;Independent contractor, fr..."
...,...,...,...,...,...
ProfessionalTech,,DevOps function;Microservices;Automated testin...,DevOps function;Microservices;Automated testin...,Automated testing;Continuous integration (CI) ...,Microservices;Automated testing;Observability ...
Industry,,"Information Services, IT, Software Development...","Information Services, IT, Software Development...",,Other
SurveyLength,,Appropriate in length,Appropriate in length,Appropriate in length,Appropriate in length
SurveyEase,,Easy,Easy,Easy,Neither easy nor difficult


In [16]:
# Per-column summary: non-null count and dtype for every column of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89184 entries, 0 to 89183
Data columns (total 84 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ResponseId                           89184 non-null  int64  
 1   Q120                                 89184 non-null  object 
 2   MainBranch                           89184 non-null  object 
 3   Age                                  89184 non-null  object 
 4   Employment                           87898 non-null  object 
 5   RemoteWork                           73810 non-null  object 
 6   CodingActivities                     73764 non-null  object 
 7   EdLevel                              87973 non-null  object 
 8   LearnCode                            87663 non-null  object 
 9   LearnCodeOnline                      70084 non-null  object 
 10  LearnCodeCoursesCert                 37076 non-null  object 
 11  YearsCode                   

In [29]:
# Count missing values per column.

# Kept as a notebook-wide setting from the original cell: later cells may rely on it
# when displaying wide frames.
pd.set_option('display.max_columns', None)

# `df.isna().sum()` is a Series with one row per column of `df`, so whether the listing
# is truncated is governed by `display.max_rows`, not `display.max_columns`.
# Lift the row limit for this output only, so the complete per-column listing is shown
# on a fresh kernel without changing how later cells render long frames.
with pd.option_context('display.max_rows', None):
    display(df.isna().sum())

ResponseId                                 0
Q120                                       0
MainBranch                                 0
Age                                        0
Employment                              1286
RemoteWork                             15374
CodingActivities                       15420
EdLevel                                 1211
LearnCode                               1521
LearnCodeOnline                        19100
LearnCodeCoursesCert                   52108
YearsCode                               1749
YearsCodePro                           23048
DevType                                12312
OrgSize                                24141
PurchaseInfluence                      24220
TechList                               28333
BuyNewTool                              6175
Country                                 1211
Currency                               23850
CompTotal                              40959
LanguageHaveWorkedWith                  2044
LanguageWa

In [26]:
# Columns whose missing-value ratio is reported below (80 names; ResponseId, Q120,
# MainBranch and Age are not included). "HaveWorkedWith" / "WantToWorkWith" pairs are
# kept together on one line; the order of the names determines the print order.
columns_to_check = [
    'Employment', 'RemoteWork', 'CodingActivities', 'EdLevel',
    'LearnCode', 'LearnCodeOnline', 'LearnCodeCoursesCert',
    'YearsCode', 'YearsCodePro',
    'DevType', 'OrgSize', 'PurchaseInfluence', 'TechList', 'BuyNewTool',
    'Country', 'Currency', 'CompTotal',
    'LanguageHaveWorkedWith', 'LanguageWantToWorkWith',
    'DatabaseHaveWorkedWith', 'DatabaseWantToWorkWith',
    'PlatformHaveWorkedWith', 'PlatformWantToWorkWith',
    'WebframeHaveWorkedWith', 'WebframeWantToWorkWith',
    'MiscTechHaveWorkedWith', 'MiscTechWantToWorkWith',
    'ToolsTechHaveWorkedWith', 'ToolsTechWantToWorkWith',
    'NEWCollabToolsHaveWorkedWith', 'NEWCollabToolsWantToWorkWith',
    'OpSysPersonal use', 'OpSysProfessional use',
    'OfficeStackAsyncHaveWorkedWith', 'OfficeStackAsyncWantToWorkWith',
    'OfficeStackSyncHaveWorkedWith', 'OfficeStackSyncWantToWorkWith',
    'AISearchHaveWorkedWith', 'AISearchWantToWorkWith',
    'AIDevHaveWorkedWith', 'AIDevWantToWorkWith',
    'NEWSOSites', 'SOVisitFreq', 'SOAccount', 'SOPartFreq', 'SOComm', 'SOAI',
    'AISelect', 'AISent', 'AIAcc', 'AIBen',
    'AIToolInterested in Using', 'AIToolCurrently Using', 'AIToolNot interested in Using',
    'AINextVery different', 'AINextNeither different nor similar',
    'AINextSomewhat similar', 'AINextVery similar', 'AINextSomewhat different',
    'TBranch', 'ICorPM', 'WorkExp',
    'Knowledge_1', 'Knowledge_2', 'Knowledge_3', 'Knowledge_4',
    'Knowledge_5', 'Knowledge_6', 'Knowledge_7', 'Knowledge_8',
    'Frequency_1', 'Frequency_2', 'Frequency_3',
    'TimeSearching', 'TimeAnswering',
    'ProfessionalTech', 'Industry', 'SurveyLength', 'SurveyEase',
    'ConvertedCompYearly',
]

# Percentage of rows with a missing value for each listed column: the mean of the
# boolean `isna()` mask equals (missing count / number of rows).
missing_ratios = df[columns_to_check].isna().mean() * 100

# One line per column, in list order, rounded to two decimals.
for column, missing_ratio in missing_ratios.items():
    print(f"{column} 변수의 결측값 비율: {missing_ratio:.2f}%")


Employment 변수의 결측값 비율: 1.44%
RemoteWork 변수의 결측값 비율: 17.24%
CodingActivities 변수의 결측값 비율: 17.29%
EdLevel 변수의 결측값 비율: 1.36%
LearnCode 변수의 결측값 비율: 1.71%
LearnCodeOnline 변수의 결측값 비율: 21.42%
LearnCodeCoursesCert 변수의 결측값 비율: 58.43%
YearsCode 변수의 결측값 비율: 1.96%
YearsCodePro 변수의 결측값 비율: 25.84%
DevType 변수의 결측값 비율: 13.81%
OrgSize 변수의 결측값 비율: 27.07%
PurchaseInfluence 변수의 결측값 비율: 27.16%
TechList 변수의 결측값 비율: 31.77%
BuyNewTool 변수의 결측값 비율: 6.92%
Country 변수의 결측값 비율: 1.36%
Currency 변수의 결측값 비율: 26.74%
CompTotal 변수의 결측값 비율: 45.93%
LanguageHaveWorkedWith 변수의 결측값 비율: 2.29%
LanguageWantToWorkWith 변수의 결측값 비율: 9.50%
DatabaseHaveWorkedWith 변수의 결측값 비율: 17.66%
DatabaseWantToWorkWith 변수의 결측값 비율: 31.70%
PlatformHaveWorkedWith 변수의 결측값 비율: 28.66%
PlatformWantToWorkWith 변수의 결측값 비율: 42.47%
WebframeHaveWorkedWith 변수의 결측값 비율: 24.94%
WebframeWantToWorkWith 변수의 결측값 비율: 36.38%
MiscTechHaveWorkedWith 변수의 결측값 비율: 36.07%
MiscTechWantToWorkWith 변수의 결측값 비율: 47.47%
ToolsTechHaveWorkedWith 변수의 결측값 비율: 12.67%
ToolsTechWantToWorkWith 

# 3. Write Worksheets

# 4. Data Preprocessing

### How to preprocess missing values

# 5. Data Visualization (Data Analysis)

# 6. Conclusion and Summary