In [54]:
from matplotlib import pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
from collections import Counter             # Used to tally the languages
from itertools import chain

In [55]:
# The survey data comes in two versions: an edited (slimmed-down) file and the full, unedited file.
file = "Resources/survey_results_2021_edited.csv"   # Path to the slimmed-down version
df = pd.read_csv(file)                              # Load it into the pandas DataFrame `df`
df.head(1)                                          # Display the first row


Unnamed: 0,ResponseId,MainBranch,Country,US_State,UK_Country,LanguageHaveWorkedWith,LanguageWantToWorkWith,DatabaseHaveWorkedWith,DatabaseWantToWorkWith,Age,Gender,Sexuality,Ethnicity
0,1,I am a developer by profession,Slovakia,,,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift,PostgreSQL;SQLite,SQLite,25-34 years old,Man,Straight / Heterosexual,White or of European descent


In [56]:
# Second file that we could use: contains all of the data, with no columns removed.
full_file = "Resources/survey_results_public.csv"   # Path to the unedited file
full_df = pd.read_csv(full_file)                    # Load it into the pandas DataFrame `full_df`
full_df.head(1)                                     # Display the first row

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,62268.0


In [57]:
# Columns of df that will not be used.  Might keep this?
drop = ['Sexuality', 'Ethnicity', 'DatabaseHaveWorkedWith', 'DatabaseWantToWorkWith']

# Each step returns a new frame and the final result is bound back to `df`.
# dropna() is not in-place by default: calling it without keeping the result
# removes nothing, so its result has to be assigned.
# errors='ignore' together with renaming *before* the dropna lets this cell be
# re-run without a KeyError once the columns are already dropped/renamed.
df = (
    df
    .drop(columns=drop, errors='ignore')
    .rename(columns={
        'MainBranch': 'Main Branch',
        'LanguageHaveWorkedWith': 'Languages Learned',
        'LanguageWantToWorkWith': 'Future Goal Language',
    })
    # Preparing to split the languages up so we can count how many people use
    # which language: rows with no language answer are removed here.
    .dropna(subset=['Languages Learned'])
)

In [58]:
# Preview of the data frame with missing values (NaN) shown as "Not Applicable".
# The result is only displayed, not stored, so df itself is left unchanged.
# (commented-out alternative kept from the original: df.fillna('Not Applicable'))
df.replace(to_replace=np.nan, value="Not Applicable").head(5)



Unnamed: 0,ResponseId,Main Branch,Country,US_State,UK_Country,Languages Learned,Future Goal Language,Age,Gender
0,1,I am a developer by profession,Slovakia,Not Applicable,Not Applicable,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift,25-34 years old,Man
1,2,I am a student who is learning to code,Netherlands,Not Applicable,Not Applicable,JavaScript;Python,Not Applicable,18-24 years old,Man
2,3,"I am not primarily a developer, but I write co...",Russian Federation,Not Applicable,Not Applicable,Assembly;C;Python;R;Rust,Julia;Python;Rust,18-24 years old,Man
3,4,I am a developer by profession,Austria,Not Applicable,Not Applicable,JavaScript;TypeScript,JavaScript;TypeScript,35-44 years old,Man
4,5,I am a developer by profession,United Kingdom of Great Britain and Northern I...,Not Applicable,England,Bash/Shell;HTML/CSS;Python;SQL,Bash/Shell;HTML/CSS;Python;SQL,25-34 years old,Man


In [59]:
# TODO: maybe wrap this into a function if the same tally is needed for another column

# Pull the 'Languages Learned' series and find the top 20 most popular languages.
find_top_langs = df['Languages Learned']            # Isolate column to work with

# One list per row: 'C++;Python;PHP' -> ['C++', 'Python', 'PHP'].
# Missing answers (NaN) are skipped on purpose: str(NaN) is the text 'nan',
# which would otherwise be tallied as if it were a language.
temp_list = [str(langs).split(';') for langs in find_top_langs.dropna()]

# Flatten the list of lists into one giant 1-d list of language names.
flat_list = list(chain.from_iterable(temp_list))

# Tally how many times each language appears.
cnt = Counter(flat_list)

# cnt                                 # Outputs the data as a dictionary

# Convert to a dataframe to sort: column 0 is the language, column 1 is its count.
# Naming the columns explicitly keeps sort_values working even if the tally is empty.
cnt_df = pd.DataFrame(list(cnt.items()), columns=[0, 1]).sort_values(by=[1], ascending=False)

# Renumber the rows 0..n-1 in sorted order so the displayed index doubles as a rank.
cnt_df.reset_index(drop=True).head(20)

Unnamed: 0,0,1
0,JavaScript,53587
1,HTML/CSS,46259
2,Python,39792
3,SQL,38835
4,Java,29162
5,Node.js,27975
6,TypeScript,24909
7,C#,22984
8,Bash/Shell,22385
9,C++,20057
