Grouping and Aggregating - Analyzing and Exploring Your Data

In [21]:
import pandas as pd

In [22]:
# Load the administrative-division lookup tables; each frame is indexed by its own id column.
region = pd.read_csv(
    'Data/administrative-divisions/Region.csv',
    index_col='region_id',
)
district = pd.read_csv(
    'Data/administrative-divisions/Districts.csv',
    index_col='district_id',
    # 'Int64' is pandas' nullable integer dtype (presumably chosen because the
    # region references can be missing -- verify against the CSV).
    dtype={'district_id': 'Int64', 'old_region': 'Int64', 'new_region': 'Int64'},
)
local_types = pd.read_csv(
    'Data/administrative-divisions/LocalBodyTypes.csv',
    index_col='local_body_type_id',
)
local_bodies = pd.read_csv(
    'Data/administrative-divisions/localBodies.csv',
    index_col='local_body_id',
)

In [23]:
# Boolean mask selecting the 'Bagmati' row(s) whose old_new flag equals 1
# (presumably 1 marks the new region structure -- confirm against Region.csv).
filter1 = region['Region'].eq('Bagmati') & region['old_new'].eq(1)
# region_id label(s) of the matching row(s); used to look up districts next.
filter1_index = region[filter1].index
filter1_index

Int64Index([1103], dtype='int64', name='region_id')

In [24]:
# Districts whose new_region equals the first region_id found for Bagmati.
bag_dis_filter = district['new_region'].eq(filter1_index[0])
# Keep only the District name column (as a one-column DataFrame).
bagmati_districts = district.loc[bag_dis_filter, ['District']]
# Display the names in reverse alphabetical order; bagmati_districts itself is not reordered.
bagmati_districts.sort_values('District', ascending=False)

Unnamed: 0_level_0,District
district_id,Unnamed: 1_level_1
100108,Sindhupalchok
100206,Sindhuli
100107,Rasuwa
100204,Ramechhap
100106,Nuwakot
100303,Makwanpur
100105,Lalitpur
100104,Kavrepalanchok
100103,Kathmandu
100202,Dolakha


In [25]:
# Goal: list every local body of Kavrepalanchok.
# Step 1: find the district_id of Kavrepalanchok among the Bagmati districts.
kavrefilter = bagmati_districts['District'].eq('Kavrepalanchok')
kavre_index = bagmati_districts[kavrefilter].index
kavre_index

Index([100104], dtype='object', name='district_id')

In [26]:
# Local bodies whose district_id matches the Kavrepalanchok id found above.
kavre_local_filter = local_bodies['district_id'].eq(kavre_index[0])
kavre_local = local_bodies.loc[
    kavre_local_filter,
    ['local_body', 'local_body_type_id', 'max_ward'],
]
kavre_local

Unnamed: 0_level_0,local_body,local_body_type_id,max_ward
local_body_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1001045001,Banepa,105,
1001045002,Dhulikhel,105,
1001045003,Panauti,105,
1001047001,Anekot,107,9.0
1001047002,Balthali,107,9.0
...,...,...,...
1101045161,Namobuddha,105,11.0
1101045162,Panauti,105,12.0
1101045163,Panchkhal,105,13.0
1101049466,Roshi,119,12.0


In [27]:
# Small hand-written example frame to experiment with before using real data.
people = {
    "first": [
        "Anish",
        "Ramish",
        "Samish",
        "Bamish",
        "Bamish",
    ],
    "last": [
        "Khadka",
        "Mainali",
        "Shrestha",
        "Karki",
        "Mainali",
    ],
    "email": [
        "anishramish56@gmail.com",
        "mainaliramish89@gmail.com",
        "shresthasamish28@gmail.com",
        "bamishkarki819@gmail.com",
        "bamishmainali78@gmail.com",
    ],
}
# One column per dict key, one row per list position.
mydf = pd.DataFrame(data=people)
mydf

Unnamed: 0,first,last,email
0,Anish,Khadka,anishramish56@gmail.com
1,Ramish,Mainali,mainaliramish89@gmail.com
2,Samish,Shrestha,shresthasamish28@gmail.com
3,Bamish,Karki,bamishkarki819@gmail.com
4,Bamish,Mainali,bamishmainali78@gmail.com


In [28]:
# Aggregation: combining multiple pieces of data into a single result, e.g.
# df['column_name'].median()      # median of a single column
# df.median(numeric_only=True)    # median of every numeric column in the frame
# df.describe()                   # summary statistics for the numeric columns


In [29]:
# Load the Stack Overflow 2024 survey: the responses (indexed by ResponseId)
# and the schema table describing each question (indexed by qname).
df = pd.read_csv(
    'Data/stack-overflow-developer-survey-2024/survey_results_public.csv',
    index_col='ResponseId',
)
schema_df = pd.read_csv(
    'Data/stack-overflow-developer-survey-2024/survey_results_schema.csv',
    index_col='qname',
)

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE: CompTotal reaches 1e+150 in the output, so its mean and std are dominated
# by that outlier; the quartiles are the more informative figures for that column.
df.describe()

Unnamed: 0,CompTotal,WorkExp,JobSatPoints_1,JobSatPoints_4,JobSatPoints_5,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,ConvertedCompYearly,JobSat
count,33740.0,29658.0,29324.0,29393.0,29411.0,29450.0,29448.0,29456.0,29456.0,29450.0,29445.0,23435.0,29126.0
mean,2.963841e+145,11.466957,18.581094,7.52214,10.060857,24.343232,22.96522,20.278165,16.169432,10.955713,9.953948,86155.29,6.935041
std,5.444117e+147,9.168709,25.966221,18.422661,21.833836,27.08936,27.01774,26.10811,24.845032,22.906263,21.775652,186757.0,2.088259
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,60000.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32712.0,6.0
50%,110000.0,9.0,10.0,0.0,0.0,20.0,15.0,10.0,5.0,0.0,0.0,65000.0,7.0
75%,250000.0,16.0,22.0,5.0,10.0,30.0,30.0,25.0,20.0,10.0,10.0,107971.5,8.0
max,1e+150,50.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,16256600.0,10.0


In [31]:
# Median of ConvertedCompYearly; missing values are skipped by default.
df['ConvertedCompYearly'].median() # median salary

65000.0

In [32]:
# Number of non-null ConvertedCompYearly values, i.e. how many responses the median is based on.
df['ConvertedCompYearly'].count()

23435

In [33]:
# How often each distinct ConvertedCompYearly value occurs, most frequent first.
df['ConvertedCompYearly'].value_counts()

64444.0     321
53703.0     308
75184.0     230
85925.0     226
107406.0    208
           ... 
10782.0       1
28821.0       1
107818.0      1
29546.0       1
48742.0       1
Name: ConvertedCompYearly, Length: 6113, dtype: int64

In [38]:
# Group the survey responses by country.
# The key is passed as a plain column name rather than a one-element list:
# with a list key, recent pandas versions treat each group name as a 1-tuple,
# so a scalar lookup such as get_group('United States of America') triggers a
# FutureWarning and is rejected in later releases. A scalar key keeps group
# names as plain strings on every pandas version.
country_grp = df.groupby('Country')

In [44]:
# All survey rows belonging to the 'United States of America' group, as a DataFrame.
country_grp.get_group('United States of America')

Unnamed: 0_level_0,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,TechDoc,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,,...,,,,,,,,,,
6,I code primarily as a hobby,Under 18 years old,"Student, full-time",,Apples,,Primary/elementary school,"School (i.e., University, College, etc);Online...",,,...,,,,,,,Appropriate in length,Easy,,
7,"I am not primarily a developer, but I write co...",35-44 years old,"Employed, full-time",Remote,Apples,I don’t code outside of work,"Professional degree (JD, MD, Ph.D, Ed.D, etc.)","Other online resources (e.g., videos, blogs, f...",Technical documentation;Stack Overflow;Written...,,...,,,,,,,Too long,Neither easy nor difficult,,
11,"I used to be a developer by profession, but no...",35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Other online resources ...,Technical documentation;Books;Written Tutorial...,API document(s) and/or SDK document(s);User gu...,...,25.0,10.0,0.0,15.0,0.0,0.0,Appropriate in length,Easy,,8.0
13,I am a developer by profession,35-44 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Apples,Hobby;Contribute to open-source projects;Profe...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;On the job training;Sch...,,,...,30.0,0.0,0.0,20.0,10.0,10.0,Appropriate in length,Easy,,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65257,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Bootstrapping a business;Professional developm...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",On the job training;Other online resources (e....,Technical documentation;Blogs;Written Tutorial...,API document(s) and/or SDK document(s);User gu...,...,,,,,,,,,,
65262,I am a developer by profession,18-24 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Apples,Hobby,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;Coding sessions (live or record...,,...,,,,,,,,,,
65269,I am a developer by profession,25-34 years old,"Employed, full-time;Independent contractor, fr...",Remote,Apples,Bootstrapping a business;Professional developm...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Other online resources ...,Technical documentation;Interactive tutorial;V...,API document(s) and/or SDK document(s);User gu...,...,,,,,,,,,,
65270,"I am not primarily a developer, but I write co...",45-54 years old,"Employed, full-time",Remote,Apples,Professional development or self-paced learnin...,Some college/university study without earning ...,Books / Physical media;Other online resources ...,Technical documentation;Blogs;Books;Written Tu...,API document(s) and/or SDK document(s);User gu...,...,,,,,,,,,,


ResponseId
1                                 United States of America
2        United Kingdom of Great Britain and Northern I...
3        United Kingdom of Great Britain and Northern I...
4                                                   Canada
5                                                   Norway
                               ...                        
65433                                                  NaN
65434                                                  NaN
65435                                                  NaN
65436                                              Germany
65437                                                  NaN
Name: Country, Length: 65437, dtype: object