In [1]:
# Recreate the small "people" dataset used in the previous video.

import pandas as pd

people = {
    "first": ["srinu", "sow", "abc"],
    "last": ["balireddy", "yalla", "abc"],
    "email": ["abc@abc.com", "kbc@kbc.com", "abc@abc.com"],
}

# Each dictionary key becomes a column; each list supplies that column's values.
df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email
0,srinu,balireddy,abc@abc.com
1,sow,yalla,kbc@kbc.com
2,abc,abc,abc@abc.com


In [2]:
# In the result above, 0, 1, 2 are the default integer identifiers of the rows.
# It is more meaningful to use an existing column as the index, so that each
# row gets a label (labels are usually unique).
# pandas does NOT enforce index uniqueness by default: 'abc@abc.com' appears
# twice in the email column, so the resulting index contains a duplicate label.

# Use the email column as the index. This returns a new DataFrame; df itself
# is left unchanged.

df.set_index('email')

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
abc@abc.com,srinu,balireddy
kbc@kbc.com,sow,yalla
abc@abc.com,abc,abc


In [3]:
# Displaying df again at this point would still show the old 0, 1, 2 index,
# because set_index returns a new DataFrame instead of modifying df.
# Pass inplace=True explicitly to change df itself.
# NOTE: once this has run, 'email' is no longer a column, so re-running this
# cell without restoring the column first (e.g. via reset_index) raises KeyError.

df.set_index("email", inplace=True)
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
abc@abc.com,srinu,balireddy
kbc@kbc.com,sow,yalla
abc@abc.com,abc,abc


In [4]:
# Display the index labels of df (the email values, after the set_index call above).

df.index

Index(['abc@abc.com', 'kbc@kbc.com', 'abc@abc.com'], dtype='object', name='email')

In [6]:
# A row can now be looked up by its label with .loc.
# Because 'abc@abc.com' appears twice in the index, this returns a DataFrame
# containing both matching rows rather than a single row.

df.loc['abc@abc.com']

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
abc@abc.com,srinu,balireddy
abc@abc.com,abc,abc


In [7]:
# Select by row label and column label together: .loc[row_label, column_label].
# With the duplicated row label, the result is a Series of the matching 'last' values.

df.loc['abc@abc.com','last']

email
abc@abc.com    balireddy
abc@abc.com          abc
Name: last, dtype: object

In [14]:
# The integer labels 0, 1, 2 no longer exist in the index, so a label-based
# lookup with .loc[0] fails. The error is caught and printed so that it is
# visible as the demonstration while the rest of the cell still runs.
# Recent pandas versions raise KeyError here; older releases raised TypeError.

try:
    df.loc[0]
except (KeyError, TypeError) as err:
    print(f"{type(err).__name__}: {err}")


# Position-based lookup with .iloc still works, whatever the index labels are.

df.iloc[0]



first        srinu
last     balireddy
Name: abc@abc.com, dtype: object

In [15]:
# To undo the custom index, reset_index moves 'email' back into a regular
# column and restores the default 0, 1, 2 integer index.
# inplace=True applies the change to df itself.

df.reset_index(inplace=True)
df

Unnamed: 0,email,first,last
0,abc@abc.com,srinu,balireddy
1,kbc@kbc.com,sow,yalla
2,abc@abc.com,abc,abc


In [20]:
# Back to the Stack Overflow survey dataset: the CSV loading code and the
# display settings are consolidated in this one cell.

import pandas as pd

# Survey responses, plus the schema that describes each survey column.
df = pd.read_csv('data/survey_results_public.csv')
schema_df = pd.read_csv('data/survey_results_schema.csv')

# Cap how many columns and rows are rendered when a frame is displayed.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 10)

In [21]:
# Preview the first rows of the survey responses (default integer index).
df.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,...,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,...,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,...,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,...,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,...,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",...,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


In [25]:
# Instead of calling set_index after loading, the index can be set directly
# while reading the file into df, via the index_col argument.
# The Stack Overflow data has a Respondent column, used here as the index
# (presumably unique per respondent -- not verified in this notebook).

df        = pd.read_csv('data/survey_results_public.csv', index_col="Respondent")

In [26]:
df.head()

# The default 0, 1, 2, ... index is gone; rows are now labelled by Respondent.

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,...,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",...,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",...,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,...,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,...,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,...,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


In [27]:
# To look up what a survey column means without scanning schema_df by hand,
# index schema_df by its "Column" field; a column name from df can then be
# used directly as a row label.

schema_df = pd.read_csv('data/survey_results_schema.csv', index_col="Column")

schema_df.loc["Hobbyist"]

QuestionText    Do you code as a hobby?
Name: Hobbyist, dtype: object

In [31]:
# Selecting a whole schema row returns a Series, and its display can truncate
# long question text. Selecting the single QuestionText cell returns the
# complete string instead.

schema_df.loc["Hobbyist","QuestionText"]

'Do you code as a hobby?'

In [34]:
# Sort the schema by its index labels.
# sort_index() returns a new, sorted DataFrame (ascending by default) and leaves
# the original untouched, so a bare call in the middle of a cell is computed and
# then thrown away. The descending result is kept by assigning it back to
# schema_df.

schema_df = schema_df.sort_index(ascending=False)

In [35]:
# Show the schema, now ordered by column name in descending order.
schema_df

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
YearsCodePro,How many years have you coded professionally (...
YearsCode,"Including any education, how many years have y..."
WorkWeekHrs,"On average, how many hours per week do you work?"
WorkRemote,How often do you work remotely?
WorkPlan,How structured or planned is your work?
...,...
BlockchainOrg,How is your organization thinking about or imp...
BlockchainIs,Blockchain / cryptocurrency technology is prim...
BetterLife,Do you think people born today will have a bet...
Age1stCode,At what age did you write your first line of c...
