In [1]:
import pandas as pd

# pd.read_csv loads a CSV file into a DataFrame.
# index_col=0 tells read_csv to use the first column of the file as the row
# index. The column is not discarded: it becomes the index instead of a
# regular data column.
df = pd.read_csv('data/mpg.csv', index_col=0)

# Label the column axis; pandas shows this name in the table header.
# (Set df.index.name instead to name the row index itself.)
df.columns.name = 'Serial No.'

# Only the last expression of a cell is rendered, so the preview goes last.
df.head()   # returns the first five rows

In [2]:
# Load the census data and give the default row index a label.
dfcen = pd.read_csv('data/census.csv')
dfcen.index.name = 'Serial No.'

# Normalise the header: first letter upper-case, the rest lower-case.
dfcen.columns = list(map(str.capitalize, dfcen.columns))
dfcen.head()

# --- Manipulating the index ---
# reset_index() moves the current index back into the regular columns, and
# set_index() promotes one or more columns to be the index.
# Both return a new DataFrame; the two calls below are illustrations only,
# their results are discarded and dfcen itself is left unchanged.
dfcen.reset_index()
dfcen.set_index(['Stname', 'Ctyname'])

# With a two-level (state, county) index, individual counties can be looked
# up by passing (state, county) tuples to .loc.
dfcen = dfcen.set_index(['Stname', 'Ctyname'])
county_keys = [
    ('Alabama', 'Autauga County'),
    ('Wyoming', 'Sweetwater County'),
]
dfcen.loc[county_keys]  # .loc selects rows by index label


Unnamed: 0_level_0,Unnamed: 1_level_0,Sumlev,Region,Division,State,County,Census2010pop,Estimatesbase2010,Popestimate2010,Popestimate2011,Popestimate2012,...,Rdomesticmig2011,Rdomesticmig2012,Rdomesticmig2013,Rdomesticmig2014,Rdomesticmig2015,Rnetmig2011,Rnetmig2012,Rnetmig2013,Rnetmig2014,Rnetmig2015
Stname,Ctyname,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50,3,6,1,1,54571,54571,54660,55253,55175,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Wyoming,Sweetwater County,50,4,8,56,37,43806,43806,43593,44041,45104,...,1.072643,16.243199,-5.339774,-14.252889,-14.248864,1.255221,16.243199,-5.29546,-14.075283,-14.070195


### Missing values in pandas dataframes

In [3]:
# Load the grades table and label its row index.
grades_path = 'data/grades.csv'
dfgrades = pd.read_csv(grades_path)
dfgrades.index.name = 'Serial Number'

# Boolean frame of the same shape: True marks a missing value.
grades_null_mask = dfgrades.isnull()
grades_null_mask.head()

Unnamed: 0_level_0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
Serial Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False


In [41]:
dflog = pd.read_csv('data/log.csv')
dflog.head(10)

# isnull() returns a boolean DataFrame of the same shape as the original:
# True wherever a value is missing (NaN or None), False otherwise.
# The result can be used as a mask.
maskex = dflog.isnull()
maskex.head()

# dropna() drops missing data. With its default arguments it removes every
# row that contains at least one missing value, so a single NaN is enough
# to lose the whole row.
dflog.dropna()

# fillna() fills the missing values instead of dropping them; here every
# missing entry is replaced by the scalar 'Missing'.
dflog.fillna('Missing').head(8)


# replace() can also match values with a regular expression.
# 'to_replace' is the pattern to look for, 'value' is the replacement given
# to matching strings, and 'regex=True' makes pandas treat 'to_replace' as a
# regular expression. The pattern is written as a raw string so that '\S'
# reaches the regex engine as-is and is not an invalid string escape.
dflog.replace(to_replace=r'\S.*html', value='webpage', regex=True).head(8)


# Without regex, the first argument is the value to be replaced and the
# second argument is the new value.
# NOTE: replace() (like dropna() and fillna() above) returns a new DataFrame
# and leaves dflog untouched. The result is not assigned here, so the
# dflog.head() below still shows the original volume of 10.0.
dflog.replace(10.0, 100.0)
dflog.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


When you use statistical functions (such as `mean()` or `sum()`) on a DataFrame, pandas ignores the missing values by default (`skipna=True`) while doing the calculations.

### Basic Data Cleaning Process

In [64]:
import re

hf = 'George Washington'

# Split a full name into named groups 'first' and 'last', separated by a
# whitespace character. The pattern is a raw string so that '\s' is passed
# to the regex engine unchanged instead of being an invalid string escape.
item = re.finditer(r'(?P<first>.+)\s(?P<last>.*)', hf)

# groupdict() maps each named group to the text it matched.
lsr = [match.groupdict() for match in item]
print(lsr)

[{'first': 'George', 'last': 'Washington'}]


In [83]:
# One record per student; the row labels are the students' majors.
student_records = [
    {'Name': 'Alice', 'Age': 20, 'Gender': 'F'},
    {'Name': 'Jack', 'Age': 22, 'Gender': 'M'},
]
major_labels = ['Mathematics', 'Sociology']

df = pd.DataFrame(student_records, index=major_labels)
df.columns.name = '(Major)'

# Transposing turns the majors into columns, so a single major can be
# selected with ordinary column access.
by_major = df.T
by_major['Mathematics']

(Major)
Name      Alice
Age          20
Gender        F
Name: Mathematics, dtype: object