<img src="https://i.imgur.com/6U6q5jQ.png"/>

# DATA FRAMES IN R and Python

In [None]:
%load_ext rpy2.ipython

**Data frames** are more complex containers of values: they arrange data in rows and columns, and each column can hold a different type of value. The most common analogy is a spreadsheet.

## 1. Creating

In [None]:
import pandas as pd

# one list per column; position i in every list describes the same student
namesP=["Qing", "Françoise", "Raúl", "Bjork","Marie"]
agesP=[32,33,28,30,29]
educationP=["Bach", "Bach", "Master", "PhD","PhD"]
countryP=["China", "Senegal", "España", "Norway","Korea"]

# the dict keys become the column names, in this order
classroomP={
    "student":namesP,
    "age":agesP,
    "edu":educationP,
    "country":countryP,
}

# our data frame, built from the dict of lists:
studentsP=pd.DataFrame(classroomP)
# see it:
studentsP


In [None]:

%%R
# one vector per column; position i in every vector describes the same student
namesR=c("Qing", "Françoise", "Raúl", "Bjork","Marie")
agesR=c(32,33,28,30,29)
countryR=c("China", "Senegal", "España", "Norway","Korea")
educationR=c("Bach", "Bach", "Master", "PhD","PhD")

# the list names become the column names
classroomR=list(student=namesR,age=agesR,edu=educationR,country=countryR)

# convert the list directly: going through cbind() would first build a character
# matrix, turning the numeric ages into text
studentsR=as.data.frame(classroomR,stringsAsFactors=FALSE)

# see it:
studentsR



## 2. Accessing

In [None]:
studentsP.iloc[:,0] # "iloc" selects by integer position: ":" means 'all rows', 0 is the first column

In [None]:
%%R
studentsR[,1] # leaving the row slot empty means 'all rows'; 1 is the first column

In [None]:
studentsP.iloc[:,[1,2]] # several positions go in a list (here: second and third columns)

In [None]:
%%R
studentsR[,c(2,3)] # several positions go in a vector (here: second and third columns)

In [None]:
studentsP.loc[:,'student'] # "loc" selects by label (not by position)

In [None]:
%%R
studentsR[,'student'] # column labels are also valid inside R's brackets

In [None]:
# several labels go in a list
studentsP.loc[:,['student','edu']]

In [None]:
%%R
# several labels go in a vector
studentsR[,c('student','edu')]

In [None]:
studentsP.student # one column in pandas, using dot (attribute) notation

In [None]:
%%R
studentsR$student # one column in R: use $ followed by the column name

In [None]:
studentsP.loc[2,'student'] # one cell: row labeled 2, column labeled 'student'

In [None]:
studentsP.iloc[2,0] # one cell: row at position 2, column at position 0 (the 'student' column)

In [None]:
%%R
# one cell: third row, column 'student' (R counts from 1, so row 3 here is position 2 in Python)
studentsR[3,'student']

In [None]:
# several row labels and several column labels, each in its own list
studentsP.loc[[2,4],['student','edu']]

In [None]:
%%R
# several row positions and several column labels, each in its own vector
studentsR[c(3,5),c('student','edu')]

## 3. Replacing

In [None]:
# overwrite one cell: row labeled 2, column 'student'
studentsP.loc[2,'student']='Lito'
# see the change:
studentsP

In [None]:
%%R

# overwrite one cell: third row, column 'student'
studentsR[3,'student']='Lito'
# see the change:
studentsR

In [None]:
# overwrite several cells at once: one new value per selected row
studentsP.loc[[2,4],'age']=[32,31]
# see the change:
studentsP

In [None]:
%%R

# overwrite several cells at once: one new value per selected row
studentsR[c(3,5),'age']=c(32,31)
# see the change:
studentsR

## 4. Deleting

In [None]:
# make a copy to experiment on, so the deletions below are not applied to studentsP
studentsP_new=studentsP.copy()

In [None]:
%%R
# make a copy under a new name; the deletions below are applied to studentsR_new
studentsR_new=studentsR

### Deleting rows


In [None]:
# index labels of the rows to delete
byeRows=[2,3]
studentsP_new.drop(index=byeRows,inplace=True) # inplace=True changes studentsP_new immediately (no reassignment needed)
# then
studentsP_new

As you can see, the index labels of the deleted rows disappeared, so the index is no longer consecutive. You should therefore reset the index:

In [None]:
# drop=True discards the old index labels instead of keeping them as a new column
studentsP_new.reset_index(drop=True,inplace=True)
# then
studentsP_new

In [None]:
%%R
# positions of the rows to delete
byeRows=c(3,4)
studentsR_new=studentsR_new[-byeRows,] # negative positions exclude rows; the result is reassigned (NO 'inplace' in R)
# then
studentsR_new

In [None]:
%%R
# reset the row names so the rows are numbered consecutively again
row.names(studentsR_new)=NULL
# then
studentsR_new

### Deleting columns

In [None]:
# This is what you want to get rid of:
byeColumns=['edu'] # you can delete more than one

# drop those columns in place
studentsP_new.drop(columns=byeColumns,inplace=True)
# then
studentsP_new

In [None]:
%%R

byeColumns=c('edu') # this doesn't work: studentsR_new[,-byeColumns] (names cannot be negated)
# keep every column whose name is not listed in byeColumns
keepCols=setdiff(names(studentsR_new),byeColumns)
# drop=FALSE keeps the result a data frame even if only one column is left
studentsR_new=studentsR_new[,keepCols,drop=FALSE]
# then
studentsR_new


### Deleting cells

In [None]:
# "deleting" a cell means replacing its value with a missing value
studentsP_new.loc[2,'country']=pd.NA
# then
studentsP_new


In [None]:
%%R
# "deleting" a cell means replacing its value with a missing value (NA)
studentsR_new[3,'country']=NA
# then
studentsR_new

## Inserting

In [None]:
# the data frame as it currently stands, before inserting anything
studentsP

In [None]:
# one logical value per student (row)
femaleP=[True,True,False,False,True]
# assign() gives back a data frame with the extra column; the result is stored under a new name
studentsP1=studentsP.assign(female=femaleP)
# then
studentsP1

In [None]:
# another way: copy first, then create the column with bracket notation
studentsP2=studentsP.copy()
studentsP2['female']=femaleP
# then
studentsP2

In [None]:
# yet another way: copy first, then create the column through .loc
studentsP3=studentsP.copy()
studentsP3.loc[:,'female']=femaleP
studentsP3

In [None]:
%%R
# one logical value per student (row); TRUE/FALSE are reserved words, whereas the
# shortcuts T and F are ordinary variables that can be overwritten
femaleR=c(TRUE,TRUE,FALSE,FALSE,TRUE)
# cbind() returns a data frame with the extra column; the result is stored under a new name
studentsR1=cbind(studentsR,female=femaleR)
studentsR1

In [None]:
%%R
# another way: copy first, then create the column with $
studentsR2=studentsR
studentsR2$female=femaleR
studentsR2

In [None]:
%%R
# yet another way: copy first, then create the column with bracket notation
studentsR3=studentsR
studentsR3[,'female']=femaleR
studentsR3

## Other basic operations

In [None]:
# type of the data structure: list? tuple? data frame?
type(studentsP)

In [None]:
%%R
# class of the data structure
class(studentsR)

In [None]:
# data type of each column in the data frame
studentsP.info()

In [None]:
%%R
# details of the data frame (note: the %%R cell magic must be the first line of the cell)

str(studentsR)

In [None]:
# number of rows and columns, in that order
studentsP.shape

In [None]:
%%R
# number of rows and columns, in that order
dim(studentsR)

In [None]:
# number of rows: len() of a pandas data frame counts its rows
len(studentsP)

In [None]:
%%R

# number of rows: use nrow(); length() of a data frame counts its columns instead
nrow(studentsR)

In [None]:
# first two rows
studentsP.head(2) # compare with the last two rows: studentsP.tail(2)

In [None]:
%%R
# first two rows
head(studentsR,2) # compare with the last two rows: tail(studentsR,2)

In [None]:
# names of the columns
studentsP.columns

In [None]:
%%R
# names of the columns
names(studentsR)

## Queries

In [None]:
# set the age (column position 1) of the first student (row position 0) to 33
studentsP1.iloc[0,1]=33
studentsP1

In [None]:
# who is the oldest?
# the comparison builds one True/False per row; the brackets keep the rows marked True
studentsP1[studentsP1.age==max(studentsP1.age)]

In [None]:
# names only: filter the rows and pick the 'student' column in a single .loc call
studentsP1.loc[studentsP1.age==studentsP1.age.max(),'student']

In [None]:
%%R
# set the age (second column) of the first student (first row) to 33
studentsR1[1,2]=33
studentsR1


In [None]:
%%R

# who is the oldest?
# which.max() gives a single row position (the first maximum it finds)
studentsR1[which.max(studentsR1$age),]

In [None]:
%%R
# comparing against the maximum keeps every row that reaches it
studentsR1[studentsR1$age==max(studentsR1$age),]

In [None]:
%%R
# names only: same row filter, but requesting just the 'student' column
studentsR1[studentsR1$age==max(studentsR1$age),'student']

In [None]:
# who has a PhD?
# keep the rows whose 'edu' value equals 'PhD'
studentsP1[studentsP1.edu=='PhD']

In [None]:
%%R
# who has a PhD? keep the rows whose 'edu' value equals 'PhD'
studentsR1[studentsR1$edu=='PhD',]

In [None]:
# who has a PhD or a Master? isin() checks membership in a list of values
studentsP1[studentsP1.edu.isin(['PhD','Master'])]

In [None]:
%%R
# who has a PhD or a Master? %in% checks membership in a vector of values
studentsR1[studentsR1$edu %in% c('PhD','Master'),]

In [None]:
# who does not have a PhD or a Master? "~" negates the True/False values
studentsP1[~studentsP1.edu.isin(['PhD','Master'])]

In [None]:
%%R
# who does not have a PhD or a Master? "!" negates the result (%in% is evaluated before "!")
studentsR1[!studentsR1$edu %in% c('PhD','Master'),]

In [None]:
# the youngest female, step 1: keep only the female students
# (the 'female' column already holds True/False, so it works directly as a filter)
studentsP1[studentsP1.female]

In [None]:
# sort the females by age (ascending), then take row position 0, column position 0:
# the 'student' value of the youngest female
studentsP1[studentsP1.female].sort_values(by=['age'],ascending=True).iloc[0,0]

In [None]:
%%R
# keep only the female students (the 'female' column already holds TRUE/FALSE)
studentsR1[studentsR1$female,]

In [None]:
%%R
# filter first, so the ordering is computed on the females' own ages
femalesR=studentsR1[studentsR1$female,]
# order() goes in the ROW slot (note the comma); ascending order puts the youngest first
head(femalesR[order(femalesR$age),],1)

In [None]:
# the parentheses are required: "&" binds tighter than "==" in Python
# the minimum age is computed among females only, because the overall youngest
# student may not be female
studentsP1[studentsP1.female & (studentsP1.age==studentsP1.loc[studentsP1.female,'age'].min())]

In [None]:
# alternative in two steps: first store the female students in their own data frame
femdf=studentsP1[studentsP1.female]
femdf

In [None]:
# then keep the rows whose age equals the minimum age within that subset
femdf[femdf.age==femdf.age.min()]