In [72]:
library(regclass)
library(ppcor)
library(tseries)
library(dynlm)
library(lmtest)


Attaching package: ‘lmtest’


The following object is masked from ‘package:VGAM’:

    lrtest




In [137]:
# return the R squared value of guo ~ guo_history + yin_history
self_autoregression <- function(x,y,lags){
    n <- length(x)
    x_hist <- x[1:(n-lags)]
    y_hist <- y[1:(n-lags)]
    Y <- y[(lags+1):n]
    m <- lm(Y~x_hist+y_hist+0)
    return (summary(m)$r.squared)
}

# select best time lags
select_time_lags <- function(z,thres = 0.1){
    pacf = pacf(z,20,,plot = FALSE)
    if(thres==0.1){
        n <- pacf$n.used
        thres <- 1.96/sqrt(n)
    }
    for (i in c(1:20)){
        if (abs(pacf$acf[i])<thres){
            return (i)
        }
    }
    return (-1)
}

# a helper function to remove an element with certain value from a list
remove <- function(arr,value){
    for(i in c(1:length(arr))){
        if(arr[i]==value){
            r_arr <- arr[-i]
            return (r_arr)
        }
    }
    print("remove failure")
    cat(arr," ",value,"\n")
    return (arr)
}

filter_cause <- function(X,y, thres = 0.15){
    n <- dim(X)[2]
    lags <- c()
    R_squared = c()
    for (i in c(1:n)){
        lag = select_time_lags(X[,i])
        r_sq <- self_autoregression(X[,i],y,lag)
        R_squared = append(R_squared,c(r_sq))
    }
#     cat("R2: ",R_squared)

    cause_indices <- c()
    remain <- c(1:n)
    while(length(remain)>0){
        
        j <- which.max(R_squared)
        cause_indices <- append(cause_indices,c(j))
        remain <- remove(remain,j)
        R_squared[j] <- 0
        
        # pruning
        to_remove <- c()
        for(i in remain){
            x=X[,i]
            
            test <- lm(x~as.matrix(X[,cause_indices]))
            r2 <- summary(test)$r.squared
            VIF <- 1/(1-r2)
            if (VIF>10) {
                to_remove <- append(to_remove,c(i))
                R_squared[i] <- 0
            } else{ 
            
            df <- data.frame(x,y,X[,cause_indices])
#             df <- data.frame(x=X[,j],y=y,z=X[,i])
#             pcor <- pcor(df)$estimate[1,2]
#             cat(i,pcor)
            
            
            # pruning based on Granger residual test on residuals
#             cause_fit <- lm(X[,i]~as.matrix(X[,cause_indices]))
#             cause_res <- x - cause_fit$fitted.values
#             effect_fit <- lm(y~as.matrix(X[,cause_indices]))
#             effect_res <- y - effect_fit$fitted.values
#             lag <- select_time_lags(cause_res)
#             p_value <- grangertest(cause_res,effect_res,order = lag)[2,4]
            x = ts(x)
            y = ts(y)
            for (j in cause_indices){
                lag <- select_time_lags(X[,j])
                X_j = ts(X[,j])
                if(j==cause_indices[1]){
                    cause_fit <- dynlm(x ~ L(X_j,-lag:-1))
                    effect_fit <- dynlm(y ~ L(X_j,1:lag))
                } else {
                    cause_fit <- update(cause_fit, . ~ . + L(X_j,-lag:-1))
                    effect_fit <- update(effect_fit, . ~ . + L(X_j,1:lag))
                }
            }
            cause_res <- x - cause_fit$fitted.values
            effect_res <- y - effect_fit$fitted.values
            lag <- select_time_lags(cause_res)
            p_value <- grangertest(cause_res,effect_res,order = lag)[2,4]
            
            if(p_value<=thres){
                #cat(i," ",pcor,"\n")
                to_remove <- append(to_remove,c(i))
                R_squared[i] <- 0
            }
                }
            
        }
        
        for(rm in to_remove){
            remain <- remove(remain,rm)
        }
    }
    return (cause_indices)
}

In [138]:
patterns = c('Food & Snack', 'Grocery & Market', 'History, Museum & Arts', 'Hotel', 'Indoor Entertainment', 'Music', 'Office', 'Outdoor Entertainment', 'Place for Socializing', 'Place for Sports', 'Residence', 'Restaurant', 'School', 'Tobacco & Alcohol', 'Transportation')
cities = c('amsterdam', 'barcelona', 'berlin', 'helsinki', 'london', 'moscow', 'newyork', 'paris', 'prague', 'rome', 'stockholm')

In [142]:
for (city in cities){
print("new iter")
print(city)
result = ""
for (pattern in patterns){
    direct <- ""
    a <- "/Users/suhong/Desktop/UROP_Data/causality/city_granger/"
    c <- ".csv"
    path <- paste(a,city,"_",pattern,c,sep="")
    data <- read.csv(path)
    data <- data[1:287,-1]
    n <- dim(data)[2]
    
    if (!is.null(n)){
    X = data[1:287,-n]
    y = data[1:287,n]
#     for (i in 1:n){
#     p <- adf.test(data[1:287,i])$p.value
#     if(p>0.05){
#         print(pattern)
#         print(i)
#         print("oh no!")
#     }
# }
    if(dim(data)[2]==2){direct=c(1)}
    else{direct <- filter_cause(X,y,thres= 0.05)}

   
    s <- paste(";",pattern,":",sep = "")
    cat(s,direct,"")

} else {
        s <- paste(";",pattern,":",sep = "")
    cat(s,direct,"")
    }

}
    }

[1] "new iter"
[1] "amsterdam"
;Food & Snack: 2 ;Grocery & Market:  ;History, Museum & Arts: 4 2 ;Hotel: 1 ;Indoor Entertainment: 1 ;Music: 2 ;Office: 1 ;Outdoor Entertainment: 2 ;Place for Socializing: 1 2 ;Place for Sports: 1 ;Residence:  ;Restaurant: 3 ;School: 1 ;Tobacco & Alcohol: 1 ;Transportation: 1 [1] "new iter"
[1] "barcelona"
;Food & Snack: 2 3 ;Grocery & Market: 1 ;History, Museum & Arts: 1 ;Hotel: 2 1 ;Indoor Entertainment: 1 2 ;Music: 1 ;Office: 1 ;Outdoor Entertainment:  ;Place for Socializing: 2 ;Place for Sports: 1 ;Residence: 2 ;Restaurant: 3 ;School: 2 3 ;Tobacco & Alcohol: 3 1 ;Transportation: 3 [1] "new iter"
[1] "berlin"
;Food & Snack: 1 ;Grocery & Market: 1 ;History, Museum & Arts: 3 1 ;Hotel: 2 1 ;Indoor Entertainment: 1 3 ;Music: 2 1 ;Office: 2 ;Outdoor Entertainment: 1 ;Place for Socializing: 4 ;Place for Sports: 1 4 ;Residence:  ;Restaurant: 2 1 ;School:  ;Tobacco & Alcohol: 2 3 ;Transportation:  [1] "new iter"
[1] "helsinki"
;Food & Snack: 1 ;Grocery & Marke

In [102]:
city = "amsterdam"
result = ""
for (pattern in patterns){
    direct <- ""
    a <- "/Users/suhong/Desktop/UROP_Data/causality/city_granger/"
    c <- ".csv"
    path <- paste(a,city,"_",pattern,c,sep="")
    data <- read.csv(path)
    data <- data[1:287,-1]
    n <- dim(data)[2]
    
    if (!is.null(n)){
    X = data[1:287,-n]
    y = data[1:287,n]
#     for (i in 1:n){
#     p <- adf.test(data[1:287,i])$p.value
#     if(p>0.05){
#         print(pattern)
#         print(i)
#         print("oh no!")
#     }
# }
    if(dim(data)[2]==2){direct=c(1)}
    else{direct <- filter_cause(X,y,thres= 0.1)}

   
    s <- paste(";",pattern,":",sep = "")
    cat(s,direct,"")

} else {
        s <- paste(";",":",sep = "")
    cat(s,direct,"")
    }

}

;Food & Snack: 1 ;Grocery & Market: 1 ;History, Museum & Arts: 3 6 4 2 ;Hotel: 1 2 ;Indoor Entertainment: 3 1 2 ;Music: 4 2 ;Office: 3 1 ;Outdoor Entertainment: 2 ;Place for Socializing: 1 2 ;Place for Sports: 2 ;Residence: 2 ;Restaurant: 4 5 3 6 ;School: 1 ;Tobacco & Alcohol: 1 2 ;Transportation: 2 

In [133]:
result = ""
city = "barcelona"
pattern = "Hotel"
    direct <- ""
    a <- "/Users/suhong/Desktop/UROP_Data/causality/city_granger/"
    c <- ".csv"
    path <- paste(a,city,"_",pattern,c,sep="")
    data <- read.csv(path)
    data <- data[1:287,-1]
    n <- dim(data)[2]
    
#     if(!is.null(n)){
    X = data[1:287,-n]
    y = data[1:287,n]



In [134]:
head(X)

Unnamed: 0_level_0,Office,School,Transportation
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
1,-2,1,-1
2,1,0,1
3,0,0,-4
4,0,0,2
5,2,1,0
6,-3,-2,-3


In [135]:
cause_indices = c(1,3)
x = ts(X[,2])
            y = ts(y)
            for (j in cause_indices){
                lag <- select_time_lags(X[,j])
                X_j = ts(X[,j])
                if(j==cause_indices[1]){
                    cause_fit <- dynlm(x ~ L(X_j,-lag:-1))
                    effect_fit <- dynlm(y ~ L(X_j,1:lag))
                } else {
                    cause_fit <- update(cause_fit, . ~ . + L(X_j,-lag:-1))
                    effect_fit <- update(effect_fit, . ~ . + L(X_j,1:lag))
                }
            }
            cause_res <- x - cause_fit$fitted.values
            effect_res <- y - effect_fit$fitted.values
            lag <- select_time_lags(cause_res)
#             p_value <- grangertest(cause_res,effect_res,order = lag)[2,4]




In [136]:
lag