## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.metrics import mean_squared_error
from math import sqrt
import glob
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.model_selection import train_test_split
from pandas.plotting import register_matplotlib_converters
from tensorflow import keras

## Read data from files, pickle them separately

In [None]:
# Point the glob pattern at a single month's files to pickle one month at a time.
joined_files = os.path.join("../data_raw", "sms-call-internet-mi*.txt")
# glob returns matches in arbitrary order; sort them so the concatenated row
# order (and therefore the pickle) is reproducible between runs and machines.
joined_list = sorted(glob.glob(joined_files))
if not joined_list:
    # Without this guard pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No raw data files match {joined_files!r}")

# Keep only columns 0, 1 and 7 of each tab-separated, header-less file and
# drop every row that has a missing value in any of them.
dfs = [
    pd.read_csv(
        f,
        sep='\t',
        header=None,
        usecols=[0, 1, 7],
        names=["SquareId", "TimeInterval", "InternetActivity"],
    ).dropna()
    for f in joined_list
]
df_big = pd.concat(dfs, ignore_index=True)

# Output name: 'big' = full dataset of all grids; the numeric suffix is the
# month index (01 = November 2013). Keep the suffix in sync with the files read.
df_big.to_pickle("../data_pickles/big_02.pkl")

print(len(df_big))
df_big.head(20)

## Grab specific grid data - old method

In [None]:
# Load one month of data: big_<NN>.pkl, where NN counts months from
# 01 = November 2013.
df_big = pd.read_pickle("../data_pickles/big_01.pkl")

# Isolate square 1, order by timestamp, discard the now-constant id column,
# and sum all rows that share a timestamp into a single value.
df = (
    df_big.loc[df_big['SquareId'] == 1]
    .sort_values(by='TimeInterval')
    .drop(columns='SquareId')
    .groupby('TimeInterval')
    .sum()
)
# The grouped index holds epoch timestamps in milliseconds; convert to datetimes.
df.index = pd.to_datetime(df.index, unit='ms')

# NOTE(review): input is big_01 but the output suffix is _02 - confirm the
# intended month before reusing this file.
df.to_pickle("../data_pickles/grid1_02.pkl")

print(df.shape)
df.head(20)

## Concat data

In [None]:
# Reload the per-month pickles and stack them into one frame with a fresh
# 0..n-1 index.
monthly_pickles = ("../data_pickles/big_01.pkl", "../data_pickles/big_02.pkl")
df_01, df_02 = (pd.read_pickle(path) for path in monthly_pickles)

df_all = pd.concat([df_01, df_02], ignore_index=True)

print(len(df_all))
df_all.head(20)

In [None]:
# Persist the concatenated frame; the grid-extraction cell reloads it from all.pkl.
df_all.to_pickle("../data_pickles/all.pkl")

## Grab Specific data (new)

In [4]:
def get_grid_id(row_loc, col_loc, num_rows=100, num_cols=100):
    """Return the 1-based square ID for a (row, col) position on the grid.

    IDs are laid out row by row, ``num_cols`` per row, with the last row
    (``row_loc == num_rows - 1``) holding the lowest IDs. On the default
    100x100 grid, ``(99, 0)`` maps to 1 and ``(0, 99)`` maps to 10000.

    Args:
        row_loc: Zero-based row number; row 0 holds the highest IDs.
        col_loc: Zero-based column number; IDs grow with the column.
        num_rows: Number of rows in the grid.
        num_cols: Number of columns in the grid (the ID stride between rows).

    Returns:
        The integer square ID. No range checking is performed, so
        out-of-range coordinates yield an ID outside ``1..num_rows*num_cols``
        or the ID of a different square.
    """
    # Rows are numbered in the opposite direction to the IDs, hence the flip.
    return (num_rows - row_loc - 1) * num_cols + col_loc + 1

In [6]:
# Build the time series for a single grid square from the combined pickle.
df_all = pd.read_pickle("../data_pickles/all.pkl")

# Square to extract, addressed as (row, col) = (70, 80).
grid_val = get_grid_id(70,80)

# Keep that square's rows, order by timestamp, discard the now-constant id
# column, and sum all rows that share a timestamp into a single value.
df = (
    df_all.loc[df_all['SquareId'] == grid_val]
    .sort_values(by='TimeInterval')
    .drop(columns='SquareId')
    .groupby('TimeInterval')
    .sum()
)
# The grouped index holds epoch timestamps in milliseconds; convert to datetimes.
df.index = pd.to_datetime(df.index, unit='ms')

print(df.shape)
df.head(20)

(8928, 1)


Unnamed: 0_level_0,InternetActivity
TimeInterval,Unnamed: 1_level_1
2013-10-31 23:00:00,11.028366
2013-10-31 23:10:00,11.127101
2013-10-31 23:20:00,10.892771
2013-10-31 23:30:00,8.622425
2013-10-31 23:40:00,8.009927
2013-10-31 23:50:00,8.11842
2013-11-01 00:00:00,8.02627
2013-11-01 00:10:00,8.514179
2013-11-01 00:20:00,6.833425
2013-11-01 00:30:00,6.554605


In [None]:
# Print the mean of the InternetActivity column, then plot the whole series.
print(df['InternetActivity'].mean())
# The trailing semicolon keeps the notebook from echoing the plot call's return value.
df['InternetActivity'].plot(figsize=(12,5));

In [None]:
# Save as grid<row>_<col>_all.pkl. The file name is hard-coded, so keep it in
# sync with the (row, col) pair passed to get_grid_id when df was built.
df.to_pickle("../data_pickles/grid70_80_all.pkl")