# Step1 : Import libraries

In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd

# Step2 : Working with 3GB data

### 2.1 Download the 3GB data from figshare via API

In [2]:
# API request
# Ask the figshare v2 API for the article's metadata; the "files" entry of the
# response lists each downloadable file with its name, size and download URL.
article_id = 14226968
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figshareairline_3GB/"
# requests has no default timeout, so set one to keep the cell from hanging
# forever if the API is unreachable.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of surfacing later as a
# confusing KeyError on "files".
response.raise_for_status()
data = response.json()
files = data["files"]
files

[{'id': 26844650,
  'name': 'allyears.csv.zip',
  'size': 2405908113,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26844650',
  'supplied_md5': '9e046ac05ecd2c32a256a47dd1098b81',
  'computed_md5': '9e046ac05ecd2c32a256a47dd1098b81'},
 {'id': 26863682,
  'name': 'individual_years.zip',
  'size': 1896206676,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/26863682',
  'supplied_md5': '921da748974b07b2a70bbfcc04535a77',
  'computed_md5': '921da748974b07b2a70bbfcc04535a77'},
 {'id': 27515426,
  'name': 'combined_model_data.csv.zip',
  'size': 821308997,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/files/27515426',
  'supplied_md5': '7638434c44a7d29cbb29fe200b4fd65d',
  'computed_md5': '7638434c44a7d29cbb29fe200b4fd65d'},
 {'id': 27520682,
  'name': 'combined_model_data_parti.parquet.zip',
  'size': 519743915,
  'is_link_only': False,
  'download_url': 'https://ndownloader.figshare.com/

### 2.2 Extract the zip file

In [3]:
# Download individual_years.zip into the output directory

In [4]:
%%time
files_to_dl = ["individual_years.zip"]
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

CPU times: user 8.5 s, sys: 27.6 s, total: 36.1 s
Wall time: 3min 14s


In [5]:
# Extract data into output directory

In [6]:
%%time
with zipfile.ZipFile(os.path.join(output_directory, "individual_years.zip"), 'r') as f:
    f.extractall(output_directory)

CPU times: user 17.7 s, sys: 1.4 s, total: 19.1 s
Wall time: 19.3 s


### 2.3 Combine CSV files

In [7]:
%%time
## here we are using a normal python way for merging the data 
import pandas as pd
use_cols = ["ArrDelay", "DepDelay", "Distance", "TailNum","UniqueCarrier","Origin","Dest"]
files = glob.glob('figshareairline_3GB/individual_years/*.csv')
df = pd.concat((pd.read_csv(file, index_col=0, usecols=use_cols)
                .assign(year=re.findall("[0-9]+", file)[0])
                for file in files)
              )
df.to_csv("figshareairline_3GB/combined_data.csv")

CPU times: user 2min 45s, sys: 12.5 s, total: 2min 58s
Wall time: 2min 59s


### 2.4 Check occupied space

In [8]:
%%sh
# Report the combined CSV's size on disk (-s: single total, -h: human-readable units).
du -sh figshareairline_3GB/combined_data.csv

2.9G	figshareairline_3GB/combined_data.csv


### 2.5 : Perform Simple EDA

In [9]:
%%time
df = pd.read_csv("figshareairline_3GB/combined_data.csv")
print(df[(df.year ==2004) & (df.ArrDelay >10)]["UniqueCarrier"].value_counts())

Series([], Name: UniqueCarrier, dtype: int64)
CPU times: user 24.3 s, sys: 10.6 s, total: 34.9 s
Wall time: 41 s


Summary:
| DataSize | Operating System | RAM | Processor | Is SSD | Space occupied | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|:----------:|
| 3GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |     2.9GB     |   39.6s   |
| 10GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |            |      |
| 20GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |            |      |

# Step3 : Working with 10GB data

### 3.1 Check occupied space

The 10GB Airline data can be found [here.](https://drive.google.com/file/d/1cf88O_KAWy0V2GS-WvysHkd6z0nsOSBL/view)

In [10]:
%%sh
# Report the 10GB CSV's size on disk (-s: single total, -h: human-readable units).
du -sh figshareairline_10GB/combined_10gb.csv

9.3G	figshareairline_10GB/combined_10gb.csv


### 3.2 : Perform Simple EDA

In [11]:
%%time
df = pd.read_csv("figshareairline_10GB/combined_10gb.csv")
print(df[(df.year ==2004) & (df.ArrDelay >10)]["UniqueCarrier"].value_counts())

WN    675501
DL    567003
AA    529155
MQ    391533
UA    380076
NW    375090
US    285978
XE    283401
OO    261984
OH    251397
CO    224967
EV    204897
DH    198990
HP    165816
FL    128733
AS    126561
B6     58263
TZ     53253
HA     10926
Name: UniqueCarrier, dtype: int64
CPU times: user 1min 13s, sys: 35.7 s, total: 1min 49s
Wall time: 2min 25s


Summary:
| DataSize | Operating System | RAM | Processor | Is SSD | Space occupied | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|:----------:|
| 3GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |     2.9GB     |   39.6s   |
| 10GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |    9.3GB     |   2min 22s   |
| 20GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |            |      |

# Step4 : Working with 20GB data

### 4.1 Check occupied space

The 20GB Airline data can be found [here.](https://drive.google.com/file/d/1mAqhWXpjifT1btvrK97tMv6tZ6wReYO3/view)

In [12]:
%%sh
# Report the 20GB CSV's size on disk (-s: single total, -h: human-readable units).
du -sh figshareairline_20GB/combined_20gb.csv

 19G	figshareairline_20GB/combined_20gb.csv


### 4.2 : Perform Simple EDA

In [None]:
%%time
df = pd.read_csv("figshareairline_20GB/combined_20gb.csv")
print(df[(df.year ==2004) & (df.ArrDelay >10)]["UniqueCarrier"].value_counts())

Error: 
Kernel Restarting

The kernel for Personal/big_data_analysis/big1_how_big_we_can_run.ipynb appears to have died. It will restart automatically.

Summary:
| DataSize | Operating System | RAM | Processor | Is SSD | Space occupied | Time taken |
|:-----------:|:----------------:|:---:|:---------:|:------:|:----------:|:----------:|
| 3GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |     2.9GB     |   39.6s   |
| 10GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |    9.3GB     |   2min 22s   |
| 20GB    |          MacOS        |  16GB   |    Apple M1 Pro       |   Yes     |    19GB     |   Could not run (kernel died)   |