In [45]:
import numpy as np
import pandas as pd


# Loading Data As CSV:

In [32]:
import pandas as pd
from pprint import pprint
  
# Read the entire CSV into memory as a single DataFrame.
df4 = pd.read_csv('Incidents_service.csv')
  


In [33]:
# List the column labels of the fully loaded frame.
df4.columns

Index(['ID', 'ID_status', 'active', 'count_reassign', 'count_opening',
       'count_updated', 'ID_caller', 'opened_by', 'opened_time', 'Created_by',
       'created_at', 'updated_by', 'updated_at', 'type_contact', 'location',
       'category_ID', 'user_symptom', 'impact', 'Support_group',
       'support_incharge', 'Doc_knowledge', 'confirmation_check', 'notify',
       'problem_id', 'change request'],
      dtype='object')

In [34]:
# Row count of the fully loaded frame.
len(df4)

141712

# Loading Data in Chunks:

In [35]:
# Passing chunksize makes read_csv hand back a reader object that yields the
# file in 10,000-row pieces instead of returning one DataFrame.
df5 = pd.read_csv('Incidents_service.csv', chunksize=10000)
print(df5)

<pandas.io.parsers.readers.TextFileReader object at 0x000002C2AA4A4040>


In [36]:
# A chunked reader is an iterator rather than a DataFrame, so len() is expected
# to fail on it. Catch the error and print it so the demonstration is visible
# without stopping a full "Restart & Run All".
try:
    len(df5)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: object of type 'TextFileReader' has no len()

Because we passed `chunksize`, `read_csv` returns a `TextFileReader` (an iterator over chunks) instead of a DataFrame, so it does not support DataFrame operations such as `len()` or the usual preprocessing functions.

In [37]:
# Walk through the reader one chunk at a time and print each chunk's
# (rows, columns) shape.
for data1 in df5:
    print(data1.shape)

(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(10000, 25)
(1712, 25)


# Loading Data in Chunks, Then Concatenating Them into a Single DataFrame:

In [38]:
# Open the CSV as a chunked reader (10,000 rows per chunk).
tp = pd.read_csv('Incidents_service.csv', iterator=True, chunksize=10000)

In [39]:
# Stitch all chunks from the reader into one DataFrame; ignore_index=True
# renumbers the rows with a fresh 0..n-1 index.
df6 = pd.concat(tp, ignore_index=True)

In [40]:
# Row count of the concatenated frame.
len(df6)

141712

In [42]:
# Summarize column dtypes and non-null counts of the concatenated frame.
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141712 entries, 0 to 141711
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   ID                  141712 non-null  object
 1   ID_status           141712 non-null  object
 2   active              141712 non-null  bool  
 3   count_reassign      141712 non-null  int64 
 4   count_opening       141712 non-null  int64 
 5   count_updated       141712 non-null  int64 
 6   ID_caller           141712 non-null  object
 7   opened_by           141712 non-null  object
 8   opened_time         141712 non-null  object
 9   Created_by          141712 non-null  object
 10  created_at          141712 non-null  object
 11  updated_by          141712 non-null  object
 12  updated_at          141712 non-null  object
 13  type_contact        141712 non-null  object
 14  location            141712 non-null  object
 15  category_ID         141712 non-null  object
 16  us

# Multiprocessing using pandas:

In [43]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import multiprocessing as mp

In [44]:
%%time
# Stream the CSV in 10,000-row chunks and tally the rows chunk by chunk,
# so the full table is never held in memory at once.
df7 = pd.read_csv("Incidents_service.csv", chunksize=10000)
total_length = 0
for chunk in df7:
    total_length = total_length + chunk.shape[0]
print(total_length)

141712
Wall time: 1.47 s


In [46]:
%%time
# CSV file that is read and processed chunk by chunk in this cell.
LARGE_FILE = "Incidents_service.csv"
CHUNKSIZE = 10000 # processing 10,000 rows at a time

def process_frame(df):
    """Return the number of rows in one chunk.

    Placeholder for real per-chunk work: it only measures the length of
    the object it is given.

    Parameters
    ----------
    df : sized object (a DataFrame chunk in this notebook)
        The chunk to process.

    Returns
    -------
    int
        ``len(df)``.
    """
    n_rows = len(df)
    return n_rows

if __name__ == '__main__':
    # The input is comma-separated, so parse it with read_csv; read_table
    # defaults to a tab separator and would not split the columns correctly.
    reader = pd.read_csv(LARGE_FILE, chunksize=CHUNKSIZE)

    # NOTE(review): on spawn-based platforms (e.g. Windows) worker processes
    # cannot import a function that is defined only inside a notebook cell, so
    # the tasks never complete and get() times out. That is presumably the
    # cause of the TimeoutError recorded for this cell; moving process_frame
    # into an importable .py module should avoid it. TODO confirm.

    # The context manager shuts the worker processes down even when a task
    # fails or times out, so no orphaned workers are left behind.
    with mp.Pool(4) as pool:  # use 4 processes
        # Submit every chunk up front so the workers can run concurrently.
        funclist = []
        for df in reader:
            # process each data frame
            f = pool.apply_async(process_frame, [df])
            funclist.append(f)

        # Collect the per-chunk results while the pool is still alive.
        result = 0
        for f in funclist:
            result += f.get(timeout=10)  # timeout in 10 seconds

    print (f"There are {result} rows of data")

TimeoutError: 

# Dask Instead of Pandas:

In [47]:
import dask.dataframe as dd
# Let Dask infer column types from the file. The dtype override that used to be
# here named a column ('MachineHoursCurrentMeter') that this dataset does not
# contain, so it was dead configuration. assume_missing=True is kept, so
# integer-looking columns are still read as floats.
data = dd.read_csv("Incidents_service.csv", assume_missing=True)
# compute() materializes the whole table as an in-memory pandas DataFrame.
data.compute()

Unnamed: 0,ID,ID_status,active,count_reassign,count_opening,count_updated,ID_caller,opened_by,opened_time,Created_by,...,category_ID,user_symptom,impact,Support_group,support_incharge,Doc_knowledge,confirmation_check,notify,problem_id,change request
0,INC0000045,New,True,0.0,0.0,0.0,Caller 2403,Opened by 8,29-02-2016 01:16,Created by 6,...,Category 55,Symptom 72,2 - Medium,Group 56,Resolver 17,True,False,Do Not Notify,?,?
1,INC0000045,Resolved,True,0.0,0.0,3.0,Caller 2403,Opened by 8,29-02-2016 01:16,Created by 6,...,Category 55,Symptom 72,2 - Medium,Group 56,Resolver 17,True,False,Do Not Notify,?,?
2,INC0000045,Closed,False,0.0,0.0,4.0,Caller 2403,Opened by 8,29-02-2016 01:16,Created by 6,...,Category 55,Symptom 72,2 - Medium,Group 56,Resolver 17,True,False,Do Not Notify,?,?
3,INC0000047,Active,True,1.0,0.0,1.0,Caller 2403,Opened by 397,29-02-2016 04:40,Created by 171,...,Category 40,Symptom 471,2 - Medium,Group 24,Resolver 31,True,False,Do Not Notify,?,?
4,INC0000047,Active,True,1.0,0.0,2.0,Caller 2403,Opened by 397,29-02-2016 04:40,Created by 171,...,Category 40,Symptom 471,2 - Medium,Group 24,Resolver 31,True,False,Do Not Notify,?,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141707,INC0119457,Closed,False,0.0,0.0,1.0,Caller 2403,Opened by 508,13-02-2017 11:31,Created by 10,...,Category 50,Symptom 533,3 - Low,Group 70,Resolver 17,False,False,Do Not Notify,?,?
141708,INC0119983,New,True,0.0,0.0,0.0,Caller 831,Opened by 508,14-02-2017 11:11,Created by 10,...,Category 50,Symptom 533,3 - Low,Group 70,Resolver 17,False,False,Do Not Notify,?,?
141709,INC0120303,New,True,0.0,0.0,0.0,Caller 1866,Opened by 508,15-02-2017 01:52,Created by 10,...,Category 50,Symptom 533,3 - Low,Group 70,Resolver 17,False,False,Do Not Notify,?,?
141710,INC0120319,New,True,0.0,0.0,0.0,Caller 1899,Opened by 508,15-02-2017 07:09,Created by 10,...,Category 50,Symptom 533,3 - Low,Group 70,Resolver 17,False,False,Do Not Notify,?,?
