In [4]:
import pandas as pd

### Limitations of to_json():

- Serialization Time: Converting a massive DataFrame to JSON is CPU-intensive.  
- Memory Overhead: to_json() requires the entire object to fit into memory before converting it to a JSON string.  
- Output Size: The resulting JSON string could be several GBs in size, depending on the data.  
- Writing this JSON string to disk can also take a long time.

In [5]:
data = pd.read_csv(r'..\network_data\phishingData.csv')
data.reset_index(drop=True,inplace=True)
data.head()

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,1,3,1,5,72,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,2,3,1,3,144,0,0,0,0,2,...,0,0,0,1,-1,1,1,1,1,1
2,3,3,1,2,58,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,4,3,1,6,79,1,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,5,3,0,4,46,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


In [6]:
data['CLASS_LABEL'].value_counts()

CLASS_LABEL
1    5000
0    5000
Name: count, dtype: int64

In [10]:
data[data.isnull().any(axis=1)]

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL


In [10]:
ans = data.iloc[0:2,:].T.to_json()
print(type(ans))
print(ans)

<class 'str'>
{"0":{"id":1.0,"NumDots":3.0,"SubdomainLevel":1.0,"PathLevel":5.0,"UrlLength":72.0,"NumDash":0.0,"NumDashInHostname":0.0,"AtSymbol":0.0,"TildeSymbol":0.0,"NumUnderscore":0.0,"NumPercent":0.0,"NumQueryComponents":0.0,"NumAmpersand":0.0,"NumHash":0.0,"NumNumericChars":0.0,"NoHttps":1.0,"RandomString":0.0,"IpAddress":0.0,"DomainInSubdomains":0.0,"DomainInPaths":0.0,"HttpsInHostname":0.0,"HostnameLength":21.0,"PathLength":44.0,"QueryLength":0.0,"DoubleSlashInPath":0.0,"NumSensitiveWords":0.0,"EmbeddedBrandName":0.0,"PctExtHyperlinks":0.0,"PctExtResourceUrls":0.25,"ExtFavicon":1.0,"InsecureForms":1.0,"RelativeFormAction":0.0,"ExtFormAction":0.0,"AbnormalFormAction":0.0,"PctNullSelfRedirectHyperlinks":0.0,"FrequentDomainNameMismatch":0.0,"FakeLinkInStatusBar":0.0,"RightClickDisabled":0.0,"PopUpWindow":0.0,"SubmitInfoToEmail":0.0,"IframeOrFrame":0.0,"MissingTitle":0.0,"ImagesOnlyInForm":1.0,"SubdomainLevelRT":1.0,"UrlLengthRT":0.0,"PctExtResourceUrlsRT":1.0,"AbnormalExtFormActio

If you need to work with such large data, consider these options:  

Chunking:  
- Process the DataFrame in smaller chunks to avoid memory issues.  
- Example: 

    chunk_size = 100000  # Adjust based on available memory  
    for chunk in pd.read_csv('large_file.csv', chunksize=chunk_size):  
        chunk.T.to_json('output.json', orient='records', lines=True)  # Append to file  

Dask or PySpark:

    import dask.dataframe as dd
    df = dd.read_csv('large_file.csv')
    df.T.to_json('output.json')  # Dask handles memory better 

In [15]:
import json

# The function json.loads() from the json module converts a JSON string into a Python dictionary
json_data = json.loads(data.iloc[0:2,:].T.to_json()).values()
print(type(json_data))
print(list(json_data))
print(json_data)

<class 'dict_values'>
[{'id': 1.0, 'NumDots': 3.0, 'SubdomainLevel': 1.0, 'PathLevel': 5.0, 'UrlLength': 72.0, 'NumDash': 0.0, 'NumDashInHostname': 0.0, 'AtSymbol': 0.0, 'TildeSymbol': 0.0, 'NumUnderscore': 0.0, 'NumPercent': 0.0, 'NumQueryComponents': 0.0, 'NumAmpersand': 0.0, 'NumHash': 0.0, 'NumNumericChars': 0.0, 'NoHttps': 1.0, 'RandomString': 0.0, 'IpAddress': 0.0, 'DomainInSubdomains': 0.0, 'DomainInPaths': 0.0, 'HttpsInHostname': 0.0, 'HostnameLength': 21.0, 'PathLength': 44.0, 'QueryLength': 0.0, 'DoubleSlashInPath': 0.0, 'NumSensitiveWords': 0.0, 'EmbeddedBrandName': 0.0, 'PctExtHyperlinks': 0.0, 'PctExtResourceUrls': 0.25, 'ExtFavicon': 1.0, 'InsecureForms': 1.0, 'RelativeFormAction': 0.0, 'ExtFormAction': 0.0, 'AbnormalFormAction': 0.0, 'PctNullSelfRedirectHyperlinks': 0.0, 'FrequentDomainNameMismatch': 0.0, 'FakeLinkInStatusBar': 0.0, 'RightClickDisabled': 0.0, 'PopUpWindow': 0.0, 'SubmitInfoToEmail': 0.0, 'IframeOrFrame': 0.0, 'MissingTitle': 0.0, 'ImagesOnlyInForm': 1.0,