<h3>Step 1: Importing necessary libraries</h3>

In [None]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
conn = sqlite3.connect('iotaDB.db')
import numpy as np

<h3>Step 1: Read the CSV</h3>

In [None]:
# Read the IOTA transaction export; header=0 takes the column names from the first CSV row
csv_path = 'iota_tx_2024/iota_tx/IOTA_1year_tx_data2.csv'
df = pd.read_csv(csv_path, header=0)

# Show the column names as a quick sanity check of the load
print(df.columns)

In [None]:
df.info()

 <h3>Step 2: Cleaning the Dataset</h3>



<h4>2.1 Identifying missing values, deleting the rows, saving in a new directory


<h5> 2.1.1 Identifying "not found" values in the dataset

In [None]:
def notfound_values(df):
    """Count the rows of ``df`` that contain the literal string 'Not found'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan; every column is checked.

    Returns
    -------
    numpy.integer
        Number of rows in which at least one cell equals 'Not found'.
    """
    # DataFrame.eq compares column by column, so numeric columns simply yield
    # False and no object-dtype copy of the whole frame has to be built.
    # (With ``df.values == 'Not found'`` an all-numeric frame could collapse
    # to a scalar on older NumPy versions, making ``np.any(..., axis=1)`` fail.)
    contains_not_found = df.eq('Not found').any(axis=1)

    # Summing the boolean row flags counts the affected rows
    not_found_count = contains_not_found.sum()

    print(f"Total rows with 'Not found': {not_found_count}")
    return not_found_count


notfound_count = notfound_values(df)

<h5> 2.1.2 Identifying "missing" values in the dataset

In [None]:
def missing_values(df):
    """Print and return the number of missing cells in each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Count of NaN/None cells, indexed by column name.
    """
    per_column_missing = df.isna().sum()
    # One print call with a newline separator emits the same two lines
    print("Missing values in each column:", per_column_missing, sep="\n")
    return per_column_missing

missing_values_count = missing_values(df)

<h5> 2.1.4 Cleaning the dataset

In [None]:

# Remember the size of the frame before any row is removed
initial_row_count = len(df)

# A row is kept only if none of its cells equals the literal string 'Not found'
mask = df.ne('Not found').all(axis=1)
cleaned_df = df.loc[mask]

# Compare the sizes to see how many rows the filter removed
final_row_count = len(cleaned_df)
rows_deleted = initial_row_count - final_row_count
print(f"Total rows deleted: {rows_deleted}")




In [None]:
# Remove every row (axis=0) in which at least one cell is missing (how='any')
df = df.dropna(axis=0, how='any')

In [None]:
# Continue with the rows that contain neither 'Not found' nor missing values.
# cleaned_df was filtered from df *before* the dropna() cell ran, so assigning
# it unchanged would silently bring the rows with missing values back.
# dropna() also returns a new DataFrame, so the column assignments in the
# following cells work on an independent frame rather than on a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
df = cleaned_df.dropna()

<h4> 2.2 Adjusting the datatypes 

In [None]:

  # Convert timestamp to datetime
timestamp_raw = df['timestamp']
timestamp_parsed = pd.to_datetime(timestamp_raw, format='%Y-%m-%d %H:%M:%S', errors='coerce')

# errors='coerce' turns every value that does not match the format into NaT
# without raising, so report how many values were lost instead of hiding it.
coerced_count = int(timestamp_parsed.isna().sum() - timestamp_raw.isna().sum())
if coerced_count:
    print(f"Warning: {coerced_count} timestamp value(s) could not be parsed and were set to NaT")

df['timestamp'] = timestamp_parsed

In [None]:
# Display the first 5 rows of the DataFrame (head() defaults to n=5)
df.head()

In [None]:
# Make sure each column holds the expected Python type (checked on the first row)
for column in ('timestamp', 'input_amounts_x', 'input_addresses_x',
               'output_addresses_y', 'output_amounts_y'):
    print(type(df[column].iloc[0]))

# Show the complete second row as the cell output
df.iloc[1]

<h5> 2.1.3.1 Handling the datatype of input_addresses_x and output_addresses_y: converting the string representation of the address lists into Python lists</h5>

In [None]:
def parse_string_to_list(s):
    """Convert the string representation of a list into a Python list.

    Parameters
    ----------
    s : str, list or None
        A Python literal such as "['addr1', 'addr2']". A value that is
        already a list is returned unchanged. None and the placeholder
        string "Not found" yield an empty list.

    Returns
    -------
    list
        The value produced by ``ast.literal_eval`` for the string (normally
        a list), the input itself if it already is a list, or ``[]`` when the
        input is missing or cannot be parsed.
    """
    try:
        # Already parsed (e.g. the cell was re-run): keep the data instead of
        # replacing it with an empty list.
        if isinstance(s, list):
            return s

        # Check the placeholders before parsing: literal_eval rejects
        # "Not found" with a SyntaxError, so a check placed after the parse
        # can never be reached.
        if s is None or s == "Not found":
            print(f"Warning: Skipping due to None or Not found input at row : {s}")
            return []

        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError) as e:
        # Best effort: report the offending value and fall back to an empty list
        print(f"Error parsing or converting {s}: {e}")
        return []

In [None]:
df['input_addresses_x'] = df['input_addresses_x'].apply(parse_string_to_list)


In [None]:
df['output_addresses_y'] = df['output_addresses_y'].apply(parse_string_to_list)

In [None]:
# For both address columns: type of the third row's value, content of the second row
for address_column in ('input_addresses_x', 'output_addresses_y'):
    print(type(df[address_column].iloc[2]))
    print(df[address_column].iloc[1])

<h5> 2.1.3.1 Handling the datatype of input_amounts_x and output_amounts_y</h5>

In [None]:
def parse_float_list(s):
    """Turn a list, or the string representation of a list, into a list of floats.

    Parameters
    ----------
    s : str, list or None
        Either a list of numeric values or a Python literal such as
        "[1, 2.5]".

    Returns
    -------
    list
        Every element converted with ``float``. An empty list is returned
        for None, for the placeholder "Not found", for unsupported input
        types and whenever parsing or conversion fails.
    """
    try:
        # Missing value or placeholder: nothing to convert
        if s is None or s == "Not found":
            print(f"Warning: Skipping due to None or Not found input at row : {s}")
            return []

        if isinstance(s, str):
            # Strings are expected to hold a literal list
            raw_items = ast.literal_eval(s)
        elif isinstance(s, list):
            # Lists are converted element by element as they are
            raw_items = s
        else:
            # Anything else is reported and skipped
            print(f"Unexpected data type : {type(s)} with value {s}")
            return []

        return list(map(float, raw_items))

    except (ValueError, SyntaxError, TypeError) as e:
        print(f"Error parsing or converting  {s}. Error: {e}")
        return []


In [None]:
# Convert both amount columns to lists of floats (outputs first, then inputs)
for amount_column in ('output_amounts_y', 'input_amounts_x'):
    df[amount_column] = df[amount_column].apply(parse_float_list)

In [None]:
# Make sure the first row holds the right types: the amounts container,
# its first element, and the matching addresses container
for amounts_column, addresses_column in (
    ('input_amounts_x', 'input_addresses_x'),
    ('output_amounts_y', 'output_addresses_y'),
):
    first_amounts = df[amounts_column].iloc[0]
    print(type(first_amounts))
    print(type(first_amounts[0]))
    print(type(df[addresses_column].iloc[0]))


In [None]:
# Make sure the second row holds the right types: print type and value
# of every list column
for column in ('input_addresses_x', 'input_amounts_x',
               'output_addresses_y', 'output_amounts_y'):
    second_row_value = df[column].iloc[1]
    print(type(second_row_value))
    print(second_row_value)

# ...and of the first element inside that row's output amounts
first_output_amount = df['output_amounts_y'].iloc[1][0]
print(type(first_output_amount))
print(first_output_amount)




<h3>Step 4: Connecting to the database and inserting the cleaned and adjusted Dataframe</h3>

<h4> 4.1 Serializing the list columns to JSON strings before storing them (Python lists cannot be stored in the database directly)</h4>

In [None]:
import json

# Any column that contains at least one Python list is serialized to JSON
# strings as a whole
for col in df.columns:
    column_has_lists = any(isinstance(value, list) for value in df[col])
    if column_has_lists:
        df[col] = df[col].apply(json.dumps)

# Show the resulting column dtypes
print("DataFrame Structure:")
print(df.dtypes)

<h4> 4.2 Creating the database and the table for the dataset</h4>

In [None]:
# Write the cleaned DataFrame into the 'Transactions' table of iotaDB.db.
#
# No manual DROP/CREATE is issued: with if_exists='replace' pandas drops an
# existing table and recreates it from df's own columns before inserting.
# An unconditional "DROP TABLE Transactions" raises sqlite3.OperationalError
# on a fresh database, and a hand-written CREATE TABLE would be discarded by
# the replace anyway.
conn = sqlite3.connect('iotaDB.db')
try:
    # method=None: use the default single INSERT statement per row
    df.to_sql('Transactions', conn, if_exists='replace', index=False, method=None)

    # Commit changes
    conn.commit()
finally:
    # Always release the connection, even if the insert fails
    conn.close()





Working with JSON strings in the Data

Verification of List in the Database

In [None]:
import json
import sqlite3

# Fetch a small sample of the serialized list columns
conn = sqlite3.connect('iotaDB.db')
try:
    c = conn.cursor()
    c.execute("""
    SELECT input_addresses_x, output_addresses_y, input_amounts_x, output_amounts_y
    FROM Transactions
    LIMIT 10
    """)
    rows = c.fetchall()
finally:
    # Close the database connection, also when the query fails
    conn.close()

# Result key -> label used in the printout, in the order of the selected columns
column_labels = {
    'input_addresses': 'Input Addresses',
    'output_addresses': 'Output Addresses',
    'input_amounts': 'Input Amounts',
    'output_amounts': 'Output Amounts',
}

# Deserialize the JSON strings back into Python lists for all four columns;
# empty or NULL cells become None
addresses_and_amounts = [
    {key: json.loads(cell) if cell else None for key, cell in zip(column_labels, row)}
    for row in rows
]

# Output the deserialized data along with types
for idx, item in enumerate(addresses_and_amounts, start=1):
    for key, label in column_labels.items():
        print(f"Record {idx} - {label}: {item[key]} (type: {type(item[key])})")


In [None]:
import json
import sqlite3

# Connect to the SQLite database and fetch a sample of the serialized amounts
conn = sqlite3.connect('iotaDB.db')
try:
    c = conn.cursor()
    c.execute("""
    SELECT input_amounts_x
    FROM Transactions
    LIMIT 10
    """)
    rows = c.fetchall()
finally:
    # Close the database connection, also when the query fails
    conn.close()

# Deserialize the JSON strings back into Python lists; empty or NULL cells become None
input_amounts = [json.loads(row[0]) if row[0] else None for row in rows]


# Check and print the type of each element in the lists

In [181]:
# Report the value and type of every element in each record's amount list
for idx, amounts in enumerate(input_amounts, start=1):
    # Records without amounts (None or an empty list) only get a notice
    if not amounts:
        print(f"Record {idx} - No input amounts available.")
        continue

    print(f"Record {idx} - Input Amounts Types:")
    for position, amount in enumerate(amounts, start=1):
        print(f"  Element {position}: {amount} (type: {type(amount)})")


Record 1 - Input Amounts Types:
  Element 1: 500000000.0 (type: <class 'float'>)
Record 2 - Input Amounts Types:
  Element 1: 60000000.0 (type: <class 'float'>)
  Element 2: 76000000.0 (type: <class 'float'>)
Record 3 - Input Amounts Types:
  Element 1: 200000000.0 (type: <class 'float'>)
Record 4 - Input Amounts Types:
  Element 1: 25000000.0 (type: <class 'float'>)
Record 5 - Input Amounts Types:
  Element 1: 494500000.0 (type: <class 'float'>)
Record 6 - Input Amounts Types:
  Element 1: 29891752444.0 (type: <class 'float'>)
Record 7 - Input Amounts Types:
  Element 1: 22994863356.0 (type: <class 'float'>)
Record 8 - Input Amounts Types:
  Element 1: 1000000.0 (type: <class 'float'>)
Record 9 - Input Amounts Types:
  Element 1: 4501456962.0 (type: <class 'float'>)
Record 10 - Input Amounts Types:
  Element 1: 0.0 (type: <class 'float'>)
