# Nasa_Defects _-_ 01 _-_ Clean

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
sns.set_style("darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "NASA"

ROOT = "./"
COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"

DEBUG = False
SEED = 666

In [2]:
if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(d): os.makedirs(d)
  if not os.path.isdir(ROOT): os.makedirs(ROOT)

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

### Dataset

In [3]:
df = pd.read_csv(ROOT+"/orig/jm1.csv")
print(df.shape)
df.head(5)

(10878, 22)


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label
0,447.0,826.0,12.0,157.0,470.0,385.0,113.0,2824.0,210.28,384.45,31079782.27,26.95,8441.0,0.0,1726654.57,80843.08,3021.0,5420.0,609.0,155.0,3442.0,1
1,0.0,211.0,0.0,0.0,128.0,104.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1129.0,1
2,164.0,485.0,10.0,58.0,268.0,219.0,39.0,1588.0,202.98,213.53,9254819.86,14.45,4828.0,0.0,514156.64,43342.31,1730.0,3172.0,407.0,102.0,1824.0,1
3,37.0,29.0,8.0,42.0,19.0,19.0,6.0,133.0,108.14,46.32,232043.52,1.67,685.0,0.02,12891.31,5009.32,295.0,390.0,121.0,38.0,222.0,1
4,11.0,405.0,0.0,17.0,404.0,2.0,1.0,814.0,101.2,206.01,4294926.45,6.95,2033.0,0.0,238607.05,20848.47,813.0,1220.0,811.0,411.0,844.0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10878 entries, 0 to 10877
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   LOC_BLANK              10878 non-null  float64
 1   BRANCH_COUNT           10878 non-null  float64
 2   LOC_CODE_AND_COMMENT   10878 non-null  float64
 3   LOC_COMMENTS           10878 non-null  float64
 4   CYCLOMATIC_COMPLEXITY  10878 non-null  float64
 5   DESIGN_COMPLEXITY      10878 non-null  float64
 6   ESSENTIAL_COMPLEXITY   10878 non-null  float64
 7   LOC_EXECUTABLE         10878 non-null  float64
 8   HALSTEAD_CONTENT       10878 non-null  float64
 9   HALSTEAD_DIFFICULTY    10878 non-null  float64
 10  HALSTEAD_EFFORT        10878 non-null  float64
 11  HALSTEAD_ERROR_EST     10878 non-null  float64
 12  HALSTEAD_LENGTH        10878 non-null  float64
 13  HALSTEAD_LEVEL         10878 non-null  float64
 14  HALSTEAD_PROG_TIME     10878 non-null  float64
 15  HA

### Data Quality Analysis by Features

#### Identical Features A

Used transpose to make it change from rows to columns

In [5]:
transposed_df = df.transpose()
identical_columns = transposed_df[transposed_df.duplicated(keep=False)]
identical_features = set(identical_columns.index)

print("Identical features:")
identical_features

Identical features:


set()

#### Constant Features B

used nunique to check for constant features

In [6]:
constant_features = df.columns[df.nunique() == 1]

print("Constant features:")
constant_features

Constant features:


Index([], dtype='object')

#### Features with missing values C

In [7]:
features_with_missing_values = df.columns[df.isnull().any()].tolist()

print("features with missing values:")
features_with_missing_values

features with missing values:


[]

#### Conflicting Features D

Stored all checks in a list.
Created a function that runs through all the checks and adds all conflicting features.

In [8]:
from numpy import log2

checks = [
    (df['LOC_TOTAL'] >= df['LOC_EXECUTABLE']),
    (df['LOC_TOTAL'] >= df['LOC_CODE_AND_COMMENT']),
    (df['NUM_OPERANDS'] >= df['NUM_UNIQUE_OPERANDS']),
    (df['NUM_OPERATORS'] >= df['NUM_UNIQUE_OPERATORS']),
    np.isclose(df['HALSTEAD_LENGTH'], df['NUM_OPERATORS'] + df['NUM_OPERANDS']),
    (df['CYCLOMATIC_COMPLEXITY'] <= (df['NUM_OPERATORS'] + 1)),
    np.isclose(df['HALSTEAD_VOLUME'], (df['NUM_OPERATORS'] + df['NUM_OPERANDS']) * np.log2(df['NUM_UNIQUE_OPERATORS'] + df['NUM_UNIQUE_OPERANDS'])),
    np.isclose(df['HALSTEAD_LEVEL'], (2 / df['NUM_UNIQUE_OPERATORS']) * (df['NUM_UNIQUE_OPERANDS'] / df['NUM_OPERANDS'])),
    np.isclose(df['HALSTEAD_DIFFICULTY'], (df['NUM_UNIQUE_OPERATORS'] / 2) * (df['NUM_OPERANDS'] / df['NUM_UNIQUE_OPERANDS'])),
    np.isclose(df['HALSTEAD_CONTENT'], df['HALSTEAD_VOLUME'] / df['HALSTEAD_DIFFICULTY']),
    np.isclose(df['HALSTEAD_EFFORT'], df['HALSTEAD_VOLUME'] * df['HALSTEAD_DIFFICULTY']),
    np.isclose(df['HALSTEAD_PROG_TIME'], df['HALSTEAD_EFFORT'] / 18)
]

def check_referential_integrity(df, checks):
    conflicting_features = sum(not all(check) for check in checks)
    return conflicting_features

conflicting_count = check_referential_integrity(df, checks)
print(f"Number of features with conflicting values: {conflicting_count}")

Number of features with conflicting values: 8


  result = getattr(ufunc, method)(*inputs, **kwargs)


#### Implausible Values E

In [9]:
# Check for LOC_TOTAL being 0
loc_total_zero = df[df['LOC_TOTAL'] == 0]
loc_total_zero
        
# Check for any attribute having a value less than 0
negative_values = df[df < 0].dropna()
negative_values
        
# Check for any count being a non-integer
non_integer_counts = df[df.applymap(lambda x: not isinstance(x, int))].dropna()
non_integer_counts

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label


#### Total Problem Features F

In [10]:
# Function to check for identical features (Condition A)
def check_identical_features(df):
    return identical_features

# Function to check for constant features (Condition B)
def check_constant_features(df):
    return constant_features

# Function to check for features with missing values (Condition C)
def check_missing_values(df):
    return features_with_missing_values

# Function to check for conflicting values (Condition D)
def check_conflicting_values(df):
    conflicting_features = set()
    for feature, check in zip(df.columns, checks):
        if not all(check):
            conflicting_features.add(feature)

    return conflicting_features

# Check for implausible values (Condition E)
def check_implausible_values(df):
    implausible_features = set()

    # Check for LOC_TOTAL being 0
    loc_total_zero = df[df['LOC_TOTAL'] == 0]
    if not loc_total_zero.empty:
        print("Rows where LOC_TOTAL is 0:")
        print(loc_total_zero)
        implausible_features.add('LOC_TOTAL')

    # Check for any attribute having a value less than 0
    negative_values = df[df < 0].dropna()
    if not negative_values.empty:
        print("Rows with negative values:")
        print(negative_values)
        implausible_features.update(negative_values.columns)

    # Check for any count being a non-integer
    non_integer_counts = df[df.applymap(lambda x: not isinstance(x, int))].dropna()
    if not non_integer_counts.empty:
        print("Rows with non-integer counts:")
        print(non_integer_counts)
        implausible_features.update(non_integer_counts.columns)

    return implausible_features

# Function to calculate total problem features (Condition F)
def total_problem_features(df):
    total_features = set()
    total_features.update(check_identical_features(df))
    total_features.update(check_constant_features(df))
    total_features.update(check_missing_values(df))
    total_features.update(check_conflicting_values(df))
    total_features.update(check_implausible_values(df))
    return len(total_features)

# Calculate and print total problem features
total_problems = total_problem_features(df)
print(f"Total problem features: {total_problems}")

Total problem features: 8


### Data Quality Analysis by Cases

#### Identical Cases G

In [11]:
# Find and display identical cases (rows)
identical_cases = df[df.duplicated(keep=False)]
print("Identical cases:")
identical_cases

Identical cases:


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label
90,1.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,11.23,2.00,44.92,0.01,8.0,0.50,2.50,22.46,3.0,5.0,3.0,4.0,6.0,0
153,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,11.0,1
155,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,7.0,1
217,0.0,7.0,0.0,0.0,4.0,3.0,1.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,28.0,0
335,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,20.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10865,4.0,11.0,2.0,0.0,6.0,4.0,1.0,29.0,22.09,23.75,12459.65,0.17,104.0,0.04,692.20,524.62,35.0,69.0,14.0,19.0,37.0,0
10867,4.0,11.0,2.0,0.0,6.0,4.0,1.0,29.0,22.09,23.75,12459.65,0.17,104.0,0.04,692.20,524.62,35.0,69.0,14.0,19.0,37.0,0
10869,3.0,3.0,0.0,0.0,2.0,1.0,1.0,5.0,22.38,3.67,300.83,0.03,21.0,0.27,16.71,82.04,11.0,10.0,9.0,6.0,10.0,0
10870,1.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,5.33,1.50,12.00,0.00,4.0,0.67,0.67,8.00,1.0,3.0,1.0,3.0,5.0,0


#### Inconsistent Cases H

In [12]:
# Separate features and target column
target = 'label'
features = df.drop(columns=['label'])

In [13]:
def check_inconsistent_cases(df):
    # Find inconsistent cases directly
    inconsistent_cases = df[df.duplicated(subset=features, keep=False)] \
        .groupby(list(features)).filter(lambda x: x[target].nunique() > 1)

    inconsistent_cases = len(inconsistent_cases)

    print(f"Number of inconsistent cases: {inconsistent_cases}")

    return inconsistent_cases

check_inconsistent_cases(df)

Number of inconsistent cases: 889


889

#### Cases with missing values I

In [37]:
cases_with_missing_values = df[df.isnull().any(axis=1)]

print("Cases with Missing Values:")
cases_with_missing_values

Cases with Missing Values:


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label


#### Cases with conflicting feature values J

In [15]:
checks = [
    (df['LOC_TOTAL'] >= df['LOC_EXECUTABLE']),
    (df['LOC_TOTAL'] >= df['LOC_CODE_AND_COMMENT']),
    (df['NUM_OPERANDS'] >= df['NUM_UNIQUE_OPERANDS']),
    (df['NUM_OPERATORS'] >= df['NUM_UNIQUE_OPERATORS']),
    np.isclose(df['HALSTEAD_LENGTH'], df['NUM_OPERATORS'] + df['NUM_OPERANDS']),
    (df['CYCLOMATIC_COMPLEXITY'] <= (df['NUM_OPERATORS'] + 1)),
    np.isclose(df['HALSTEAD_VOLUME'], (df['NUM_OPERATORS'] + df['NUM_OPERANDS']) * np.log2(df['NUM_UNIQUE_OPERATORS'] + df['NUM_UNIQUE_OPERANDS'])),
    np.isclose(df['HALSTEAD_LEVEL'], (2 / df['NUM_UNIQUE_OPERATORS']) * (df['NUM_UNIQUE_OPERANDS'] / df['NUM_OPERANDS'])),
    np.isclose(df['HALSTEAD_DIFFICULTY'], (df['NUM_UNIQUE_OPERATORS'] / 2) * (df['NUM_OPERANDS'] / df['NUM_UNIQUE_OPERANDS'])),
    np.isclose(df['HALSTEAD_CONTENT'], df['HALSTEAD_VOLUME'] / df['HALSTEAD_DIFFICULTY']),
    np.isclose(df['HALSTEAD_EFFORT'], df['HALSTEAD_VOLUME'] * df['HALSTEAD_DIFFICULTY']),
    np.isclose(df['HALSTEAD_PROG_TIME'], df['HALSTEAD_EFFORT'] / 18)
]

# Created a DataFrame of boolean values indicating conflicts for each case
conflict_df = pd.DataFrame({f'Check_{i+1}': ~check for i, check in enumerate(checks)})

# Count cases with at least one conflicting feature value
conflicting_cases = conflict_df.any(axis=1).sum()

print("Number of cases with conflicting feature values:", conflicting_cases)

Number of cases with conflicting feature values: 10767


  result = getattr(ufunc, method)(*inputs, **kwargs)


#### Implausible Cases K

In [16]:
# Check for LOC_TOTAL being 0
loc_total_zero = df[df['LOC_TOTAL'] == 0]
print("Cases where LOC_TOTAL is 0:")
loc_total_zero

# Check for any attribute having a value less than 0
negative_values = df[df < 0].dropna()
print("\nCases with negative values:")
negative_values

# Check for any count being a non-integer
non_integer_counts = df[df.applymap(lambda x: not isinstance(x, int))].dropna()
print("\nCases with non-integer counts:")
non_integer_counts

Cases where LOC_TOTAL is 0:

Cases with negative values:

Cases with non-integer counts:


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label


#### Total Problem cases impacted by one or more of I to K | L

Cases with missing values returns 22 when I already checked and it was 0.

In [17]:
# Function to check for cases with missing values
def check_case_missing_values(df):
    return cases_with_missing_values

def check_case_implausible_values(df):
    implausible_cases = set()
    implausible_cases.add('LOC_TOTAL')
    implausible_cases.update(negative_values.columns)
    implausible_cases.update(non_integer_counts.columns)

    return implausible_cases

def total_problem_cases(df):
    total_cases = set()
    total_cases.update(check_case_missing_values(df))
    total_cases.update(check_case_implausible_values(df))
    return len(total_cases)

total_problems = total_problem_cases(df)
print(f"Total problem cases: {total_problems}")

Total problem cases: 22


#### Total problem cases impacted by one or more of G to K | M

In [39]:
def check_identical_cases(df):
    return identical_cases

def check_inconsistent_cases(df):
    # Find inconsistent cases directly
    inconsistent_cases = df[df.duplicated(subset=features, keep=False)] \
        .groupby(list(features)).filter(lambda x: x[target].nunique() > 1)

    # Return the indices or cases that are inconsistent
    return set(inconsistent_cases.index)

def check_missing_values(df):
    return cases_with_missing_values

def check_implausible_values(df):
    implausible_cases = set()

    # Check for LOC_TOTAL being 0
    loc_total_zero = df[df['LOC_TOTAL'] == 0]
    if not loc_total_zero.empty:
        print("Rows where LOC_TOTAL is 0:")
        print(loc_total_zero)
        implausible_features.add('LOC_TOTAL')

    # Check for any attribute having a value less than 0
    negative_values = df[df < 0].dropna()
    if not negative_values.empty:
        print("Rows with negative values:")
        print(negative_values)
        implausible_features.update(negative_values.columns)

    # Check for any count being a non-integer
    non_integer_counts = df[df.applymap(lambda x: not isinstance(x, int))].dropna()
    if not non_integer_counts.empty:
        print("Rows with non-integer counts:")
        print(non_integer_counts)
        implausible_features.update(non_integer_counts.columns)

    return implausible_cases

# Function to calculate total problem features (Condition F)
def total_problem_cases(df):
    total_cases = set()
    total_cases.update(check_identical_cases(df))
    total_cases.update(check_inconsistent_cases(df))
    total_cases.update(check_missing_values(df))
    total_cases.update(check_conflicting_values(df))
    total_cases.update(check_implausible_values(df))
    return len(total_cases)

# Calculate and print total problem features
total_problems = total_problem_cases(df)
print(f"Total problem cases: {total_problems}")

Total problem cases: 911


In [19]:
# Rule G: Identical cases
identical_cases = df[df.duplicated(keep=False)]
identical_cases_count = len(identical_cases)

# Rule H: Inconsistent cases
inconsistent_cases = df[df.duplicated(subset=df.columns[:-1], keep=False) & df.duplicated(subset=df.columns, keep=False)]
inconsistent_cases_count = len(inconsistent_cases)

# Rule I: Cases with missing values
cases_with_missing_values = df[df.isnull().any(axis=1)]
cases_with_missing_values_count = len(cases_with_missing_values)

# Rule K: Cases with implausible values (based on Column E)
implausible_cases = df[df.apply(lambda x: (x < 0).any(), axis=1)]
implausible_cases_count = len(implausible_cases)

# Rule M: Total problem cases according to G to K
total_problem_cases = set(identical_cases.index) | set(inconsistent_cases.index) | set(cases_with_missing_values.index) | set(implausible_cases.index)
total_problem_cases_count = len(total_problem_cases)

print(f"Identical cases (G): {identical_cases_count}")
print(f"Inconsistent cases (H): {inconsistent_cases_count}")
print(f"Cases with missing values (I): {cases_with_missing_values_count}")
print(f"Cases with implausible values (K): {implausible_cases_count}")
print(f"Total problem cases (M): {total_problem_cases_count}")

Identical cases (G): 2628
Inconsistent cases (H): 2628
Cases with missing values (I): 0
Cases with implausible values (K): 0
Total problem cases (M): 2628


### Cleaning

* Combining all cases and features to remove from the dataset.

In [20]:
# Combine all cases to remove
cases_to_remove = (
    loc_total_zero.index.union(negative_values.index).union(non_integer_counts.index)
    .union(identical_cases.index)
    .union(inconsistent_cases.index)
    .union(cases_with_missing_values.index)
)

# Combine all features to remove
features_to_remove = constant_features.union(identical_features)

# Remove cases
df_cleaned = df.drop(cases_to_remove)

# Remove features
df_cleaned = df_cleaned.drop(columns=features_to_remove)

# Display the cleaned DataFrame
print("DataFrame after removing cases and features:")
df_cleaned

DataFrame after removing cases and features:


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label
0,447.0,826.0,12.0,157.0,470.0,385.0,113.0,2824.0,210.28,384.45,31079782.27,26.95,8441.0,0.00,1726654.57,80843.08,3021.0,5420.0,609.0,155.0,3442.0,1
1,0.0,211.0,0.0,0.0,128.0,104.0,14.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,1129.0,1
2,164.0,485.0,10.0,58.0,268.0,219.0,39.0,1588.0,202.98,213.53,9254819.86,14.45,4828.0,0.00,514156.64,43342.31,1730.0,3172.0,407.0,102.0,1824.0,1
3,37.0,29.0,8.0,42.0,19.0,19.0,6.0,133.0,108.14,46.32,232043.52,1.67,685.0,0.02,12891.31,5009.32,295.0,390.0,121.0,38.0,222.0,1
4,11.0,405.0,0.0,17.0,404.0,2.0,1.0,814.0,101.20,206.01,4294926.45,6.95,2033.0,0.00,238607.05,20848.47,813.0,1220.0,811.0,411.0,844.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10873,2.0,7.0,0.0,0.0,4.0,4.0,1.0,13.0,32.93,7.33,1770.86,0.08,52.0,0.14,98.38,241.48,22.0,30.0,15.0,10.0,18.0,0
10874,2.0,3.0,0.0,0.0,2.0,2.0,1.0,5.0,15.72,8.25,1069.68,0.04,30.0,0.12,59.43,129.66,11.0,19.0,8.0,12.0,9.0,0
10875,10.0,7.0,0.0,1.0,4.0,2.0,1.0,29.0,19.68,26.40,13716.72,0.17,103.0,0.04,762.04,519.57,44.0,59.0,15.0,18.0,42.0,0
10876,2.0,1.0,0.0,0.0,1.0,1.0,1.0,6.0,17.44,8.44,1241.57,0.05,36.0,0.12,68.98,147.15,15.0,21.0,8.0,9.0,10.0,0


In [21]:
df_cleaned.to_pickle(ROOT+"data/cleaned.pkl")