## Required Data and Libraries

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import fetch_covtype

def load_and_inspect_covertype():
    """
    Fetches the Covertype dataset from scikit-learn and provides a brief inspection.
    """
    print("--- Fetching the Covertype Dataset ---")

    try:
        # fetch_covtype() downloads the dataset and caches it.
        # It returns a Bunch object, which is a dictionary-like object.
        covtype_data = fetch_covtype()

        # Print the detailed description of the dataset.
        # This includes information about the features, target, and data source.
        print("\n--- Dataset Description ---")
        print(covtype_data.DESCR)

        # The feature data is stored in the 'data' attribute.
        X = pd.DataFrame(covtype_data.data, columns=covtype_data.feature_names)

        # The target variable (Cover_Type) is in the 'target' attribute.
        y = pd.Series(covtype_data.target, name="Cover_Type")

        # Display the first 5 rows of the feature data.
        print("\n--- Features (First 5 rows) ---")
        print(X.head())

        # Display the first 5 rows of the target data.
        print("\n--- Target (First 5 rows) ---")
        print(y.head())

        # Display the shape of the features and target data.
        print("\n--- Dataset Shape ---")
        print(f"Features shape: {X.shape}")
        print(f"Target shape: {y.shape}")

        # Return the data and target for further use, e.g., model training
        return X, y

    except Exception as e:
        print(f"An error occurred while fetching the dataset: {e}")
        return None, None

if __name__ == "__main__":
    # Call the function to load and inspect the dataset
    features, target = load_and_inspect_covertype()
    
    if features is not None and target is not None:
        print("\nDataset successfully loaded and ready for use!")

--- Fetching the Covertype Dataset ---

--- Dataset Description ---
.. _covtype_dataset:

Forest covertypes
-----------------

The samples in this dataset correspond to 30×30m patches of forest in the US,
collected for the task of predicting each patch's cover type,
i.e. the dominant species of tree.
There are seven covertypes, making this a multiclass classification problem.
Each sample has 54 features, described on the
`dataset's homepage <https://archive.ics.uci.edu/ml/datasets/Covertype>`__.
Some of the features are boolean indicators,
while others are discrete or continuous measurements.

**Data Set Characteristics:**

Classes                        7
Samples total             581012
Dimensionality                54
Features                     int

:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
it returns a dictionary-like 'Bunch' object
with the feature matrix in the ``data`` member
and the target values in ``target``. If optional argument 'as_frame' is


In [24]:
# Load the dataset and create a single DataFrame
print("Loading Covertype dataset...")

# Call the function to get features and target
features, target = load_and_inspect_covertype()

if features is not None and target is not None:
    # Combine features and target into a single DataFrame
    df = pd.concat([features, target], axis=1)
    
    print("\n=== SINGLE DATAFRAME CREATED ===")
    print(f"DataFrame shape: {df.shape}")
    print(f"Total columns: {len(df.columns)}")
    print(f"Total rows: {len(df)}")
    
    print("\n--- Column Names ---")
    print(df.columns.tolist())
    
    print("\n--- First 5 rows of combined DataFrame ---")
    print(df.head())
    
    print("\n--- Data Types ---")
    print(df.dtypes)
    
    print("\n--- Basic Statistics ---")
    print(df.describe())
    
    print("\n--- Target Distribution ---")
    print(df['Cover_Type'].value_counts().sort_index())
    
    print("\n--- Missing Values Check ---")
    print(df.isnull().sum().sum(), "total missing values")
    
    print("\nSingle DataFrame 'df' is ready for use!")
    
else:
    print("Failed to load the dataset. Please check the error above.")

Loading Covertype dataset...
--- Fetching the Covertype Dataset ---



--- Dataset Description ---
.. _covtype_dataset:

Forest covertypes
-----------------

The samples in this dataset correspond to 30×30m patches of forest in the US,
collected for the task of predicting each patch's cover type,
i.e. the dominant species of tree.
There are seven covertypes, making this a multiclass classification problem.
Each sample has 54 features, described on the
`dataset's homepage <https://archive.ics.uci.edu/ml/datasets/Covertype>`__.
Some of the features are boolean indicators,
while others are discrete or continuous measurements.

**Data Set Characteristics:**

Classes                        7
Samples total             581012
Dimensionality                54
Features                     int

:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
it returns a dictionary-like 'Bunch' object
with the feature matrix in the ``data`` member
and the target values in ``target``. If optional argument 'as_frame' is
set to 'True', it will return ``data`` 

## Shift data to Neo4j Db



In [None]:
from neo4j import GraphDatabase

def shift_data_to_neo4j(uri, user, password, dataframe):
    """
    Shifts ALL data from a pandas DataFrame to a Neo4j database.
    Creates nodes with ALL features from the Covertype dataset.

    Parameters:
    uri (str): URI of the Neo4j database.
    user (str): Username for the Neo4j database.
    password (str): Password for the Neo4j database.
    dataframe (pd.DataFrame): DataFrame containing the data to be inserted.
    """
    try:
        print("Connecting to Neo4j database...")
        driver = GraphDatabase.driver(uri, auth=(user, password))
        
        with driver.session() as session:
            # Clear existing data
            print("Clearing existing data...")
            session.run("MATCH (n) DETACH DELETE n")
            
            total_rows = len(dataframe)
            batch_size = 1000  # Process in batches for better performance
            
            for i in range(0, total_rows, batch_size):
                batch_end = min(i + batch_size, total_rows)
                batch = dataframe.iloc[i:batch_end]
                
                # Create batch transaction
                with session.begin_transaction() as tx:
                    for index, row in batch.iterrows():
                        # Create a comprehensive node with ALL features
                        query = """
                        CREATE (f:ForestCover {
                            record_id: $record_id,
                            Elevation: $Elevation,
                            Aspect: $Aspect,
                            Slope: $Slope,
                            Horizontal_Distance_To_Hydrology: $Horizontal_Distance_To_Hydrology,
                            Vertical_Distance_To_Hydrology: $Vertical_Distance_To_Hydrology,
                            Horizontal_Distance_To_Roadways: $Horizontal_Distance_To_Roadways,
                            Hillshade_9am: $Hillshade_9am,
                            Hillshade_Noon: $Hillshade_Noon,
                            Hillshade_3pm: $Hillshade_3pm,
                            Horizontal_Distance_To_Fire_Points: $Horizontal_Distance_To_Fire_Points
                        """
                        
                        # Add wilderness area features (0-indexed)
                        for j in range(0, 4):
                            query += f", Wilderness_Area_{j}: $Wilderness_Area_{j}"
                        
                        # Add soil type features (0-indexed)  
                        for j in range(0, 40):
                            query += f", Soil_Type_{j}: $Soil_Type_{j}"
                        
                        query += ", Cover_Type: $Cover_Type })"
                        
                        # Prepare parameters dictionary
                        params = {"record_id": int(index)}
                        for col in dataframe.columns:
                            params[col] = int(row[col]) if col in dataframe.select_dtypes(include=['int64']).columns else float(row[col])
                        
                        tx.run(query, params)
                    
                    tx.commit()
                
                print(f"Processed {batch_end}/{total_rows} records ({(batch_end/total_rows)*100:.1f}%)")
        
        # Verify the data transfer
        with driver.session() as session:
            result = session.run("MATCH (f:ForestCover) RETURN count(f) as total_nodes")
            total_nodes = result.single()["total_nodes"]
            print(f"\n✅ SUCCESS: {total_nodes} nodes created in Neo4j!")
            
            # Show sample data
            sample = session.run("MATCH (f:ForestCover) RETURN f LIMIT 3")
            print("\n📊 Sample nodes:")
            for record in sample:
                node = record["f"]
                print(f"  - Elevation: {node['Elevation']}, Cover_Type: {node['Cover_Type']}")
        
    except Exception as e:
        print(f"❌ ERROR: {e}")
        print("Make sure Neo4j is running and credentials are correct!")
    finally:
        driver.close()
        print("Neo4j connection closed.")

# Neo4j Database Configuration
uri = "neo4j://127.0.0.1:7687"  # Your Neo4j URI
user = "neo4j"  # Your Neo4j username  
password = "rm#123456"  # Your Neo4j password

print("=== SHIFTING DATA TO NEO4J DATABASE ===")
print(f"URI: {uri}")
print(f"User: {user}")
print(f"DataFrame shape: {df.shape}")
print(f"Total records to transfer: {len(df)}")

# Shift the entire dataset to Neo4j
shift_data_to_neo4j(uri, user, password, df)

=== SHIFTING DATA TO NEO4J DATABASE ===
URI: neo4j://127.0.0.1:7687
User: neo4j
DataFrame shape: (581012, 55)
Total records to transfer: 581012
Connecting to Neo4j database...
Clearing existing data...
Processed 1000/581012 records (0.2%)
Processed 1000/581012 records (0.2%)
Processed 2000/581012 records (0.3%)
Processed 2000/581012 records (0.3%)
Processed 3000/581012 records (0.5%)
Processed 3000/581012 records (0.5%)
Processed 4000/581012 records (0.7%)
Processed 4000/581012 records (0.7%)


In [None]:
# Python functions to query Neo4j data
from neo4j import GraphDatabase
import pandas as pd

def query_neo4j_data(uri, user, password, query):
    """
    Execute a Cypher query and return results as a DataFrame
    """
    driver = GraphDatabase.driver(uri, auth=(user, password))
    
    try:
        with driver.session() as session:
            result = session.run(query)
            # Convert to list of dictionaries
            records = [record.data() for record in result]
            # Convert to DataFrame
            return pd.DataFrame(records)
    finally:
        driver.close()

# Example queries you can run
def get_sample_data():
    """Get first 10 records"""
    query = "MATCH (f:ForestCover) RETURN f.Elevation, f.Aspect, f.Cover_Type LIMIT 10"
    return query_neo4j_data(uri, user, password, query)

def get_cover_type_stats():
    """Get statistics by cover type"""
    query = """
    MATCH (f:ForestCover) 
    RETURN f.Cover_Type as cover_type, 
           count(*) as count,
           avg(f.Elevation) as avg_elevation,
           avg(f.Slope) as avg_slope
    ORDER BY f.Cover_Type
    """
    return query_neo4j_data(uri, user, password, query)

def find_high_elevation_forests():
    """Find forests above 3000m elevation"""
    query = """
    MATCH (f:ForestCover) 
    WHERE f.Elevation > 3000 
    RETURN f.Elevation, f.Cover_Type, f.Aspect, f.Slope 
    ORDER BY f.Elevation DESC 
    LIMIT 20
    """
    return query_neo4j_data(uri, user, password, query)

def get_wilderness_distribution():
    """Get distribution across wilderness areas"""
    query = """
    MATCH (f:ForestCover) 
    RETURN 
        sum(f.Wilderness_Area_0) as wilderness_0_count,
        sum(f.Wilderness_Area_1) as wilderness_1_count,
        sum(f.Wilderness_Area_2) as wilderness_2_count,
        sum(f.Wilderness_Area_3) as wilderness_3_count
    """
    return query_neo4j_data(uri, user, password, query)

# Test the connection and run sample queries
print("=== ACCESSING NEO4J DATA ===")
print("Neo4j connection details:")
print(f"URI: {uri}")
print(f"User: {user}")
print()

try:
    # Test 1: Get sample data
    print("1. Sample data (first 10 records):")
    sample_df = get_sample_data()
    print(sample_df)
    print()
    
    # Test 2: Cover type statistics
    print("2. Statistics by Cover Type:")
    stats_df = get_cover_type_stats()
    print(stats_df)
    print()
    
    # Test 3: High elevation forests
    print("3. High elevation forests (>3000m):")
    high_elev_df = find_high_elevation_forests()
    print(high_elev_df.head())
    print()
    
    # Test 4: Wilderness area distribution
    print("4. Wilderness area distribution:")
    wilderness_df = get_wilderness_distribution()
    print(wilderness_df)
    
except Exception as e:
    print(f"Error accessing Neo4j: {e}")
    print("Make sure Neo4j is running and data has been loaded!")

In [None]:
# Check the actual column names to fix Neo4j parameter issue
print("DataFrame columns:")
print(df.columns.tolist())
print(f"\nTotal columns: {len(df.columns)}")

# Check specifically for wilderness and soil columns
wilderness_cols = [col for col in df.columns if 'Wilderness' in col]
soil_cols = [col for col in df.columns if 'Soil' in col]

print(f"\nWilderness columns: {wilderness_cols}")
print(f"Soil columns: {soil_cols}")
print(f"Number of soil columns: {len(soil_cols)}")

# Check data types
print(f"\nData types:")
print(df.dtypes)

DataFrame columns:
['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area_0', 'Wilderness_Area_1', 'Wilderness_Area_2', 'Wilderness_Area_3', 'Soil_Type_0', 'Soil_Type_1', 'Soil_Type_2', 'Soil_Type_3', 'Soil_Type_4', 'Soil_Type_5', 'Soil_Type_6', 'Soil_Type_7', 'Soil_Type_8', 'Soil_Type_9', 'Soil_Type_10', 'Soil_Type_11', 'Soil_Type_12', 'Soil_Type_13', 'Soil_Type_14', 'Soil_Type_15', 'Soil_Type_16', 'Soil_Type_17', 'Soil_Type_18', 'Soil_Type_19', 'Soil_Type_20', 'Soil_Type_21', 'Soil_Type_22', 'Soil_Type_23', 'Soil_Type_24', 'Soil_Type_25', 'Soil_Type_26', 'Soil_Type_27', 'Soil_Type_28', 'Soil_Type_29', 'Soil_Type_30', 'Soil_Type_31', 'Soil_Type_32', 'Soil_Type_33', 'Soil_Type_34', 'Soil_Type_35', 'Soil_Type_36', 'Soil_Type_37', 'Soil_Type_38', 'Soil_Type_39', 'Cover_Type']

Total columns: 55

Wilde

In [None]:
# Load all data from Neo4j into DataFrame named df2
print("=== LOADING DATA FROM NEO4J INTO df2 ===")

# Define comprehensive query to fetch ALL data from Neo4j
neo4j_query = """
MATCH (f:ForestCover)
RETURN 
    f.Elevation AS Elevation,
    f.Aspect AS Aspect,
    f.Slope AS Slope,
    f.Horizontal_Distance_To_Hydrology AS Horizontal_Distance_To_Hydrology,
    f.Vertical_Distance_To_Hydrology AS Vertical_Distance_To_Hydrology,
    f.Horizontal_Distance_To_Roadways AS Horizontal_Distance_To_Roadways,
    f.Hillshade_9am AS Hillshade_9am,
    f.Hillshade_Noon AS Hillshade_Noon,
    f.Hillshade_3pm AS Hillshade_3pm,
    f.Horizontal_Distance_To_Fire_Points AS Horizontal_Distance_To_Fire_Points,
    f.Wilderness_Area_0 AS Wilderness_Area_0,
    f.Wilderness_Area_1 AS Wilderness_Area_1,
    f.Wilderness_Area_2 AS Wilderness_Area_2,
    f.Wilderness_Area_3 AS Wilderness_Area_3,
    f.Soil_Type_0 AS Soil_Type_0,
    f.Soil_Type_1 AS Soil_Type_1,
    f.Soil_Type_2 AS Soil_Type_2,
    f.Soil_Type_3 AS Soil_Type_3,
    f.Soil_Type_4 AS Soil_Type_4,
    f.Soil_Type_5 AS Soil_Type_5,
    f.Soil_Type_6 AS Soil_Type_6,
    f.Soil_Type_7 AS Soil_Type_7,
    f.Soil_Type_8 AS Soil_Type_8,
    f.Soil_Type_9 AS Soil_Type_9,
    f.Soil_Type_10 AS Soil_Type_10,
    f.Soil_Type_11 AS Soil_Type_11,
    f.Soil_Type_12 AS Soil_Type_12,
    f.Soil_Type_13 AS Soil_Type_13,
    f.Soil_Type_14 AS Soil_Type_14,
    f.Soil_Type_15 AS Soil_Type_15,
    f.Soil_Type_16 AS Soil_Type_16,
    f.Soil_Type_17 AS Soil_Type_17,
    f.Soil_Type_18 AS Soil_Type_18,
    f.Soil_Type_19 AS Soil_Type_19,
    f.Soil_Type_20 AS Soil_Type_20,
    f.Soil_Type_21 AS Soil_Type_21,
    f.Soil_Type_22 AS Soil_Type_22,
    f.Soil_Type_23 AS Soil_Type_23,
    f.Soil_Type_24 AS Soil_Type_24,
    f.Soil_Type_25 AS Soil_Type_25,
    f.Soil_Type_26 AS Soil_Type_26,
    f.Soil_Type_27 AS Soil_Type_27,
    f.Soil_Type_28 AS Soil_Type_28,
    f.Soil_Type_29 AS Soil_Type_29,
    f.Soil_Type_30 AS Soil_Type_30,
    f.Soil_Type_31 AS Soil_Type_31,
    f.Soil_Type_32 AS Soil_Type_32,
    f.Soil_Type_33 AS Soil_Type_33,
    f.Soil_Type_34 AS Soil_Type_34,
    f.Soil_Type_35 AS Soil_Type_35,
    f.Soil_Type_36 AS Soil_Type_36,
    f.Soil_Type_37 AS Soil_Type_37,
    f.Soil_Type_38 AS Soil_Type_38,
    f.Soil_Type_39 AS Soil_Type_39,
    f.Cover_Type AS Cover_Type
ORDER BY f.record_id
"""

try:
    print("Connecting to Neo4j and fetching data...")
    print(f"URI: {uri}")
    print(f"User: {user}")
    
    # Load data from Neo4j into df2
    df2 = query_neo4j_data(uri, user, password, neo4j_query)
    
    print(f"\n✅ SUCCESS: Data loaded into df2!")
    print(f"DataFrame shape: {df2.shape}")
    print(f"Total rows: {len(df2)}")
    print(f"Total columns: {len(df2.columns)}")
    
    print("\n--- df2 Column Names ---")
    print(df2.columns.tolist())
    
    print("\n--- First 5 rows of df2 ---")
    print(df2.head())
    
    print("\n--- Data Types in df2 ---")
    print(df2.dtypes)
    
    print("\n--- Basic Statistics for df2 ---")
    print(df2.describe())
    
    print("\n--- df2 Cover Type Distribution ---")
    if 'Cover_Type' in df2.columns:
        print(df2['Cover_Type'].value_counts().sort_index())
    
    print("\n--- Missing Values in df2 ---")
    missing_values = df2.isnull().sum().sum()
    print(f"Total missing values: {missing_values}")
    
    print(f"\n🎉 DataFrame 'df2' is ready for use!")
    print(f"You now have your forest cover data loaded from Neo4j in df2")
    
except Exception as e:
    print(f"❌ ERROR loading data from Neo4j: {e}")
    print("Possible issues:")
    print("1. Neo4j is not running")
    print("2. No data in Neo4j database")
    print("3. Connection credentials are incorrect")
    print("4. Run cell 6 first to load data into Neo4j")

=== LOADING DATA FROM NEO4J INTO df2 ===
Connecting to Neo4j and fetching data...
URI: neo4j://127.0.0.1:7687
User: neo4j
❌ ERROR loading data from Neo4j: name 'query_neo4j_data' is not defined
Possible issues:
1. Neo4j is not running
2. No data in Neo4j database
3. Connection credentials are incorrect
4. Run cell 6 first to load data into Neo4j
